Important Per-cpu Data Structures
struct pcpu_alloc_info | |
size_t static_size | size of the area holding statically defined per-cpu variables |
size_t reserved_size | size of the reserved area; allocations that request the reserved region come from here |
size_t dyn_size | size of the area used for dynamic allocation |
size_t unit_size | per-CPU stride of the percpu virtual memory: the sum of the three sizes above, page aligned |
size_t atom_size | allocation/alignment granularity, normally PAGE_SIZE |
size_t alloc_size | size actually allocated for one percpu unit, normally equal to unit_size |
size_t __ai_size | size of this management structure itself, including the pcpu_alloc_info, the nr_groups group entries and the nr_units cpu_map slots |
int nr_groups | number of processor groups |
struct pcpu_group_info groups[] | per-group information |
struct pcpu_group_info | |
int nr_units | number of units in the group, normally the number of CPUs in the group |
unsigned long base_offset | offset of the group's start address from the global percpu base address |
unsigned int *cpu_map | CPU ids of the CPUs in the group |
struct pcpu_chunk | |
struct list_head list | links the chunk into a pcpu_slot list |
int contig_hint | hint of the largest contiguous free area inside the chunk |
void *base_addr | base address of the chunk |
int map_used | number of map entries in use |
int map_alloc | number of map entries allocated |
int *map | area map array |
void *data | chunk data, e.g. the page pointer backing the chunk |
int first_free | cached index of the first free map entry, used to speed up scanning |
bool immutable | true means the chunk's pages may not be populated or depopulated (its backing is fixed, e.g. the first chunk) |
int nr_populated | number of pages already backed by physical memory |
unsigned long populated[] | bitmap marking which pages are populated: 1 means the page has a physical page behind it, 0 means it does not |
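For reference, a simplified sketch of these three structures, assembled from the tables above (the exact member order and a few additional members differ between kernel versions):

struct pcpu_group_info {
        int                     nr_units;       /* units in this group */
        unsigned long           base_offset;    /* offset of the group from the global percpu base */
        unsigned int            *cpu_map;       /* cpu id of each unit, NR_CPUS for an unused unit */
};

struct pcpu_alloc_info {
        size_t                  static_size;    /* statically defined per-cpu data */
        size_t                  reserved_size;  /* reserved area */
        size_t                  dyn_size;       /* dynamic allocation area */
        size_t                  unit_size;      /* per-cpu stride: sum of the above, page aligned */
        size_t                  atom_size;      /* allocation/alignment granularity */
        size_t                  alloc_size;     /* actually allocated size, usually == unit_size */
        size_t                  __ai_size;      /* size of this structure itself */
        int                     nr_groups;
        struct pcpu_group_info  groups[];
};

struct pcpu_chunk {
        struct list_head        list;           /* linked on pcpu_slot[] */
        int                     contig_hint;    /* hint of the largest contiguous free area */
        void                    *base_addr;     /* base address of the chunk */
        int                     map_used;       /* map entries in use */
        int                     map_alloc;      /* map entries allocated */
        int                     *map;           /* area map: offsets, low bit set = allocated */
        void                    *data;          /* e.g. the page pointer backing the chunk */
        int                     first_free;     /* cached index of the first free area */
        bool                    immutable;      /* no page [de]population allowed */
        int                     nr_populated;   /* pages backed by physical memory */
        unsigned long           populated[];    /* bitmap: 1 = page is populated */
};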
Per-cpu Setup
2.1 Per-cpu setup flow
start_kernel----> setup_per_cpu_areas----> pcpu_embed_first_chunk
pcpu_build_alloc_info: creates the per-cpu management structure pcpu_alloc_info. Its three parameters reserved_size, dyn_size and atom_size are, respectively, the size of the reserved area, the size of the area used for dynamic allocation, and the alignment granularity.
pcpu_setup_first_chunk: creates the first chunk. It takes the pcpu_alloc_info built above and the base address of the memory allocated for the first group; it initializes the chunk from the members of pcpu_alloc_info and sets chunk->base_addr to the base that was passed in. (Both steps are driven by pcpu_embed_first_chunk; a simplified sketch of it follows below.)
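The glue between these two steps is pcpu_embed_first_chunk(). Its code is not reproduced in this article, so here is a heavily simplified sketch of what it does in this kernel generation (error handling, the unused-unit case, the areas[] bookkeeping allocation and kmemleak details are all omitted; treat it as pseudocode rather than the real implementation):

int __init pcpu_embed_first_chunk(size_t reserved_size, size_t dyn_size,
                                  size_t atom_size,
                                  pcpu_fc_cpu_distance_fn_t cpu_distance_fn,
                                  pcpu_fc_alloc_fn_t alloc_fn,
                                  pcpu_fc_free_fn_t free_fn)
{
        void *base = (void *)ULONG_MAX;
        struct pcpu_alloc_info *ai;
        size_t size_sum;
        int group, i;

        /* step 1: build the management structure (section 2.2) */
        ai = pcpu_build_alloc_info(reserved_size, dyn_size, atom_size, cpu_distance_fn);
        size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;

        /* step 2: allocate one block of nr_units * unit_size per group,
         * remember each block and keep the lowest address as the global base */
        void *areas[ai->nr_groups];
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                areas[group] = alloc_fn(gi->cpu_map[0],
                                        gi->nr_units * ai->unit_size, atom_size);
                base = min(areas[group], base);
        }

        /* step 3: copy the static .data..percpu section into every unit,
         * give back the unused tail of each unit, and record each group's
         * offset from the global base */
        for (group = 0; group < ai->nr_groups; group++) {
                struct pcpu_group_info *gi = &ai->groups[group];
                void *ptr = areas[group];

                for (i = 0; i < gi->nr_units; i++, ptr += ai->unit_size) {
                        memcpy(ptr, __per_cpu_load, ai->static_size);
                        free_fn(ptr + size_sum, ai->unit_size - size_sum);
                }
                gi->base_offset = areas[group] - base;
        }

        /* step 4: build the first chunk on top of this memory (section 2.3) */
        return pcpu_setup_first_chunk(ai, base);
}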
2.2 Creating the pcpu_alloc_info
The structure pcpu_alloc_info holds the per-cpu management information, and pcpu_build_alloc_info() creates it and initializes its members. Its parameters are, in order, the reserved area size, the size of the area used for dynamic allocation, the alignment size and, last, a cpu_distance_fn which is NULL here.
static struct pcpu_alloc_info * __init pcpu_build_alloc_info(
size_t reserved_size, size_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
static int group_map[NR_CPUS] __initdata;//records which group each cpu belongs to
static int group_cnt[NR_CPUS] __initdata; //number of cpus in each group; there are at most NR_CPUS groups
//statically defined per-cpu variables live in the .data..percpu section; static_size is the size of that section
const size_t static_size = __per_cpu_end - __per_cpu_start;
int nr_groups = 1, nr_units = 0; //there is at least one group
size_t size_sum, min_unit_size, alloc_size;
int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
int last_allocs, group, unit;
unsigned int cpu, tcpu;
struct pcpu_alloc_info *ai;
unsigned int *cpu_map;
memset(group_map, 0, sizeof(group_map));
memset(group_cnt, 0, sizeof(group_cnt));
/*compute the size of one minimal per-cpu unit: the space for the statically defined data plus the reserved area plus the dynamic allocation area; PERCPU_DYNAMIC_EARLY_SIZE is (12 << 10), i.e. the dynamic area is at least 12K*/
size_sum = PFN_ALIGN(static_size + reserved_size +
max_t(size_t, dyn_size, PERCPU_DYNAMIC_EARLY_SIZE));
dyn_size = size_sum - static_size - reserved_size;
/* #define PCPU_MIN_UNIT_SIZE PFN_ALIGN(32 << 10): a per-cpu unit is at least 32K */
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
alloc_size = roundup(min_unit_size, atom_size); //round the minimal unit up to atom_size
//for simplicity, assume min_unit_size is already atom_size (page) aligned, so upa works out to 1
upa = alloc_size / min_unit_size;
while (alloc_size % upa || (offset_in_page(alloc_size / upa)))
upa--;
max_upa = upa;
/*map each cpu to its group; as noted above cpu_distance_fn is NULL here, so there is only one group (a group is simply a set of CPUs)*/
for_each_possible_cpu(cpu) {
group = 0;
next_group:
for_each_possible_cpu(tcpu) {
if (cpu == tcpu)
break;
if (group_map[tcpu] == group && cpu_distance_fn &&
(cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
group++;
nr_groups = max(nr_groups, group + 1);
goto next_group;
}
}
group_map[cpu] = group; //every cpu ends up in group 0
group_cnt[group]++;//with a single group, group 0's cpu count equals the total number of cpus
}
………… the code that evaluates each candidate upa and picks best_upa (the one that wastes the least memory across the groups) is omitted here; with our simplifying assumptions it does not change anything …………
upa = best_upa;
// with nr_groups == 1, nr_units is simply the number of cpus
for (group = 0; group < nr_groups; group++)
nr_units += roundup(group_cnt[group], upa);
//allocate space for the pcpu_alloc_info structure, its nr_groups pcpu_group_info entries and the nr_units cpu_map slots
ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
if (!ai)
return ERR_PTR(-ENOMEM);
cpu_map = ai->groups[0].cpu_map;
// nr_groups is 1
for (group = 0; group < nr_groups; group++) {
ai->groups[group].cpu_map = cpu_map;
cpu_map += roundup(group_cnt[group], upa);
}
/* unit_size = static_size + reserved_size + dyn_size. Boot log example:
PERCPU: Embedded 10 pages/cpu @dfb72000 s8320 r8192 d24448 u40960*/
ai->static_size = static_size;
ai->reserved_size = reserved_size;
ai->dyn_size = dyn_size;
ai->unit_size = alloc_size / upa;
ai->atom_size = atom_size;
ai->alloc_size = alloc_size;
//initialize gi->cpu_map, mapping each cpu to its group; with a single group, gi->nr_units ends up equal to the total number of cpus
for (group = 0, unit = 0; group_cnt[group]; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
gi->base_offset = unit * ai->unit_size;
for_each_possible_cpu(cpu)
if (group_map[cpu] == group)
gi->cpu_map[gi->nr_units++] = cpu;
gi->nr_units = roundup(gi->nr_units, upa);
unit += gi->nr_units;
}
BUG_ON(unit != nr_units);
return ai;
}
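As a worked example, the boot log quoted in the comment above was produced with static_size = 8320 and reserved_size = 8192, and apparently dyn_size = 20480, i.e. PERCPU_DYNAMIC_RESERVE was still 20 << 10 on that kernel; newer kernels pass 28 << 10, which would give u49152 instead. The arithmetic then goes:

size_sum  = PFN_ALIGN(8320 + 8192 + max(20480, 12288)) = PFN_ALIGN(36992) = 40960
dyn_size  = 40960 - 8320 - 8192 = 24448
unit_size = alloc_size = 40960 = 10 pages

which matches the "s8320 r8192 d24448 u40960" line exactly.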
2.3 Creating pcpu_first_chunk
The function below creates the first pcpu_chunk; pcpu_chunk is the central structure behind dynamic per-cpu allocation. Its two parameters are the pcpu_alloc_info allocated above and the base address of the per-cpu memory allocated in pcpu_embed_first_chunk (ptr = alloc_fn(cpu, gi->nr_units * ai->unit_size, atom_size);).
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
//free and allocated per-cpu memory is managed as segments through the map array; #define PERCPU_DYNAMIC_EARLY_SLOTS 128
static int smap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
static int dmap[PERCPU_DYNAMIC_EARLY_SLOTS] __initdata;
size_t dyn_size = ai->dyn_size; //size of the area used for dynamic allocation
size_t size_sum = ai->static_size + ai->reserved_size + dyn_size; //size of one minimal per-cpu unit
struct pcpu_chunk *schunk, *dchunk = NULL;
unsigned long *group_offsets;
size_t *group_sizes;
unsigned long *unit_off;
unsigned int cpu;
int *unit_map;
int group, unit, i;
//records each group's start offset relative to base
group_offsets = memblock_virt_alloc(ai->nr_groups *
sizeof(group_offsets[0]), 0);
// records the amount of memory each group occupies: gi->nr_units * ai->unit_size
group_sizes = memblock_virt_alloc(ai->nr_groups *
sizeof(group_sizes[0]), 0);
//maps each cpu to a unit index
unit_map = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_map[0]), 0);
/*each cpu's offset from base, computed as gi->base_offset + i * ai->unit_size; gi->base_offset was fixed in pcpu_embed_first_chunk when memory was allocated for each group*/
unit_off = memblock_virt_alloc(nr_cpu_ids * sizeof(unit_off[0]), 0);
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = UINT_MAX;
pcpu_low_unit_cpu = NR_CPUS;
pcpu_high_unit_cpu = NR_CPUS;
//two nested loops: the outer one records each group's offset, the inner one records each cpu's offset
for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
const struct pcpu_group_info *gi = &ai->groups[group];
group_offsets[group] = gi->base_offset;
group_sizes[group] = gi->nr_units * ai->unit_size;
for (i = 0; i < gi->nr_units; i++) {
cpu = gi->cpu_map[i];
if (cpu == NR_CPUS)
continue;
unit_map[cpu] = unit + i;
unit_off[cpu] = gi->base_offset + i * ai->unit_size;
}
}
pcpu_nr_units = unit;
//dump the contents of pcpu_alloc_info: pcpu-alloc: s8320 r8192 d24448 u40960 alloc=10*4096
pcpu_dump_alloc_info(KERN_DEBUG, ai);
//set up a few global variables
pcpu_nr_groups = ai->nr_groups; //total number of groups
pcpu_group_offsets = group_offsets; //each group's offset from base
pcpu_group_sizes = group_sizes; //the amount of memory each group occupies
pcpu_unit_map = unit_map; // each cpu's unit index
pcpu_unit_offsets = unit_off; //each cpu's offset from base
pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT; //one unit_size is the space taken by one cpu
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_atom_size = ai->atom_size;
//size of a pcpu_chunk including its trailing populated[] bitmap: one bit per page of a unit, 1 means the page is backed by physical memory
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
/*pcpu_slot is an array of list heads; each chunk is linked onto the list head matching its free_size*/
pcpu_slot = memblock_virt_alloc(
pcpu_nr_slots * sizeof(pcpu_slot[0]), 0);
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
//allocate schunk, which manages the per-cpu area holding the statically defined variables
schunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&schunk->list);
INIT_LIST_HEAD(&schunk->map_extend_list);
schunk->base_addr = base_addr; //set the base address
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap); //total number of map entries
schunk->immutable = true;
//mark all pcpu_unit_pages pages as populated in the schunk->populated bitmap
bitmap_fill(schunk->populated, pcpu_unit_pages);
schunk->nr_populated = pcpu_unit_pages;
//set schunk->free_size
if (ai->reserved_size) {
//if ai->reserved_size is not 0, the reserved area becomes schunk's free space and schunk becomes the reserved chunk
schunk->free_size = ai->reserved_size;
pcpu_reserved_chunk = schunk;
pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
} else {//if ai->reserved_size is 0, the dynamic area becomes schunk's free space and dyn_size is set to 0
schunk->free_size = dyn_size;
dyn_size = 0; /* dynamic area covered */
}
schunk->contig_hint = schunk->free_size;
/*a chunk is managed as segments: map[i] with its low bit masked off is the offset of segment i from the start of the chunk, and the low bit records whether that segment is allocated (1) or free (0); this is covered in detail in the allocation path below*/
schunk->map[0] = 1; //segment [0, static_size) holds the static data and is marked allocated
schunk->map[1] = ai->static_size; //the segment starting at static_size (the free space chosen above) has the low bit clear, i.e. it is free
schunk->map_used = 1;
if (schunk->free_size)
schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
schunk->map[schunk->map_used] |= 1; //the last entry is the end boundary and carries the low bit
if (dyn_size) {
//dyn_size is still non-zero only when reserved_size was non-zero; in that case create dchunk to manage the dynamic allocation area
dchunk = memblock_virt_alloc(pcpu_chunk_struct_size, 0);
INIT_LIST_HEAD(&dchunk->list);
INIT_LIST_HEAD(&dchunk->map_extend_list);
dchunk->base_addr = base_addr;//set the base address
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
dchunk->immutable = true;
bitmap_fill(dchunk->populated, pcpu_unit_pages);
dchunk->nr_populated = pcpu_unit_pages;
dchunk->contig_hint = dchunk->free_size = dyn_size;
dchunk->map[0] = 1; //segment [0, pcpu_reserved_chunk_limit): the static and reserved areas, marked allocated
dchunk->map[1] = pcpu_reserved_chunk_limit; //the dynamic area starts here and is free (low bit clear)
dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1; //end boundary
dchunk->map_used = 2;
}
pcpu_first_chunk = dchunk ?: schunk; //pick the first chunk
pcpu_nr_empty_pop_pages +=
pcpu_count_occupied_pages(pcpu_first_chunk, 1);
// link the first chunk into the pcpu_slot[] list matching its free_size
pcpu_chunk_relocate(pcpu_first_chunk, -1);
//set the global per-cpu base address
pcpu_base_addr = base_addr;
return 0;
}
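Continuing with the sizes from the log above (static 8320, reserved 8192, dynamic 24448), the code above should leave the two chunks in the following state:

schunk (becomes pcpu_reserved_chunk):
map[0] = 1                              /* [0, 8320): static data, allocated */
map[1] = 8320                           /* [8320, 16512): reserved area, free */
map[2] = (8320 + 8192) | 1 = 16513      /* end boundary */
map_used = 2, free_size = contig_hint = 8192

dchunk (becomes pcpu_first_chunk):
map[0] = 1                              /* [0, 16512): static + reserved, allocated */
map[1] = 16512                          /* [16512, 40960): dynamic area, free */
map[2] = (16512 + 24448) | 1 = 40961    /* end boundary */
map_used = 2, free_size = contig_hint = 24448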
The chunk->map arrays above point at statically defined arrays, because the slab/slub allocator is not up yet at this point and small allocations are impossible. Once slab is up, percpu_init_late() reallocates chunk->map.
start_kernel----> mm_init---->percpu_init_late
void __init percpu_init_late(void)
{
struct pcpu_chunk *target_chunks[] =
{ pcpu_first_chunk, pcpu_reserved_chunk, NULL };
struct pcpu_chunk *chunk;
unsigned long flags;
int i;
for (i = 0; (chunk = target_chunks[i]); i++) {
int *map;
const size_t size = PERCPU_DYNAMIC_EARLY_SLOTS * sizeof(map[0]);
map = pcpu_mem_zalloc(size); //reallocate the map, via kmalloc now that slab is up
spin_lock_irqsave(&pcpu_lock, flags);
memcpy(map, chunk->map, size); //copy the old map contents into the new array
chunk->map = map;
spin_unlock_irqrestore(&pcpu_lock, flags);
}
}
2.4 Setting up __per_cpu_offset
void __init setup_per_cpu_areas(void)
{
unsigned long delta;
unsigned int cpu;
int rc;
/* #define PERCPU_MODULE_RESERVE (8 << 10)
#define PERCPU_DYNAMIC_RESERVE (28 << 10)
what the function below does was covered in the previous sections */
rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
PERCPU_DYNAMIC_RESERVE, PAGE_SIZE, NULL,
pcpu_dfl_fc_alloc, pcpu_dfl_fc_free);
if (rc < 0)
panic("Failed to initialize percpu areas.");
/*pcpu_base_addr holds the per-cpu memory base address, i.e. the start of the first group. __per_cpu_start is the start address of the .data..percpu section, and pcpu_unit_offsets[] holds each cpu's offset from pcpu_base_addr. Per-cpu variables, however, are accessed relative to __per_cpu_start, so the offset from __per_cpu_start to each cpu's area is computed here and stored in __per_cpu_offset[cpu].*/
delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
for_each_possible_cpu(cpu)
__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
pcpu_alloc
3.1 Per-cpu variable allocation flow
1) Compute a slot from the highest set bit of size and look for a suitable chunk in pcpu_slot[slot] and the higher slots (see the sketch of pcpu_size_to_slot after this list).
2) Find a suitable map entry in the chosen chunk and try to carve the requested memory out of it.
3) The allocated range may land in the middle of a free area, leaving a head and a tail fragment; create map entries for those fragments.
4) Move the chunk to the pcpu_slot[slot] list matching its new chunk->free_size.
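For step 1, the slot calculation in this kernel generation looks roughly as follows (paraphrased from mm/percpu.c; treat it as a sketch; a chunk whose whole unit is free always sits in the last slot):

#define PCPU_SLOT_BASE_SHIFT 5          /* sizes of 1-31 bytes share the lowest slot */

static int __pcpu_size_to_slot(int size)
{
        int highbit = fls(size);        /* position of the highest set bit of size */
        return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}

static int pcpu_size_to_slot(int size)
{
        if (size == pcpu_unit_size)
                return pcpu_nr_slots - 1;
        return __pcpu_size_to_slot(size);
}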
(figure: relationship between the per-cpu structures)
(figure: an attempt to allocate size bytes from chunk->map[5] (i == 5), assuming an atomic allocation)
(figure: new map entries created for the head and tail fragments after the allocation)
(figure: state after the allocation completes)
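In place of the original figures, here is the same situation with made-up numbers (every offset below is hypothetical): assume area 5 is a free area of 0xFC0 bytes starting at offset 0x1840, the previous area is allocated, and we allocate size = 0x200 with align = 0x100.

Before the allocation:
map[4] = 0x1000 | 1     /* [0x1000, 0x1840): allocated */
map[5] = 0x1840         /* [0x1840, 0x2800): free (0xFC0 bytes) */
map[6] = 0x2800 | 1     /* next area / end boundary */

pcpu_fit_in_area() picks the aligned offset 0x1900, so head = 0xC0 and tail = 0x2800 - 0x1900 - 0x200 = 0xD00. Both are large enough to keep, so two entries are inserted after map[5] (the memmove in pcpu_alloc_area shifts the rest of the map right) and map_used grows by 2.

After the allocation:
map[4] = 0x1000 | 1     /* unchanged */
map[5] = 0x1840         /* head fragment, still free */
map[6] = 0x1900 | 1     /* the new allocation, marked allocated */
map[7] = 0x1B00         /* tail fragment, free */
map[8] = 0x2800 | 1     /* what used to be map[6] */

(For an atomic allocation the only extra constraint is that the chosen range must already be backed by physical pages.)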
3.2 Selecting a usable chunk
pcpu_alloc() is the entry point for dynamic per-cpu allocation: it allocates size bytes aligned to align, and if reserved is true it tries to allocate from pcpu_reserved_chunk.
static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
gfp_t gfp)
{
static int warn_limit = 10;
struct pcpu_chunk *chunk;
const char *err;
bool is_atomic = (gfp & GFP_KERNEL) != GFP_KERNEL;
int occ_pages = 0;
int slot, off, new_alloc, cpu, ret;
unsigned long flags;
void __percpu *ptr;
……
size = ALIGN(size, 2);
//if reserved is true and pcpu_reserved_chunk exists, try to allocate from pcpu_reserved_chunk
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
if (size > chunk->contig_hint) { //if size exceeds the largest contiguous free area in the chunk, fail
err = "alloc from reserved chunk failed";
goto fail_unlock;
}
/* chunk->map_alloc is the number of map entries available and chunk->map_used the number already in use; if chunk->map[] is about to run out, pcpu_extend_area_map() is called to grow it */
while ((new_alloc = pcpu_need_to_extend(chunk, is_atomic))) {
spin_unlock_irqrestore(&pcpu_lock, flags);
if (is_atomic ||
pcpu_extend_area_map(chunk, new_alloc) < 0) {
err = "failed to extend area map of reserved chunk";
goto fail;
}
spin_lock_irqsave(&pcpu_lock, flags);
}
//try to allocate from the designated chunk
off = pcpu_alloc_area(chunk, size, align, is_atomic,
&occ_pages);
if (off >= 0)
goto area_found;
goto fail_unlock;
}
restart:
/* compute the slot from the highest set bit of size, then search pcpu_slot[slot] and the higher slots for a usable chunk and try to allocate per-cpu memory from it */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
/*pcpu_slot[slot] is a list head; roughly speaking it holds the chunks whose free_size is between 2^slot and 2^(slot+1) (not exactly, but that is the idea)*/
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
if (size > chunk->contig_hint) //skip the chunk if its largest contiguous free area cannot hold size
continue;
//check whether the map needs to be extended; the initial map has 128 entries
new_alloc = pcpu_need_to_extend(chunk, is_atomic);
if (new_alloc) {
if (is_atomic)
continue;
spin_unlock_irqrestore(&pcpu_lock, flags);
if (pcpu_extend_area_map(chunk,
new_alloc) < 0) { //extend the map
err = "failed to extend area map";
goto fail;
}
spin_lock_irqsave(&pcpu_lock, flags);
goto restart;
}
//try to allocate from the selected chunk
off = pcpu_alloc_area(chunk, size, align, is_atomic,
&occ_pages);
if (off >= 0)
goto area_found;
}
}
spin_unlock_irqrestore(&pcpu_lock, flags);
if (is_atomic) //for an atomic allocation give up here: creating a chunk below may sleep
goto fail;
if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
/*create a new chunk; there are two implementations: 1) allocate all the required pages up front with alloc_pages, or 2) reserve a contiguous range of virtual addresses in the vmalloc area and allocate the physical pages only when they are actually used */
chunk = pcpu_create_chunk();
if (!chunk) {
err = "failed to allocate new chunk";
goto fail;
}
spin_lock_irqsave(&pcpu_lock, flags);
pcpu_chunk_relocate(chunk, -1); //add the newly created chunk to pcpu_slot[]
} else {
spin_lock_irqsave(&pcpu_lock, flags);
}
goto restart;
area_found:
spin_unlock_irqrestore(&pcpu_lock, flags);
if (!is_atomic) {
//for a non-atomic allocation, populate any pages in [page_start, page_end) that are not yet backed by physical memory
int page_start, page_end, rs, re;
page_start = PFN_DOWN(off);
page_end = PFN_UP(off + size);
pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
ret = pcpu_populate_chunk(chunk, rs, re); //allocate physical pages for the range [rs, re)
spin_lock_irqsave(&pcpu_lock, flags);
if (ret) {
pcpu_free_area(chunk, off, &occ_pages);
err = "failed to populate";
goto fail_unlock;
}
pcpu_chunk_populated(chunk, rs, re);//set the corresponding bits in chunk->populated
spin_unlock_irqrestore(&pcpu_lock, flags);
}
mutex_unlock(&pcpu_alloc_mutex);
}
……
for_each_possible_cpu(cpu) //zero the freshly allocated size bytes in every cpu's copy
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
/*return the per-cpu pointer for the allocated space; the address arithmetic is covered below*/
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
return ptr;
fail_unlock:
spin_unlock_irqrestore(&pcpu_lock, flags);
fail:
.......
return NULL;
}
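To connect this with the public API: alloc_percpu() and __alloc_percpu() are thin wrappers that end up in pcpu_alloc(), and free_percpu() releases the area again. A minimal usage sketch (the struct and function names are made up for illustration):

struct my_stats {
        u64 packets;
        u64 bytes;
};

static struct my_stats __percpu *stats;

static int __init my_stats_init(void)
{
        /* ends up in pcpu_alloc(sizeof(*stats), __alignof__(*stats), false, GFP_KERNEL); memory comes back zeroed */
        stats = alloc_percpu(struct my_stats);
        if (!stats)
                return -ENOMEM;
        return 0;
}

/* fast path: update this cpu's copy without computing its address by hand */
static void my_stats_account(unsigned int len)
{
        this_cpu_inc(stats->packets);
        this_cpu_add(stats->bytes, len);
}

/* slow path: walk every cpu's copy with per_cpu_ptr() (see section 3.5) */
static u64 my_stats_total_bytes(void)
{
        u64 sum = 0;
        int cpu;

        for_each_possible_cpu(cpu)
                sum += per_cpu_ptr(stats, cpu)->bytes;
        return sum;
}

static void my_stats_exit(void)
{
        free_percpu(stats);
}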
3.3 Allocating from the selected chunk
pcpu_alloc_area() allocates size bytes from the given chunk. When pop_only (the caller passes is_atomic here) is true, the chosen range must already be backed by physical pages; otherwise any sufficiently large range will do and the missing pages are populated afterwards.
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align,
bool pop_only, int *occ_pages_p)
{
int oslot = pcpu_chunk_slot(chunk);
int max_contig = 0;
int i, off;
bool seen_free = false;
int *p;
//scan the chunk's map for a suitable entry
for (i = chunk->first_free, p = chunk->map + i; i < chunk->map_used; i++, p++) {
int head, tail;
int this_size;
off = *p;
if (off & 1) //low bit set: this area is already allocated, skip it
continue;
this_size = (p[1] & ~1) - off; //area i runs from map[i] to map[i+1], low bits masked off
//pick a sub-range inside this area; when pop_only is true the chosen range must already be backed by physical pages; the range may start in the middle of the area
head = pcpu_fit_in_area(chunk, off, this_size, size, align,
pop_only);
if (head < 0) {
if (!seen_free) {
chunk->first_free = i; //cache the first free map entry to speed up the next scan
seen_free = true;
}
max_contig = max(this_size, max_contig); //update the largest contiguous free area seen so far
continue;
}
//if the head is too small, or the previous area is free, merge the head into the previous area
if (head && (head < sizeof(int) || !(p[-1] & 1))) {
*p = off += head;
if (p[-1] & 1)
chunk->free_size -= head;
else
max_contig = max(*p - p[-1], max_contig); //update the largest contiguous free area
this_size -= head;
head = 0;
}
tail = this_size - head - size; //space left over at the end of this area
if (tail < sizeof(int)) { //if too little is left, hand it out as part of this allocation
tail = 0;
size = this_size - head;
}
if (head || tail) {
int nr_extra = !!head + !!tail; //reserve a map entry for the head and/or the tail
//shift the entries after p right by nr_extra positions to make room for the head/tail entries
memmove(p + nr_extra + 1, p + 1,
sizeof(chunk->map[0]) * (chunk->map_used - i));
chunk->map_used += nr_extra; //update the number of map entries in use
if (head) {
if (!seen_free) {
chunk->first_free = i;
seen_free = true;
}
*++p = off += head; //record the head entry; the allocation itself now starts at off
++i;
max_contig = max(head, max_contig);
}
if (tail) {
p[1] = off + size; //record the tail entry
max_contig = max(tail, max_contig);
}
}
if (!seen_free)
chunk->first_free = i + 1;
if (i + 1 == chunk->map_used)
chunk->contig_hint = max_contig; /* fully scanned */
else
chunk->contig_hint = max(chunk->contig_hint,
max_contig); //update the largest-contiguous-area hint
chunk->free_size -= size; //update the remaining free space
*p |= 1; //mark this area as allocated
*occ_pages_p = pcpu_count_occupied_pages(chunk, i);
pcpu_chunk_relocate(chunk, oslot);
return off;
}
chunk->contig_hint = max_contig; /* fully scanned */
pcpu_chunk_relocate(chunk, oslot); //re-insert the chunk into the matching pcpu_slot[] list
return -1;
}
3.4 Creating a new chunk
static struct pcpu_chunk *pcpu_create_chunk(void)
{
const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
struct pcpu_chunk *chunk;
struct page *pages;
int i;
//allocate the pcpu_chunk structure and chunk->map
chunk = pcpu_alloc_chunk();
if (!chunk)
return NULL;
//allocate the required physical pages
pages = alloc_pages(GFP_KERNEL, order_base_2(nr_pages));
if (!pages) {
pcpu_free_chunk(chunk);
return NULL;
}
//make each page's page->index point back to the chunk
for (i = 0; i < nr_pages; i++)
pcpu_set_page_chunk(nth_page(pages, i), chunk);
chunk->data = pages; //remember the allocated pages
//set chunk->base_addr to the real address minus the first group's offset; see the address calculation below
chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
spin_lock_irq(&pcpu_lock);
pcpu_chunk_populated(chunk, 0, nr_pages); //set the corresponding bits in chunk->populated[]
spin_unlock_irq(&pcpu_lock);
return chunk;
}
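Note that the pcpu_create_chunk() shown above is the percpu-km variant (mm/percpu-km.c, selected by CONFIG_NEED_PER_CPU_KM, typically on nommu or small systems): it grabs all of the chunk's pages up front with alloc_pages(), so there is nothing left for pcpu_populate_chunk() to do. The more common implementation in mm/percpu-vm.c corresponds to the second method mentioned in section 3.2: it only reserves a chunk-sized region of vmalloc address space here and lets pcpu_populate_chunk() allocate and map physical pages on demand.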
3.5 Per-cpu address calculation
The per-cpu address calculation involves the following global variables:
pcpu_base_addr: pcpu_embed_first_chunk allocates memory for every group; the lowest of the returned addresses becomes the global base.
pcpu_group_offsets[group]: offset of each group's memory from pcpu_base_addr
pcpu_unit_offsets[cpu]: offset of each cpu's memory from pcpu_base_addr
__per_cpu_start: start address of the .data..percpu section that holds statically defined per-cpu variables; by the time the per-cpu machinery is up, the contents of this section have been copied into the first chunk (one copy per unit).
__per_cpu_offset[cpu]: the total offset from __per_cpu_start to pcpu_base_addr plus pcpu_unit_offsets[cpu]
The chunk base address is set as shown below; since there is only one group, pcpu_group_offsets[0] is 0, which means chunk->base_addr holds the actual virtual address:
chunk->base_addr = page_address(pages) - pcpu_group_offsets[0];
When per-cpu memory is allocated, the returned pointer is:
ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
#define __addr_to_pcpu_ptr(addr) \
(void __percpu *)((unsigned long)(addr) - \
(unsigned long)pcpu_base_addr + \
(unsigned long)__per_cpu_start)
That is, ptr = real base address - (pcpu_base_addr - __per_cpu_start).
Example of looking up a per-cpu variable's address:
static void __meminit zone_pageset_init(struct zone *zone, int cpu)
{
struct per_cpu_pageset *pcp = per_cpu_ptr(zone->pageset, cpu);
……
}
per_cpu_ptr is a macro, implemented as follows:
include/asm-generic/percpu.h
#define per_cpu_ptr(ptr, cpu) \
({ \
__verify_pcpu_ptr(ptr); \
SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))); \
})
#define per_cpu_offset(x) (__per_cpu_offset[x])
From the discussion above we know:
__per_cpu_offset[cpu] = pcpu_base_addr - __per_cpu_start + pcpu_unit_offsets[cpu];
The pointer stored in zone->pageset is not the real virtual address of the per-cpu data; dynamic allocation massaged it as shown above. Call it alloc_ptr:
alloc_ptr = real base address - (pcpu_base_addr - __per_cpu_start)
Adding per_cpu_offset(cpu) recovers the real virtual address:
per_cpu_ptr(zone->pageset, cpu) = alloc_ptr + __per_cpu_offset[cpu]
= real virtual address - (pcpu_base_addr - __per_cpu_start)
+ (pcpu_base_addr - __per_cpu_start) + pcpu_unit_offsets[cpu]
= real virtual address + pcpu_unit_offsets[cpu]
which is exactly the address of the copy belonging to that cpu.
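As a concrete walk-through, take pcpu_base_addr = 0xdfb72000 and unit_size = 40960 (0xa000) from the boot log quoted earlier, and assume __per_cpu_start = 0xc0a00000 (this value is made up for the example):

__per_cpu_offset[0] = 0xdfb72000 - 0xc0a00000 + 0x0000 = 0x1f172000
__per_cpu_offset[1] = 0xdfb72000 - 0xc0a00000 + 0xa000 = 0x1f17c000

/* a dynamic allocation that landed at real address 0xdfb73000 (off = 0x1000) returns: */
alloc_ptr = 0xdfb73000 - 0x1f172000 = 0xc0a01000

per_cpu_ptr(alloc_ptr, 1) = 0xc0a01000 + 0x1f17c000 = 0xdfb7d000
                          = 0xdfb73000 + 1 * unit_size   /* cpu 1's copy */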