static struct pcpu_chunk *alloc_pcpu_chunk(void)
{
struct pcpu_chunk *chunk;
chunk = kzalloc(pcpu_chunk_struct_size, GFP_KERNEL);
if (!chunk)
return NULL;
chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
if (!chunk->vm) {
free_pcpu_chunk(chunk);
return NULL;
}
INIT_LIST_HEAD(&chunk->list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
return chunk;
}
/**
* pcpu_alloc - the percpu allocator
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
* @reserved: allocate from the reserved chunk if available
*
* Allocate percpu area of @size bytes aligned at @align.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{
struct pcpu_chunk *chunk;
int slot, off;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
WARN(true, "illegal size (%zu) or align (%zu) for "
"percpu allocation\n", size, align);
return NULL;
}
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
/* serve reserved allocations from the reserved chunk if available */
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
if (size > chunk->contig_hint ||
pcpu_extend_area_map(chunk) < 0)
goto fail_unlock;
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
goto fail_unlock;
}
restart:
/* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
if (size > chunk->contig_hint)
continue;
switch (pcpu_extend_area_map(chunk)) {
case 0:
break;
case 1:
goto restart; /* pcpu_lock dropped, restart */
default:
goto fail_unlock;
}
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
}
}
/* hmmm... no space left, create a new chunk */
spin_unlock_irq(&pcpu_lock);
chunk = alloc_pcpu_chunk();
if (!chunk)
goto fail_unlock_mutex;
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
goto restart;
area_found:
spin_unlock_irq(&pcpu_lock);
/* populate, map and clear the area */
if (pcpu_populate_chunk(chunk, off, size)) {
spin_lock_irq(&pcpu_lock);
pcpu_free_area(chunk, off);
goto fail_unlock;
}
mutex_unlock(&pcpu_alloc_mutex);
/* return address relative to base address */
return __addr_to_pcpu_ptr(chunk->base_addr + off);
fail_unlock:
spin_unlock_irq(&pcpu_lock);
fail_unlock_mutex:
mutex_unlock(&pcpu_alloc_mutex);
return NULL;
}
/**
* __alloc_percpu - allocate dynamic percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Allocate percpu area of @size bytes aligned at @align. Might
* sleep. Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void *__alloc_percpu(size_t size, size_t align)
{
return pcpu_alloc(size, align, false);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
/**
* __alloc_reserved_percpu - allocate reserved percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Allocate percpu area of @size bytes aligned at @align from reserved
* percpu area if arch has set it up; otherwise, allocation is served
* from the same dynamic area. Might sleep. Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void *__alloc_reserved_percpu(size_t size, size_t align)
{
return pcpu_alloc(size, align, true);
}
/**
* pcpu_reclaim - reclaim fully free chunks, workqueue function
* @work: unused
*
* Reclaim all fully free chunks except for the first one.
*
* CONTEXT:
* workqueue context.
*/
static void pcpu_reclaim(struct work_struct *work)
{
LIST_HEAD(todo);
struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, head, list) {
WARN_ON(chunk->immutable);
/* spare the first one */
if (chunk == list_first_entry(head, struct pcpu_chunk, list))
continue;
list_move(&chunk->list, &todo);
}
spin_unlock_irq(&pcpu_lock);
mutex_unlock(&pcpu_alloc_mutex);
list_for_each_entry_safe(chunk, next, &todo, list) {
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
free_pcpu_chunk(chunk);
}
}
/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
* Free percpu area @ptr.
*
* CONTEXT:
* Can be called from atomic context.
*/
void free_percpu(void *ptr)
{
void *addr = __pcpu_ptr_to_addr(ptr);
struct pcpu_chunk *chunk;
unsigned long flags;
int off;
if (!ptr)
return;
spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->base_addr;
pcpu_free_area(chunk, off);
/* if there is more than one fully free chunk, wake up the grim reaper */
if (chunk->free_size == pcpu_unit_size) {
struct pcpu_chunk *pos;
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
if (pos != chunk) {
schedule_work(&pcpu_reclaim_work);
break;
}
}
spin_unlock_irqrestore(&pcpu_lock, flags);
}
EXPORT_SYMBOL_GPL(free_percpu);
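/*
 * Usage sketch (illustrative only, not part of this file): this is the
 * allocate/access/free cycle that the dynamic allocator above serves.
 * "foo_stats" and the functions below are hypothetical.
 */
struct foo_stats {
	unsigned long events;
};

static struct foo_stats *foo_stats;

static int foo_stats_init(void)
{
	/* one copy of struct foo_stats per possible CPU */
	foo_stats = __alloc_percpu(sizeof(struct foo_stats),
				   __alignof__(struct foo_stats));
	if (!foo_stats)
		return -ENOMEM;
	return 0;
}

static void foo_stats_record(void)
{
	/* bump this CPU's counter; get_cpu() disables preemption */
	struct foo_stats *s = per_cpu_ptr(foo_stats, get_cpu());

	s->events++;
	put_cpu();
}

static void foo_stats_exit(void)
{
	/* free_percpu() above may be called from atomic context */
	free_percpu(foo_stats);
}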
static inline size_t pcpu_calc_fc_sizes(size_t static_size,
size_t reserved_size,
ssize_t *dyn_sizep)
{
size_t size_sum;
size_sum = PFN_ALIGN(static_size + reserved_size +
(*dyn_sizep >= 0 ? *dyn_sizep : 0));
if (*dyn_sizep != 0)
*dyn_sizep = size_sum - static_size - reserved_size;
return size_sum;
}
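/*
 * Worked example (illustrative numbers): with static_size = 10000,
 * reserved_size = 0 and *dyn_sizep = -1 (auto) on a 4k-page machine,
 * size_sum = PFN_ALIGN(10000) = 12288 and *dyn_sizep is rewritten to
 * 12288 - 10000 - 0 = 2288, i.e. the dynamic area soaks up whatever
 * page-alignment padding is left over.
 */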
/**
* pcpu_alloc_alloc_info - allocate percpu allocation info
* @nr_groups: the number of groups
* @nr_units: the number of units
*
* Allocate ai which is large enough for @nr_groups groups containing
* @nr_units units. The returned ai's groups[0].cpu_map points to the
* cpu_map array which is long enough for @nr_units and filled with
* NR_CPUS. It's the caller's responsibility to initialize cpu_map
* pointer of other groups.
*
* RETURNS:
* Pointer to the allocated pcpu_alloc_info on success, NULL on
* failure.
*/
struct pcpu_alloc_info * __init pcpu_alloc_alloc_info(int nr_groups,
int nr_units)
{
struct pcpu_alloc_info *ai;
size_t base_size, ai_size;
void *ptr;
int unit;
base_size = ALIGN(sizeof(*ai) + nr_groups * sizeof(ai->groups[0]),
__alignof__(ai->groups[0].cpu_map[0]));
ai_size = base_size + nr_units * sizeof(ai->groups[0].cpu_map[0]);
ptr = alloc_bootmem_nopanic(PFN_ALIGN(ai_size));
if (!ptr)
return NULL;
ai = ptr;
ptr += base_size;
ai->groups[0].cpu_map = ptr;
for (unit = 0; unit < nr_units; unit++)
ai->groups[0].cpu_map[unit] = NR_CPUS;
ai->nr_groups = nr_groups;
ai->__ai_size = PFN_ALIGN(ai_size);
return ai;
}
/**
* pcpu_free_alloc_info - free percpu allocation info
* @ai: pcpu_alloc_info to free
*
* Free @ai which was allocated by pcpu_alloc_alloc_info().
*/
void __init pcpu_free_alloc_info(struct pcpu_alloc_info *ai)
{
free_bootmem(__pa(ai), ai->__ai_size);
}
/**
* pcpu_build_alloc_info - build alloc_info considering distances between CPUs
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
* @atom_size: allocation atom size
* @cpu_distance_fn: callback to determine distance between cpus, optional
*
* This function determines grouping of units, their mappings to cpus
* and other parameters considering needed percpu size, allocation
* atom size and distances between CPUs.
*
* Groups are always multiples of atom size and CPUs which are of
* LOCAL_DISTANCE both ways are grouped together and share space for
* units in the same group. The returned configuration is guaranteed
* to have CPUs on different nodes in different groups and >=75% usage
* of allocated virtual address space.
*
* RETURNS:
* On success, pointer to the new allocation_info is returned. On
* failure, ERR_PTR value is returned.
*/
struct pcpu_alloc_info * __init pcpu_build_alloc_info(
size_t reserved_size, ssize_t dyn_size,
size_t atom_size,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
static int group_map[NR_CPUS] __initdata;
static int group_cnt[NR_CPUS] __initdata;
const size_t static_size = __per_cpu_end - __per_cpu_start;
int group_cnt_max = 0, nr_groups = 1, nr_units = 0;
size_t size_sum, min_unit_size, alloc_size;
int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
int last_allocs, group, unit;
unsigned int cpu, tcpu;
struct pcpu_alloc_info *ai;
unsigned int *cpu_map;
/*
* Determine min_unit_size, alloc_size and max_upa such that
* alloc_size is multiple of atom_size and is the smallest
* which can accommodate 4k aligned segments which are equal to
* or larger than min_unit_size.
*/
size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
alloc_size = roundup(min_unit_size, atom_size);
upa = alloc_size / min_unit_size;
while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
upa--;
max_upa = upa;
/* group cpus according to their proximity */
for_each_possible_cpu(cpu) {
group = 0;
next_group:
for_each_possible_cpu(tcpu) {
if (cpu == tcpu)
break;
if (group_map[tcpu] == group && cpu_distance_fn &&
(cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
group++;
nr_groups = max(nr_groups, group + 1);
goto next_group;
}
}
group_map[cpu] = group;
group_cnt[group]++;
group_cnt_max = max(group_cnt_max, group_cnt[group]);
}
/*
* Expand unit size until address space usage goes over 75%
* and then as much as possible without using more address
* space.
*/
last_allocs = INT_MAX;
for (upa = max_upa; upa; upa--) {
int allocs = 0, wasted = 0;
if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
continue;
for (group = 0; group < nr_groups; group++) {
int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
allocs += this_allocs;
wasted += this_allocs * upa - group_cnt[group];
}
/*
* Don't accept if wastage is over 25%. The
* greater-than comparison ensures upa==1 always
* passes the following check.
*/
if (wasted > num_possible_cpus() / 3)
continue;
/* and then don't consume more memory */
if (allocs > last_allocs)
break;
last_allocs = allocs;
best_upa = upa;
}
upa = best_upa;
/* allocate and fill alloc_info */
for (group = 0; group < nr_groups; group++)
nr_units += roundup(group_cnt[group], upa);
ai = pcpu_alloc_alloc_info(nr_groups, nr_units);
if (!ai)
return ERR_PTR(-ENOMEM);
cpu_map = ai->groups[0].cpu_map;
for (group = 0; group < nr_groups; group++) {
ai->groups[group].cpu_map = cpu_map;
cpu_map += roundup(group_cnt[group], upa);
}
ai->static_size = static_size;
ai->reserved_size = reserved_size;
ai->dyn_size = dyn_size;
ai->unit_size = alloc_size / upa;
ai->atom_size = atom_size;
ai->alloc_size = alloc_size;
for (group = 0, unit = 0; group_cnt[group]; group++) {
struct pcpu_group_info *gi = &ai->groups[group];
/*
* Initialize base_offset as if all groups are located
* back-to-back. The caller should update this to
* reflect actual allocation.
*/
gi->base_offset = unit * ai->unit_size;
for_each_possible_cpu(cpu)
if (group_map[cpu] == group)
gi->cpu_map[gi->nr_units++] = cpu;
gi->nr_units = roundup(gi->nr_units, upa);
unit += gi->nr_units;
}
BUG_ON(unit != nr_units);
return ai;
}
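/*
 * Worked example (illustrative numbers): with size_sum = 64k,
 * atom_size = 2M and 4 possible CPUs all at LOCAL_DISTANCE (one group),
 * min_unit_size = 64k, alloc_size = 2M and max_upa = 32.  upa values of
 * 32, 16 and 8 waste 28, 12 and 4 units respectively, which is more
 * than num_possible_cpus() / 3 = 1, so they are rejected; upa = 4
 * wastes nothing and needs a single 2M allocation, so best_upa = 4 and
 * ai->unit_size = 2M / 4 = 512k.
 */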
/**
* pcpu_dump_alloc_info - print out information about pcpu_alloc_info
* @lvl: loglevel
* @ai: allocation info to dump
*
* Print out information about @ai using loglevel @lvl.
*/
static void pcpu_dump_alloc_info(const char *lvl,
const struct pcpu_alloc_info *ai)
{
int group_width = 1, cpu_width = 1, width;
char empty_str[] = "--------";
int alloc = 0, alloc_end = 0;
int group, v;
int upa, apl; /* units per alloc, allocs per line */
v = ai->nr_groups;
while (v /= 10)
group_width++;
v = num_possible_cpus();
while (v /= 10)
cpu_width++;
empty_str[min_t(int, cpu_width, sizeof(empty_str) - 1)] = '\0';
upa = ai->alloc_size / ai->unit_size;
width = upa * (cpu_width + 1) + group_width + 3;
apl = rounddown_pow_of_two(max(60 / width, 1));
printk("%spcpu-alloc: s%zu r%zu d%zu u%zu alloc=%zu*%zu",
lvl, ai->static_size, ai->reserved_size, ai->dyn_size,
ai->unit_size, ai->alloc_size / ai->atom_size, ai->atom_size);
for (group = 0; group < ai->nr_groups; group++) {
const struct pcpu_group_info *gi = &ai->groups[group];
int unit = 0, unit_end = 0;
BUG_ON(gi->nr_units % upa);
for (alloc_end += gi->nr_units / upa;
alloc < alloc_end; alloc++) {
if (!(alloc % apl)) {
printk("\n");
printk("%spcpu-alloc: ", lvl);
}
printk("[%0*d] ", group_width, group);
for (unit_end += upa; unit < unit_end; unit++)
if (gi->cpu_map[unit] != NR_CPUS)
printk("%0*d ", cpu_width,
gi->cpu_map[unit]);
else
printk("%s ", empty_str);
}
}
printk("\n");
}
/**
* pcpu_setup_first_chunk - initialize the first percpu chunk
* @ai: pcpu_alloc_info describing how the percpu area is shaped
* @base_addr: mapped address
*
* Initialize the first percpu chunk which contains the kernel static
* percpu area. This function is to be called from the arch percpu area
* setup path.
*
* @ai contains all information necessary to initialize the first
* chunk and prime the dynamic percpu allocator.
*
* @ai->static_size is the size of static percpu area.
*
* @ai->reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves
* the first chunk such that it's available only through reserved
* percpu allocation. This is primarily used to serve module percpu
* static areas on architectures where the addressing model has
* limited offset range for symbol relocations to guarantee module
* percpu symbols fall inside the relocatable range.
*
* @ai->dyn_size determines the number of bytes available for dynamic
* allocation in the first chunk. The area between @ai->static_size +
* @ai->reserved_size + @ai->dyn_size and @ai->unit_size is unused.
* @ai->unit_size specifies unit size and must be aligned to PAGE_SIZE
* and equal to or larger than @ai->static_size + @ai->reserved_size +
* @ai->dyn_size.
*
* @ai->atom_size is the allocation atom size and used as alignment
* for vm areas.
*
* @ai->alloc_size is the allocation size and always multiple of
* @ai->atom_size. This is larger than @ai->atom_size if
* @ai->unit_size is larger than @ai->atom_size.
*
* @ai->nr_groups and @ai->groups describe virtual memory layout of
* percpu areas. Units which should be colocated are put into the
* same group. Dynamic VM areas will be allocated according to these
* groupings. If @ai->nr_groups is zero, a single group containing
* all units is assumed.
* The caller should have mapped the first chunk at @base_addr and
* copied static data to each unit.
*
* If the first chunk ends up with both reserved and dynamic areas, it
* is served by two chunks - one to serve the core static and reserved
* areas and the other for the dynamic area. They share the same vm
* and page map but use different area allocation maps to stay away
* from each other. The latter chunk is circulated in the chunk slots
* and available for dynamic allocation like any other chunks.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
void *base_addr)
{
static int smap[2], dmap[2];
size_t dyn_size = ai->dyn_size;
size_t size_sum = ai->static_size + ai->reserved_size + dyn_size;
struct pcpu_chunk *schunk, *dchunk = NULL;
unsigned long *unit_off;
unsigned int cpu;
int *unit_map;
int group, unit, i;
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(ai->nr_groups <= 0);
BUG_ON(!ai->static_size);
BUG_ON(ai->unit_size < size_sum);
BUG_ON(ai->unit_size & ~PAGE_MASK);
BUG_ON(ai->unit_size < PCPU_MIN_UNIT_SIZE);
pcpu_dump_alloc_info(KERN_DEBUG, ai);
/* determine number of units and initialize unit_map and base */
unit_map = alloc_bootmem(nr_cpu_ids * sizeof(unit_map[0]));
unit_off = alloc_bootmem(nr_cpu_ids * sizeof(unit_off[0]));
for (cpu = 0; cpu < nr_cpu_ids; cpu++)
unit_map[cpu] = NR_CPUS;
pcpu_first_unit_cpu = NR_CPUS;
for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
const struct pcpu_group_info *gi = &ai->groups[group];
for (i = 0; i < gi->nr_units; i++) {
cpu = gi->cpu_map[i];
if (cpu == NR_CPUS)
continue;
BUG_ON(cpu > nr_cpu_ids || !cpu_possible(cpu));
BUG_ON(unit_map[cpu] != NR_CPUS);
unit_map[cpu] = unit + i;
unit_off[cpu] = gi->base_offset + i * ai->unit_size;
if (pcpu_first_unit_cpu == NR_CPUS)
pcpu_first_unit_cpu = cpu;
}
}
pcpu_last_unit_cpu = cpu;
pcpu_nr_units = unit;
for_each_possible_cpu(cpu)
BUG_ON(unit_map[cpu] == NR_CPUS);
pcpu_unit_map = unit_map;
pcpu_unit_offsets = unit_off;
/* determine basic parameters */
pcpu_unit_pages = ai->unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
/*
* Allocate chunk slots. The additional last slot is for
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
/*
* Initialize static chunk. If reserved_size is zero, the
* static chunk covers static area + dynamic allocation area
* in the first chunk. If reserved_size is not zero, it
* covers static area + reserved area (mostly used for module
* static percpu allocation).
*/
schunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&schunk->list);
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
bitmap_fill(schunk->populated, pcpu_unit_pages);
if (ai->reserved_size) {
schunk->free_size = ai->reserved_size;
pcpu_reserved_chunk = schunk;
pcpu_reserved_chunk_limit = ai->static_size + ai->reserved_size;
} else {
schunk->free_size = dyn_size;
dyn_size = 0; /* dynamic area covered */
}
schunk->contig_hint = schunk->free_size;
schunk->map[schunk->map_used++] = -ai->static_size;
if (schunk->free_size)
schunk->map[schunk->map_used++] = schunk->free_size;
/* init dynamic chunk if necessary */
if (dyn_size) {
dchunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&dchunk->list);
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
bitmap_fill(dchunk->populated, pcpu_unit_pages);
dchunk->contig_hint = dchunk->free_size = dyn_size;
dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
dchunk->map[dchunk->map_used++] = dchunk->free_size;
}
/* link the first chunk in */
pcpu_first_chunk = dchunk ?: schunk;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
/* we're done */
pcpu_base_addr = base_addr;
return 0;
}
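/*
 * Illustrative first-chunk layout within one unit (not to scale),
 * following the description above:
 *
 *   | static | reserved | dynamic |        unused        |
 *   0        s          s+r       s+r+d                  unit_size
 *
 * With a reserved area, schunk manages [0, s + r) and is reachable only
 * through reserved allocations, while dchunk manages the dynamic area
 * and circulates in the normal chunk slots.
 */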
const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
[PCPU_FC_AUTO] = "auto",
[PCPU_FC_EMBED] = "embed",
[PCPU_FC_PAGE] = "page",
[PCPU_FC_LPAGE] = "lpage",
};
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
static int __init percpu_alloc_setup(char *str)
{
if (0)
/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
else if (!strcmp(str, "embed"))
pcpu_chosen_fc = PCPU_FC_EMBED;
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
else if (!strcmp(str, "page"))
pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
else if (!strcmp(str, "lpage"))
pcpu_chosen_fc = PCPU_FC_LPAGE;
#endif
else
pr_warning("PERCPU: unknown allocator %s specified\n", str);
return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);
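/*
 * Example (hypothetical command line): booting with "percpu_alloc=page"
 * sets pcpu_chosen_fc to PCPU_FC_PAGE on kernels built with
 * CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK, which the arch setup code may
 * then honour when picking the first-chunk allocator.  Unknown values
 * only trigger the warning above and leave the default PCPU_FC_AUTO.
 */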
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
* as a contiguous area using bootmem allocator and used as-is without
* being mapped into vmalloc area. This enables the first chunk to
* piggy back on the linear physical mapping which often uses larger
* page size.
*
* When @dyn_size is positive, dynamic area might be larger than
* specified to fill page alignment. When @dyn_size is auto,
* @dyn_size is just big enough to fill page alignment after static
* and reserved areas.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned to the bootmem allocator.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
{
struct pcpu_alloc_info *ai;
size_t size_sum, chunk_size;
void *base;
int unit, rc;
ai = pcpu_build_alloc_info(reserved_size, dyn_size, PAGE_SIZE, NULL);
if (IS_ERR(ai))
return PTR_ERR(ai);
BUG_ON(ai->nr_groups != 1);
BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
size_sum = ai->static_size + ai->reserved_size + ai->dyn_size;
chunk_size = ai->unit_size * num_possible_cpus();
base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
__pa(MAX_DMA_ADDRESS));
if (!base) {
pr_warning("PERCPU: failed to allocate %zu bytes for "
"embedding\n", chunk_size);
rc = -ENOMEM;
goto out_free_ai;
}
/* return the leftover and copy */
for (unit = 0; unit < num_possible_cpus(); unit++) {
void *ptr = base + unit * ai->unit_size;
free_bootmem(__pa(ptr + size_sum), ai->unit_size - size_sum);
memcpy(ptr, __per_cpu_load, ai->static_size);
}
/* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
PFN_DOWN(size_sum), base, ai->static_size, ai->reserved_size,
ai->dyn_size, ai->unit_size);
rc = pcpu_setup_first_chunk(ai, base);
out_free_ai:
pcpu_free_alloc_info(ai);
return rc;
}
#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
!CONFIG_HAVE_SETUP_PER_CPU_AREA */
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
/**
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
* @free_fn: function to free percpu page, always called with PAGE_SIZE
* @populate_pte_fn: function to populate pte
*
* This is a helper to ease setting up page-remapped first percpu
* chunk and can be called where pcpu_setup_first_chunk() is expected.
*
* This is the basic allocator. Static percpu area is allocated
* page-by-page into vmalloc area.
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init pcpu_page_first_chunk(size_t reserved_size,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
static struct vm_struct vm;
struct pcpu_alloc_info *ai;
char psize_str[16];
struct page **pages;
size_t pages_size;
int unit_pages, unit, i, j, rc;
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
ai = pcpu_build_alloc_info(reserved_size, -1, PAGE_SIZE, NULL);
if (IS_ERR(ai))
return PTR_ERR(ai);
BUG_ON(ai->nr_groups != 1);
BUG_ON(ai->groups[0].nr_units != num_possible_cpus());
unit_pages = ai->unit_size >> PAGE_SHIFT;
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * num_possible_cpus() *
sizeof(pages[0]));
for (unit = 0; unit < num_possible_cpus(); unit++)
unsigned int cpu = ai->groups[0].cpu_map[unit];
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
pr_warning("PERCPU: failed to allocate %s page "
"for cpu%u\n", psize_str, cpu);
/* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC;
vm.size = num_possible_cpus() * ai->unit_size;
vm_area_register_early(&vm, PAGE_SIZE);
for (unit = 0; unit < num_possible_cpus(); unit++) {
unsigned long unit_addr =
(unsigned long)vm.addr + unit * ai->unit_size;
populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */
rc = __pcpu_map_pages(unit_addr, &pages[unit * unit_pages],
unit_pages);
if (rc < 0)
panic("failed to map percpu area, err=%d\n", rc);
/*
* FIXME: Archs with virtual cache should flush local
* cache for the linear mapping here - something
* equivalent to flush_cache_vmap() on the local cpu.
* flush_cache_vmap() can't be used as most supporting
* data structures are not set up yet.
*/
/* copy static data */
memcpy((void *)unit_addr, __per_cpu_load, ai->static_size);
pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
unit_pages, psize_str, vm.addr, ai->static_size,
ai->reserved_size, ai->dyn_size);
rc = pcpu_setup_first_chunk(ai, vm.addr);
goto out_free_ar;
enomem:
while (--j >= 0)
free_fn(page_address(pages[j]), PAGE_SIZE);
rc = -ENOMEM;
out_free_ar:
free_bootmem(__pa(pages), pages_size);
pcpu_free_alloc_info(ai);
return rc;
}
#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
struct pcpul_ent {
void *ptr;
};
static size_t pcpul_size;
static size_t pcpul_lpage_size;
static int pcpul_nr_lpages;
static struct pcpul_ent *pcpul_map;
static bool __init pcpul_unit_to_cpu(int unit, const struct pcpu_alloc_info *ai,
unsigned int *cpup)
{
int group, cunit;
for (group = 0, cunit = 0; group < ai->nr_groups; group++) {
const struct pcpu_group_info *gi = &ai->groups[group];
if (unit < cunit + gi->nr_units) {
*cpup = gi->cpu_map[unit - cunit];
return true;
}
cunit += gi->nr_units;
}
return false;
}
static int __init pcpul_cpu_to_unit(int cpu, const struct pcpu_alloc_info *ai)
{
int group, unit, i;
for (group = 0, unit = 0; group < ai->nr_groups; group++, unit += i) {
const struct pcpu_group_info *gi = &ai->groups[group];
for (i = 0; i < gi->nr_units; i++)
if (gi->cpu_map[i] == cpu)
return unit + i;
}
BUG();
}
/**
* pcpu_lpage_first_chunk - remap the first percpu chunk using large page
* @ai: pcpu_alloc_info
* @alloc_fn: function to allocate percpu lpage, always called with lpage_size
* @free_fn: function to free percpu memory, @size <= lpage_size
* @map_fn: function to map percpu lpage, always called with lpage_size
*
* This allocator uses large page to build and map the first chunk.
* Unlike other helpers, the caller should provide fully initialized
* @ai. This can be done using pcpu_build_alloc_info(). This two
* stage initialization is to allow arch code to evaluate the
* parameters before committing to it.
*
* Large pages are allocated as directed by @unit_map and other
* parameters and mapped to vmalloc space. Unused holes are returned
* to the page allocator. Note that these holes end up being actively
* mapped twice - once in the physical mapping and once in the vmalloc area
* for the first percpu chunk. Depending on architecture, this might
* cause problem when changing page attributes of the returned area.
* These double mapped areas can be detected using
* pcpu_lpage_remapped().
*
* RETURNS:
* 0 on success, -errno on failure.
*/
int __init pcpu_lpage_first_chunk(const struct pcpu_alloc_info *ai,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_map_fn_t map_fn)
{
const size_t lpage_size = ai->atom_size;
int i, nr_units;
size_t chunk_size, map_size;
nr_units = 0;
for (i = 0; i < ai->nr_groups; i++)
nr_units += ai->groups[i].nr_units;
chunk_size = ai->unit_size * nr_units;
BUG_ON(chunk_size % lpage_size);
pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size;
pcpul_lpage_size = lpage_size;
pcpul_nr_lpages = chunk_size / lpage_size;