Newer
Older
chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
if (!chunk->vm) {
free_pcpu_chunk(chunk);
return NULL;
}
INIT_LIST_HEAD(&chunk->list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
return chunk;
}
/**
Tejun Heo
committed
* pcpu_alloc - the percpu allocator
* @align: alignment of area (max PAGE_SIZE)
Tejun Heo
committed
* @reserved: allocate from the reserved chunk if available
* Allocate percpu area of @size bytes aligned at @align.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
Tejun Heo
committed
static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{
struct pcpu_chunk *chunk;
int slot, off;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
WARN(true, "illegal size (%zu) or align (%zu) for "
"percpu allocation\n", size, align);
return NULL;
}
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
Tejun Heo
committed
/* serve reserved allocations from the reserved chunk if available */
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
if (size > chunk->contig_hint ||
pcpu_extend_area_map(chunk) < 0)
goto fail_unlock;
Tejun Heo
committed
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
goto fail_unlock;
Tejun Heo
committed
}
restart:
Tejun Heo
committed
/* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
if (size > chunk->contig_hint)
continue;
switch (pcpu_extend_area_map(chunk)) {
case 0:
break;
case 1:
goto restart; /* pcpu_lock dropped, restart */
default:
goto fail_unlock;
}
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
}
}
/* hmmm... no space left, create a new chunk */
spin_unlock_irq(&pcpu_lock);
chunk = alloc_pcpu_chunk();
if (!chunk)
goto fail_unlock_mutex;
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
goto restart;
spin_unlock_irq(&pcpu_lock);
/* populate, map and clear the area */
if (pcpu_populate_chunk(chunk, off, size)) {
spin_lock_irq(&pcpu_lock);
goto fail_unlock;
mutex_unlock(&pcpu_alloc_mutex);
/* return address relative to unit0 */
return __addr_to_pcpu_ptr(chunk->vm->addr + off);
fail_unlock:
spin_unlock_irq(&pcpu_lock);
fail_unlock_mutex:
mutex_unlock(&pcpu_alloc_mutex);
return NULL;
Tejun Heo
committed
/**
* __alloc_percpu - allocate dynamic percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Allocate percpu area of @size bytes aligned at @align. Might
* sleep. Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
Tejun Heo
committed
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void *__alloc_percpu(size_t size, size_t align)
{
return pcpu_alloc(size, align, false);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
Tejun Heo
committed
/**
* __alloc_reserved_percpu - allocate reserved percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Allocate percpu area of @size bytes aligned at @align from reserved
* percpu area if arch has set it up; otherwise, allocation is served
* from the same dynamic area. Might sleep. Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
Tejun Heo
committed
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void *__alloc_reserved_percpu(size_t size, size_t align)
{
return pcpu_alloc(size, align, true);
}
/**
* pcpu_reclaim - reclaim fully free chunks, workqueue function
* @work: unused
*
* Reclaim all fully free chunks except for the first one.
*
* CONTEXT:
* workqueue context.
*/
static void pcpu_reclaim(struct work_struct *work)
LIST_HEAD(todo);
struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, head, list) {
WARN_ON(chunk->immutable);
/* spare the first one */
if (chunk == list_first_entry(head, struct pcpu_chunk, list))
continue;
list_move(&chunk->list, &todo);
}
spin_unlock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, &todo, list) {
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
free_pcpu_chunk(chunk);
}
}
/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
* Free percpu area @ptr.
*
* CONTEXT:
* Can be called from atomic context.
*/
void free_percpu(void *ptr)
{
void *addr = __pcpu_ptr_to_addr(ptr);
struct pcpu_chunk *chunk;
unsigned long flags;
int off;
if (!ptr)
return;
spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->vm->addr;
pcpu_free_area(chunk, off);
/* if there are more than one fully free chunks, wake up grim reaper */
if (chunk->free_size == pcpu_unit_size) {
struct pcpu_chunk *pos;
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
schedule_work(&pcpu_reclaim_work);
spin_unlock_irqrestore(&pcpu_lock, flags);
}
EXPORT_SYMBOL_GPL(free_percpu);
/**
* pcpu_setup_first_chunk - initialize the first percpu chunk
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes, 0 for none
* @dyn_size: free size for dynamic allocation in bytes
* @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
* @base_addr: mapped address
* @unit_map: cpu -> unit map, NULL for sequential mapping
*
* Initialize the first percpu chunk which contains the kernel static
* perpcu area. This function is to be called from arch percpu area
Tejun Heo
committed
* @reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves
* the first chunk such that it's available only through reserved
* percpu allocation. This is primarily used to serve module percpu
* static areas on architectures where the addressing model has
* limited offset range for symbol relocations to guarantee module
* percpu symbols fall inside the relocatable range.
*
* @dyn_size determines the number of bytes available for dynamic
* allocation in the first chunk. The area between @static_size +
* @reserved_size + @dyn_size and @unit_size is unused.
* @unit_size specifies unit size and must be aligned to PAGE_SIZE and
* equal to or larger than @static_size + @reserved_size + if
* non-negative, @dyn_size.
* The caller should have mapped the first chunk at @base_addr and
* copied static data to each unit.
Tejun Heo
committed
* If the first chunk ends up with both reserved and dynamic areas, it
* is served by two chunks - one to serve the core static and reserved
* areas and the other for the dynamic area. They share the same vm
* and page map but uses different area allocation map to stay away
* from each other. The latter chunk is circulated in the chunk slots
* and available for dynamic allocation like any other chunks.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access.
*/
size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
size_t dyn_size, size_t unit_size,
void *base_addr, const int *unit_map)
static struct vm_struct first_vm;
Tejun Heo
committed
static int smap[2], dmap[2];
size_t size_sum = static_size + reserved_size + dyn_size;
Tejun Heo
committed
struct pcpu_chunk *schunk, *dchunk = NULL;
unsigned int cpu, tcpu;
Tejun Heo
committed
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(!static_size);
BUG_ON(!base_addr);
BUG_ON(unit_size < size_sum);
BUG_ON(unit_size & ~PAGE_MASK);
BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
/* determine number of units and verify and initialize pcpu_unit_map */
if (unit_map) {
int first_unit = INT_MAX, last_unit = INT_MIN;
for_each_possible_cpu(cpu) {
int unit = unit_map[cpu];
BUG_ON(unit < 0);
for_each_possible_cpu(tcpu) {
if (tcpu == cpu)
break;
/* the mapping should be one-to-one */
BUG_ON(unit_map[tcpu] == unit);
}
if (unit < first_unit) {
pcpu_first_unit_cpu = cpu;
first_unit = unit;
}
if (unit > last_unit) {
pcpu_last_unit_cpu = cpu;
last_unit = unit;
}
}
pcpu_nr_units = last_unit + 1;
pcpu_unit_map = unit_map;
} else {
int *identity_map;
/* #units == #cpus, identity mapped */
identity_map = alloc_bootmem(nr_cpu_ids *
sizeof(identity_map[0]));
for_each_possible_cpu(cpu)
identity_map[cpu] = cpu;
pcpu_first_unit_cpu = 0;
pcpu_last_unit_cpu = pcpu_nr_units - 1;
pcpu_nr_units = nr_cpu_ids;
pcpu_unit_map = identity_map;
}
/* determine basic parameters */
pcpu_unit_pages = unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
first_vm.flags = VM_ALLOC;
first_vm.size = pcpu_chunk_size;
first_vm.addr = base_addr;
/*
* Allocate chunk slots. The additional last slot is for
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
Tejun Heo
committed
/*
* Initialize static chunk. If reserved_size is zero, the
* static chunk covers static area + dynamic allocation area
* in the first chunk. If reserved_size is not zero, it
* covers static area + reserved area (mostly used for module
* static percpu allocation).
*/
schunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&schunk->list);
schunk->vm = &first_vm;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
bitmap_fill(schunk->populated, pcpu_unit_pages);
Tejun Heo
committed
if (reserved_size) {
schunk->free_size = reserved_size;
pcpu_reserved_chunk = schunk;
pcpu_reserved_chunk_limit = static_size + reserved_size;
Tejun Heo
committed
} else {
schunk->free_size = dyn_size;
dyn_size = 0; /* dynamic area covered */
}
schunk->contig_hint = schunk->free_size;
schunk->map[schunk->map_used++] = -static_size;
if (schunk->free_size)
schunk->map[schunk->map_used++] = schunk->free_size;
Tejun Heo
committed
/* init dynamic chunk if necessary */
if (dyn_size) {
dchunk = alloc_bootmem(pcpu_chunk_struct_size);
Tejun Heo
committed
INIT_LIST_HEAD(&dchunk->list);
dchunk->vm = &first_vm;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
bitmap_fill(dchunk->populated, pcpu_unit_pages);
Tejun Heo
committed
dchunk->contig_hint = dchunk->free_size = dyn_size;
dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
dchunk->map[dchunk->map_used++] = dchunk->free_size;
}
/* link the first chunk in */
pcpu_first_chunk = dchunk ?: schunk;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
pcpu_base_addr = schunk->vm->addr;
return pcpu_unit_size;
}
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
[PCPU_FC_AUTO] = "auto",
[PCPU_FC_EMBED] = "embed",
[PCPU_FC_PAGE] = "page",
[PCPU_FC_LPAGE] = "lpage",
};
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
static int __init percpu_alloc_setup(char *str)
{
if (0)
/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
else if (!strcmp(str, "embed"))
pcpu_chosen_fc = PCPU_FC_EMBED;
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
else if (!strcmp(str, "page"))
pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
else if (!strcmp(str, "lpage"))
pcpu_chosen_fc = PCPU_FC_LPAGE;
#endif
else
pr_warning("PERCPU: unknown allocator %s specified\n", str);
return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);
static inline size_t pcpu_calc_fc_sizes(size_t static_size,
size_t reserved_size,
ssize_t *dyn_sizep)
{
size_t size_sum;
size_sum = PFN_ALIGN(static_size + reserved_size +
(*dyn_sizep >= 0 ? *dyn_sizep : 0));
if (*dyn_sizep != 0)
*dyn_sizep = size_sum - static_size - reserved_size;
return size_sum;
}
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
* as a contiguous area using bootmem allocator and used as-is without
* being mapped into vmalloc area. This enables the first chunk to
* piggy back on the linear physical mapping which often uses larger
* page size.
*
* When @dyn_size is positive, dynamic area might be larger than
* specified to fill page alignment. When @dyn_size is auto,
* @dyn_size is just big enough to fill page alignment after static
* and reserved areas.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned to the bootmem allocator.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
const size_t static_size = __per_cpu_end - __per_cpu_start;
size_t size_sum, unit_size, chunk_size;
void *base;
unsigned int cpu;
/* determine parameters and allocate */
size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
chunk_size = unit_size * nr_cpu_ids;
base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
__pa(MAX_DMA_ADDRESS));
if (!base) {
pr_warning("PERCPU: failed to allocate %zu bytes for "
"embedding\n", chunk_size);
/* return the leftover and copy */
for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
if (cpu_possible(cpu)) {
free_bootmem(__pa(ptr + size_sum),
unit_size - size_sum);
memcpy(ptr, __per_cpu_load, static_size);
} else
free_bootmem(__pa(ptr), unit_size);
}
/* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
PFN_DOWN(size_sum), base, static_size, reserved_size, dyn_size,
unit_size);
return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
unit_size, base, NULL);
#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
!CONFIG_HAVE_SETUP_PER_CPU_AREA */
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
* @free_fn: funtion to free percpu page, always called with PAGE_SIZE
* @populate_pte_fn: function to populate pte
*
* This is a helper to ease setting up page-remapped first percpu
* chunk and can be called where pcpu_setup_first_chunk() is expected.
*
* This is the basic allocator. Static percpu area is allocated
* page-by-page into vmalloc area.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_populate_pte_fn_t populate_pte_fn)
static struct vm_struct vm;
const size_t static_size = __per_cpu_end - __per_cpu_start;
ssize_t dyn_size = -1;
size_t size_sum, unit_size;
unsigned int cpu;
int i, j;
ssize_t ret;
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
unit_pages = unit_size >> PAGE_SHIFT;
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
j = 0;
for_each_possible_cpu(cpu)
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
pr_warning("PERCPU: failed to allocate %s page "
"for cpu%u\n", psize_str, cpu);
/* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC;
vm.size = nr_cpu_ids * unit_size;
vm_area_register_early(&vm, PAGE_SIZE);
for_each_possible_cpu(cpu) {
unsigned long unit_addr =
(unsigned long)vm.addr + cpu * unit_size;
populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */
ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
unit_pages);
if (ret < 0)
panic("failed to map percpu area, err=%zd\n", ret);
/*
* FIXME: Archs with virtual cache should flush local
* cache for the linear mapping here - something
* equivalent to flush_cache_vmap() on the local cpu.
* flush_cache_vmap() can't be used as most supporting
* data structures are not set up yet.
*/
/* copy static data */
memcpy((void *)unit_addr, __per_cpu_load, static_size);
}
pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
unit_pages, psize_str, vm.addr, static_size, reserved_size,
dyn_size);
ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
unit_size, vm.addr, NULL);
goto out_free_ar;
enomem:
while (--j >= 0)
free_fn(page_address(pages[j]), PAGE_SIZE);
ret = -ENOMEM;
out_free_ar:
#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
/**
* pcpu_lpage_build_unit_map - build unit_map for large page remapping
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_sizep: in/out parameter for dynamic size, -1 for auto
* @unit_sizep: out parameter for unit size
* @unit_map: unit_map to be filled
* @cpu_distance_fn: callback to determine distance between cpus
*
* This function builds cpu -> unit map and determine other parameters
* considering needed percpu size, large page size and distances
* between CPUs in NUMA.
*
* CPUs which are of LOCAL_DISTANCE both ways are grouped together and
* may share units in the same large page. The returned configuration
* is guaranteed to have CPUs on different nodes on different large
* pages and >=75% usage of allocated virtual address space.
*
* RETURNS:
* On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
* returns the number of units to be allocated. -errno on failure.
*/
int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
size_t *unit_sizep, size_t lpage_size,
int *unit_map,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
static int group_map[NR_CPUS] __initdata;
static int group_cnt[NR_CPUS] __initdata;
const size_t static_size = __per_cpu_end - __per_cpu_start;
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
int group_cnt_max = 0;
size_t size_sum, min_unit_size, alloc_size;
int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
int last_allocs;
unsigned int cpu, tcpu;
int group, unit;
/*
* Determine min_unit_size, alloc_size and max_upa such that
* alloc_size is multiple of lpage_size and is the smallest
* which can accomodate 4k aligned segments which are equal to
* or larger than min_unit_size.
*/
size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
alloc_size = roundup(min_unit_size, lpage_size);
upa = alloc_size / min_unit_size;
while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
upa--;
max_upa = upa;
/* group cpus according to their proximity */
for_each_possible_cpu(cpu) {
group = 0;
next_group:
for_each_possible_cpu(tcpu) {
if (cpu == tcpu)
break;
if (group_map[tcpu] == group &&
(cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
group++;
goto next_group;
}
}
group_map[cpu] = group;
group_cnt[group]++;
group_cnt_max = max(group_cnt_max, group_cnt[group]);
}
/*
* Expand unit size until address space usage goes over 75%
* and then as much as possible without using more address
* space.
*/
last_allocs = INT_MAX;
for (upa = max_upa; upa; upa--) {
int allocs = 0, wasted = 0;
if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
continue;
for (group = 0; group_cnt[group]; group++) {
int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
allocs += this_allocs;
wasted += this_allocs * upa - group_cnt[group];
}
/*
* Don't accept if wastage is over 25%. The
* greater-than comparison ensures upa==1 always
* passes the following check.
*/
if (wasted > num_possible_cpus() / 3)
continue;
/* and then don't consume more memory */
if (allocs > last_allocs)
break;
last_allocs = allocs;
best_upa = upa;
}
*unit_sizep = alloc_size / best_upa;
/* assign units to cpus accordingly */
unit = 0;
for (group = 0; group_cnt[group]; group++) {
for_each_possible_cpu(cpu)
if (group_map[cpu] == group)
unit_map[cpu] = unit++;
unit = roundup(unit, best_upa);
}
return unit; /* unit contains aligned number of units */
}
struct pcpul_ent {
void *ptr;
};
static size_t pcpul_size;
static size_t pcpul_lpage_size;
static int pcpul_nr_lpages;
static struct pcpul_ent *pcpul_map;
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
unsigned int *cpup)
{
unsigned int cpu;
for_each_possible_cpu(cpu)
if (unit_map[cpu] == unit) {
if (cpup)
*cpup = cpu;
return true;
}
return false;
}
static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
size_t reserved_size, size_t dyn_size,
size_t unit_size, size_t lpage_size,
const int *unit_map, int nr_units)
{
int width = 1, v = nr_units;
char empty_str[] = "--------";
int upl, lpl; /* units per lpage, lpage per line */
unsigned int cpu;
int lpage, unit;
while (v /= 10)
width++;
empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
upl = max_t(int, lpage_size / unit_size, 1);
lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
static_size, reserved_size, dyn_size, unit_size, lpage_size);
for (lpage = 0, unit = 0; unit < nr_units; unit++) {
if (!(unit % upl)) {
if (!(lpage++ % lpl)) {
printk("\n");
printk("%spcpu-lpage: ", lvl);
} else
printk("| ");
}
if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
printk("%0*d ", width, cpu);
else
printk("%s ", empty_str);
}
printk("\n");
}
/**
* pcpu_lpage_first_chunk - remap the first percpu chunk using large page
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes
* @unit_size: unit size in bytes
* @lpage_size: the size of a large page
* @unit_map: cpu -> unit mapping
* @nr_units: the number of units
* @alloc_fn: function to allocate percpu lpage, always called with lpage_size
* @free_fn: function to free percpu memory, @size <= lpage_size
* @map_fn: function to map percpu lpage, always called with lpage_size
*
* This allocator uses large page to build and map the first chunk.
* Unlike other helpers, the caller should always specify @dyn_size
* and @unit_size. These parameters along with @unit_map and
* @nr_units can be determined using pcpu_lpage_build_unit_map().
* This two stage initialization is to allow arch code to evaluate the
* parameters before committing to it.
*
* Large pages are allocated as directed by @unit_map and other
* parameters and mapped to vmalloc space. Unused holes are returned
* to the page allocator. Note that these holes end up being actively
* mapped twice - once to the physical mapping and to the vmalloc area
* for the first percpu chunk. Depending on architecture, this might
* cause problem when changing page attributes of the returned area.
* These double mapped areas can be detected using
* pcpu_lpage_remapped().
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
size_t unit_size, size_t lpage_size,
const int *unit_map, int nr_units,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_map_fn_t map_fn)
{
const size_t static_size = __per_cpu_end - __per_cpu_start;
size_t chunk_size = unit_size * nr_units;
size_t map_size;
unsigned int cpu;
ssize_t ret;
pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
unit_size, lpage_size, unit_map, nr_units);
BUG_ON(chunk_size % lpage_size);
pcpul_size = static_size + reserved_size + dyn_size;
pcpul_lpage_size = lpage_size;
pcpul_nr_lpages = chunk_size / lpage_size;
/* allocate pointer array and alloc large pages */
map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
pcpul_map = alloc_bootmem(map_size);
/* allocate all pages */
for (i = 0; i < pcpul_nr_lpages; i++) {
size_t offset = i * lpage_size;
int first_unit = offset / unit_size;
int last_unit = (offset + lpage_size - 1) / unit_size;
/* find out which cpu is mapped to this unit */
for (unit = first_unit; unit <= last_unit; unit++)
if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
goto found;
continue;
found:
ptr = alloc_fn(cpu, lpage_size, lpage_size);
if (!ptr) {
pr_warning("PERCPU: failed to allocate large page "
"for cpu%u\n", cpu);
goto enomem;
}
pcpul_map[i].ptr = ptr;
}
/* return unused holes */
for (unit = 0; unit < nr_units; unit++) {
size_t start = unit * unit_size;
size_t end = start + unit_size;
size_t off, next;
/* don't free used part of occupied unit */
if (pcpul_unit_to_cpu(unit, unit_map, NULL))
start += pcpul_size;
/* unit can span more than one page, punch the holes */
for (off = start; off < end; off = next) {
void *ptr = pcpul_map[off / lpage_size].ptr;
next = min(roundup(off + 1, lpage_size), end);
if (ptr)
free_fn(ptr + off % lpage_size, next - off);
}
/* allocate address, map and copy */
vm.flags = VM_ALLOC;
vm.size = chunk_size;
vm_area_register_early(&vm, unit_size);
for (i = 0; i < pcpul_nr_lpages; i++) {
if (!pcpul_map[i].ptr)
continue;
pcpul_map[i].map_addr = vm.addr + i * lpage_size;
map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
}
for_each_possible_cpu(cpu)
memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
static_size);
/* we're ready, commit */
pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n",
vm.addr, static_size, reserved_size, dyn_size, unit_size);
ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
unit_size, vm.addr, unit_map);
/*
* Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped
* lpages are pushed to the end and trimmed.
*/
for (i = 0; i < pcpul_nr_lpages - 1; i++)
for (j = i + 1; j < pcpul_nr_lpages; j++) {
struct pcpul_ent tmp;
if (!pcpul_map[j].ptr)
continue;
if (pcpul_map[i].ptr &&
pcpul_map[i].ptr < pcpul_map[j].ptr)
continue;
tmp = pcpul_map[i];
pcpul_map[i] = pcpul_map[j];
pcpul_map[j] = tmp;
}
while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
pcpul_nr_lpages--;
return ret;
enomem:
for (i = 0; i < pcpul_nr_lpages; i++)
if (pcpul_map[i].ptr)
free_fn(pcpul_map[i].ptr, lpage_size);
1971
1972
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
free_bootmem(__pa(pcpul_map), map_size);
return -ENOMEM;
}
/**
* pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
* @kaddr: the kernel address in question
*
* Determine whether @kaddr falls in the pcpul recycled area. This is
* used by pageattr to detect VM aliases and break up the pcpu large
* page mapping such that the same physical page is not mapped under
* different attributes.
*
* The recycled area is always at the tail of a partially used large
* page.
*
* RETURNS:
* Address of corresponding remapped pcpu address if match is found;
* otherwise, NULL.
*/
void *pcpu_lpage_remapped(void *kaddr)
{
unsigned long lpage_mask = pcpul_lpage_size - 1;
void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
unsigned long offset = (unsigned long)kaddr & lpage_mask;
int left = 0, right = pcpul_nr_lpages - 1;