Newer
Older
chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
if (!chunk->vm) {
free_pcpu_chunk(chunk);
return NULL;
}
INIT_LIST_HEAD(&chunk->list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
return chunk;
}
/**
 * pcpu_alloc - the percpu allocator
 * @size: size of area to allocate in bytes
 * @align: alignment of area (max PAGE_SIZE)
 * @reserved: allocate from the reserved chunk if available
 *
 * Allocate percpu area of @size bytes aligned at @align.
 *
 * CONTEXT:
 * Does GFP_KERNEL allocation.
 *
 * RETURNS:
 * Percpu pointer to the allocated area on success, NULL on failure.
 */
static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{
struct pcpu_chunk *chunk;
int slot, off;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
WARN(true, "illegal size (%zu) or align (%zu) for "
"percpu allocation\n", size, align);
return NULL;
}
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
Tejun Heo
committed
/* serve reserved allocations from the reserved chunk if available */
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
if (size > chunk->contig_hint ||
pcpu_extend_area_map(chunk) < 0)
goto fail_unlock;
Tejun Heo
committed
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
goto fail_unlock;
Tejun Heo
committed
}
restart:
Tejun Heo
committed
/* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
if (size > chunk->contig_hint)
continue;
switch (pcpu_extend_area_map(chunk)) {
case 0:
break;
case 1:
goto restart; /* pcpu_lock dropped, restart */
default:
goto fail_unlock;
}
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
}
}
/* hmmm... no space left, create a new chunk */
spin_unlock_irq(&pcpu_lock);
chunk = alloc_pcpu_chunk();
if (!chunk)
goto fail_unlock_mutex;
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
goto restart;
spin_unlock_irq(&pcpu_lock);
/* populate, map and clear the area */
if (pcpu_populate_chunk(chunk, off, size)) {
spin_lock_irq(&pcpu_lock);
goto fail_unlock;
mutex_unlock(&pcpu_alloc_mutex);
/* return address relative to unit0 */
return __addr_to_pcpu_ptr(chunk->vm->addr + off);
fail_unlock:
spin_unlock_irq(&pcpu_lock);
fail_unlock_mutex:
mutex_unlock(&pcpu_alloc_mutex);
return NULL;
Tejun Heo
committed
/**
* __alloc_percpu - allocate dynamic percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Allocate percpu area of @size bytes aligned at @align. Might
* sleep. Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void *__alloc_percpu(size_t size, size_t align)
{
	/* ordinary dynamic allocation: do not ask for the reserved chunk */
	const bool from_reserved = false;

	return pcpu_alloc(size, align, from_reserved);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
/**
* __alloc_reserved_percpu - allocate reserved percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Allocate percpu area of @size bytes aligned at @align from reserved
* percpu area if arch has set it up; otherwise, allocation is served
* from the same dynamic area. Might sleep. Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void *__alloc_reserved_percpu(size_t size, size_t align)
{
	/*
	 * Request the reserved chunk; pcpu_alloc() only honours this when
	 * a reserved chunk has actually been set up.
	 */
	const bool from_reserved = true;

	return pcpu_alloc(size, align, from_reserved);
}
/**
* pcpu_reclaim - reclaim fully free chunks, workqueue function
* @work: unused
*
* Reclaim all fully free chunks except for the first one.
*
* CONTEXT:
* workqueue context.
*/
static void pcpu_reclaim(struct work_struct *work)
LIST_HEAD(todo);
struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, head, list) {
WARN_ON(chunk->immutable);
/* spare the first one */
if (chunk == list_first_entry(head, struct pcpu_chunk, list))
continue;
list_move(&chunk->list, &todo);
}
spin_unlock_irq(&pcpu_lock);
mutex_unlock(&pcpu_alloc_mutex);
list_for_each_entry_safe(chunk, next, &todo, list) {
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
free_pcpu_chunk(chunk);
}
}
/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
* Free percpu area @ptr.
*
* CONTEXT:
* Can be called from atomic context.
*/
void free_percpu(void *ptr)
{
void *addr = __pcpu_ptr_to_addr(ptr);
struct pcpu_chunk *chunk;
unsigned long flags;
int off;
if (!ptr)
return;
spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->vm->addr;
pcpu_free_area(chunk, off);
/* if there are more than one fully free chunks, wake up grim reaper */
if (chunk->free_size == pcpu_unit_size) {
struct pcpu_chunk *pos;
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
schedule_work(&pcpu_reclaim_work);
spin_unlock_irqrestore(&pcpu_lock, flags);
}
EXPORT_SYMBOL_GPL(free_percpu);
/**
* pcpu_setup_first_chunk - initialize the first percpu chunk
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes, 0 for none
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
* @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
* @base_addr: mapped address
* @unit_map: cpu -> unit map, NULL for sequential mapping
*
* Initialize the first percpu chunk which contains the kernel static
* percpu area. This function is to be called from arch percpu area
* setup path.
*
* @reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves
* the first chunk such that it's available only through reserved
* percpu allocation. This is primarily used to serve module percpu
* static areas on architectures where the addressing model has
* limited offset range for symbol relocations to guarantee module
* percpu symbols fall inside the relocatable range.
*
* @dyn_size, if non-negative, determines the number of bytes
* available for dynamic allocation in the first chunk. Specifying
* non-negative value makes percpu leave alone the area beyond
* @static_size + @reserved_size + @dyn_size.
*
* @unit_size specifies unit size and must be aligned to PAGE_SIZE and
* equal to or larger than @static_size + @reserved_size + if
* non-negative, @dyn_size.
* The caller should have mapped the first chunk at @base_addr and
* copied static data to each unit.
*
* If the first chunk ends up with both reserved and dynamic areas, it
* is served by two chunks - one to serve the core static and reserved
* areas and the other for the dynamic area. They share the same vm
* and page map but uses different area allocation map to stay away
* from each other. The latter chunk is circulated in the chunk slots
* and available for dynamic allocation like any other chunks.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access.
*/
size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
				     ssize_t dyn_size, size_t unit_size,
				     void *base_addr, const int *unit_map)
{
	static struct vm_struct first_vm;
	static int smap[2], dmap[2];
	size_t size_sum = static_size + reserved_size +
			  (dyn_size >= 0 ? dyn_size : 0);
	struct pcpu_chunk *schunk, *dchunk = NULL;
	unsigned int cpu, tcpu;
	int i;

	/* sanity checks */
	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
	BUG_ON(!static_size);
	BUG_ON(!base_addr);
	BUG_ON(unit_size < size_sum);
	BUG_ON(unit_size & ~PAGE_MASK);
	BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);

	/* determine number of units and verify and initialize pcpu_unit_map */
	if (unit_map) {
		int first_unit = INT_MAX, last_unit = INT_MIN;

		for_each_possible_cpu(cpu) {
			int unit = unit_map[cpu];

			BUG_ON(unit < 0);
			for_each_possible_cpu(tcpu) {
				if (tcpu == cpu)
					break;
				/* the mapping should be one-to-one */
				BUG_ON(unit_map[tcpu] == unit);
			}

			/* remember which cpus own the lowest and highest units */
			if (unit < first_unit) {
				pcpu_first_unit_cpu = cpu;
				first_unit = unit;
			}
			if (unit > last_unit) {
				pcpu_last_unit_cpu = cpu;
				last_unit = unit;
			}
		}
		pcpu_nr_units = last_unit + 1;
		pcpu_unit_map = unit_map;
	} else {
		int *identity_map;

		/* #units == #cpus, identity mapped */
		identity_map = alloc_bootmem(nr_cpu_ids *
					     sizeof(identity_map[0]));

		for_each_possible_cpu(cpu)
			identity_map[cpu] = cpu;

		pcpu_first_unit_cpu = 0;
		/* pcpu_nr_units must be set before the last cpu is derived */
		pcpu_nr_units = nr_cpu_ids;
		pcpu_last_unit_cpu = pcpu_nr_units - 1;
		pcpu_unit_map = identity_map;
	}

	/* determine basic parameters */
	pcpu_unit_pages = unit_size >> PAGE_SHIFT;
	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
	pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);

	/* auto: whatever is left of the unit after static and reserved */
	if (dyn_size < 0)
		dyn_size = pcpu_unit_size - static_size - reserved_size;

	first_vm.flags = VM_ALLOC;
	first_vm.size = pcpu_chunk_size;
	first_vm.addr = base_addr;

	/*
	 * Allocate chunk slots.  The additional last slot is for
	 * empty chunks.
	 */
	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
	pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
	for (i = 0; i < pcpu_nr_slots; i++)
		INIT_LIST_HEAD(&pcpu_slot[i]);

	/*
	 * Initialize static chunk.  If reserved_size is zero, the
	 * static chunk covers static area + dynamic allocation area
	 * in the first chunk.  If reserved_size is not zero, it
	 * covers static area + reserved area (mostly used for module
	 * static percpu allocation).
	 */
	schunk = alloc_bootmem(pcpu_chunk_struct_size);
	INIT_LIST_HEAD(&schunk->list);
	schunk->vm = &first_vm;
	schunk->map = smap;
	schunk->map_alloc = ARRAY_SIZE(smap);
	bitmap_fill(schunk->populated, pcpu_unit_pages);

	if (reserved_size) {
		schunk->free_size = reserved_size;
		pcpu_reserved_chunk = schunk;
		pcpu_reserved_chunk_limit = static_size + reserved_size;
	} else {
		schunk->free_size = dyn_size;
		dyn_size = 0;			/* dynamic area covered */
	}
	schunk->contig_hint = schunk->free_size;

	/* negative map entry: the static area is recorded as in use */
	schunk->map[schunk->map_used++] = -static_size;
	if (schunk->free_size)
		schunk->map[schunk->map_used++] = schunk->free_size;

	/* init dynamic chunk if necessary */
	if (dyn_size) {
		dchunk = alloc_bootmem(pcpu_chunk_struct_size);
		INIT_LIST_HEAD(&dchunk->list);
		dchunk->vm = &first_vm;
		dchunk->map = dmap;
		dchunk->map_alloc = ARRAY_SIZE(dmap);
		bitmap_fill(dchunk->populated, pcpu_unit_pages);

		dchunk->contig_hint = dchunk->free_size = dyn_size;
		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
		dchunk->map[dchunk->map_used++] = dchunk->free_size;
	}

	/* link the first chunk in */
	pcpu_first_chunk = dchunk ?: schunk;
	pcpu_chunk_relocate(pcpu_first_chunk, -1);

	/* we're done */
	pcpu_base_addr = schunk->vm->addr;
	return pcpu_unit_size;
}
static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
				 ssize_t *dyn_sizep)
{
	/* static and reserved areas always count towards the total */
	const size_t fixed = static_size + reserved_size;
	size_t total = fixed;

	/* a negative request adds nothing before alignment */
	if (*dyn_sizep >= 0)
		total += *dyn_sizep;
	total = PFN_ALIGN(total);

	/* unless exactly zero was requested, report what is left for dyn */
	if (*dyn_sizep != 0)
		*dyn_sizep = total - fixed;

	return total;
}
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
* as a contiguous area using bootmem allocator and used as-is without
* being mapped into vmalloc area. This enables the first chunk to
* piggy back on the linear physical mapping which often uses larger
* page size.
*
* When @dyn_size is positive, dynamic area might be larger than
* specified to fill page alignment. When @dyn_size is auto,
* @dyn_size is just big enough to fill page alignment after static
* and reserved areas.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned to the bootmem allocator.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
				      ssize_t dyn_size)
{
	size_t size_sum, unit_size, chunk_size;
	void *base;
	unsigned int cpu;

	/* determine parameters and allocate */
	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);

	unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
	chunk_size = unit_size * nr_cpu_ids;

	base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
				       __pa(MAX_DMA_ADDRESS));
	if (!base) {
		pr_warning("PERCPU: failed to allocate %zu bytes for "
			   "embedding\n", chunk_size);
		return -ENOMEM;
	}

	/* return the leftover and copy */
	for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
		/* units are laid out back to back, one per cpu id */
		void *ptr = base + cpu * unit_size;

		if (cpu_possible(cpu)) {
			/* keep the first size_sum bytes, hand back the tail */
			free_bootmem(__pa(ptr + size_sum),
				     unit_size - size_sum);
			memcpy(ptr, __per_cpu_load, static_size);
		} else
			free_bootmem(__pa(ptr), unit_size);
	}

	/* we're ready, commit */
	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
		size_sum >> PAGE_SHIFT, base, static_size);

	return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
				      unit_size, base, NULL);
}
/**
* pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
* @free_fn: function to free percpu page, always called with PAGE_SIZE
* @populate_pte_fn: function to populate pte
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* This is the basic allocator. Static percpu area is allocated
* page-by-page into vmalloc area.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
/*
 * NOTE(review): this function looks truncated - unit_pages, pages_size
 * and pages are used below without a visible declaration, several
 * statements are incomplete and the closing part of the body is
 * missing.  Restore it from the authoritative tree before relying on it.
 */
static struct vm_struct vm;
unsigned int cpu;
int i, j;
ssize_t ret;
/* pages per unit: enough for static + reserved, at least the minimum unit */
unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
PCPU_MIN_UNIT_SIZE));
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
/* NOTE(review): the allocation of pages[] itself is not visible - confirm */
j = 0;
for_each_possible_cpu(cpu)
/*
 * NOTE(review): presumably an inner per-page loop belongs here; j is
 * only decremented in the enomem path below, so it looks like a count
 * of pages obtained so far - confirm.
 */
void *ptr;
ptr = alloc_fn(cpu, PAGE_SIZE);
if (!ptr) {
pr_warning("PERCPU: failed to allocate "
"4k page for cpu%u\n", cpu);
goto enomem;
}
/* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC;
vm.size = nr_cpu_ids * unit_pages << PAGE_SHIFT;
vm_area_register_early(&vm, PAGE_SIZE);
for_each_possible_cpu(cpu) {
/* NOTE(review): the rest of the unit_addr initializer is missing */
unsigned long unit_addr = (unsigned long)vm.addr +
populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */
ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
unit_pages);
if (ret < 0)
panic("failed to map percpu area, err=%zd\n", ret);
/*
* FIXME: Archs with virtual cache should flush local
* cache for the linear mapping here - something
* equivalent to flush_cache_vmap() on the local cpu.
* flush_cache_vmap() can't be used as most supporting
* data structures are not set up yet.
*/
/* copy static data */
memcpy((void *)unit_addr, __per_cpu_load, static_size);
}
/* NOTE(review): the pr_info() argument list below is cut off */
pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
/* hand the mapped area over; -1 asks for automatic dynamic size */
ret = pcpu_setup_first_chunk(static_size, reserved_size, -1,
unit_pages << PAGE_SHIFT, vm.addr, NULL);
goto out_free_ar;
enomem:
/* undo: release every page recorded so far, newest first */
while (--j >= 0)
free_fn(page_address(pages[j]), PAGE_SIZE);
ret = -ENOMEM;
/* NOTE(review): nothing follows this label in view; cleanup and return are missing */
out_free_ar:
/*
* Large page remapping first chunk setup helper
*/
#ifdef CONFIG_NEED_MULTIPLE_NODES
/**
* pcpu_lpage_build_unit_map - build unit_map for large page remapping
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_sizep: in/out parameter for dynamic size, -1 for auto
* @unit_sizep: out parameter for unit size
* @lpage_size: the size of a large page
* @unit_map: unit_map to be filled
* @cpu_distance_fn: callback to determine distance between cpus
*
* This function builds cpu -> unit map and determine other parameters
* considering needed percpu size, large page size and distances
* between CPUs in NUMA.
*
* CPUs which are of LOCAL_DISTANCE both ways are grouped together and
* may share units in the same large page. The returned configuration
* is guaranteed to have CPUs on different nodes on different large
* pages and >=75% usage of allocated virtual address space.
*
* RETURNS:
* On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
* returns the number of units to be allocated. -errno on failure.
*/
int __init pcpu_lpage_build_unit_map(size_t static_size, size_t reserved_size,
				     ssize_t *dyn_sizep, size_t *unit_sizep,
				     size_t lpage_size, int *unit_map,
				     pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
	/* group_map[cpu] is the group of @cpu, group_cnt[group] its size */
	static int group_map[NR_CPUS] __initdata;
	static int group_cnt[NR_CPUS] __initdata;
	int group_cnt_max = 0;
	size_t size_sum, min_unit_size, alloc_size;
	int upa, max_upa, uninitialized_var(best_upa);	/* units_per_alloc */
	int last_allocs;
	unsigned int cpu, tcpu;
	int group, unit;

	/*
	 * Determine min_unit_size, alloc_size and max_upa such that
	 * alloc_size is multiple of lpage_size and is the smallest
	 * which can accommodate 4k aligned segments which are equal to
	 * or larger than min_unit_size.
	 */
	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
	min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);

	alloc_size = roundup(min_unit_size, lpage_size);
	upa = alloc_size / min_unit_size;
	while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
		upa--;
	max_upa = upa;

	/* group cpus according to their proximity */
	for_each_possible_cpu(cpu) {
		group = 0;
	next_group:
		/* only cpus already placed (those before @cpu) are compared */
		for_each_possible_cpu(tcpu) {
			if (cpu == tcpu)
				break;
			if (group_map[tcpu] == group &&
			    (cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
			     cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
				group++;
				goto next_group;
			}
		}
		group_map[cpu] = group;
		group_cnt[group]++;
		group_cnt_max = max(group_cnt_max, group_cnt[group]);
	}

	/*
	 * Expand unit size until address space usage goes over 75%
	 * and then as much as possible without using more address
	 * space.
	 */
	last_allocs = INT_MAX;
	for (upa = max_upa; upa; upa--) {
		int allocs = 0, wasted = 0;

		if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
			continue;

		/*
		 * An empty group terminates the scan; the explicit bound
		 * keeps it inside group_cnt[] when every slot is in use.
		 */
		for (group = 0; group < NR_CPUS && group_cnt[group]; group++) {
			int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
			allocs += this_allocs;
			wasted += this_allocs * upa - group_cnt[group];
		}

		/*
		 * Don't accept if wastage is over 25%.  The
		 * greater-than comparison ensures upa==1 always
		 * passes the following check.
		 */
		if (wasted > num_possible_cpus() / 3)
			continue;

		/* and then don't consume more memory */
		if (allocs > last_allocs)
			break;
		last_allocs = allocs;
		best_upa = upa;
	}
	*unit_sizep = alloc_size / best_upa;

	/* assign units to cpus accordingly */
	unit = 0;
	for (group = 0; group < NR_CPUS && group_cnt[group]; group++) {
		for_each_possible_cpu(cpu)
			if (group_map[cpu] == group)
				unit_map[cpu] = unit++;
		/* each group starts on a fresh allocation boundary */
		unit = roundup(unit, best_upa);
	}

	return unit;	/* unit contains aligned number of units */
}
/* bookkeeping for one large page of the remapped first chunk */
struct pcpul_ent {
	void *ptr;		/* large page as returned by the alloc callback */
	void *map_addr;		/* address the large page is remapped at */
};

static size_t pcpul_size;		/* static + reserved + dynamic bytes per unit */
static size_t pcpul_lpage_size;		/* size of one large page */
static int pcpul_nr_lpages;		/* number of entries in pcpul_map */
static struct pcpul_ent *pcpul_map;	/* NULL until the lpage allocator is set up */
/*
 * Look up the cpu which @unit_map assigns to @unit.  Returns true and, if
 * @cpup is non-NULL, stores the cpu there; returns false when no possible
 * cpu maps to @unit.
 */
static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
				     unsigned int *cpup)
{
	unsigned int owner;

	for_each_possible_cpu(owner) {
		if (unit_map[owner] != unit)
			continue;

		if (cpup != NULL)
			*cpup = owner;
		return true;
	}

	return false;
}
/*
 * Print the large page configuration at log level @lvl: one header line
 * with the sizes, then the unit -> cpu layout with units grouped per
 * large page and unused units shown as dashes.
 */
static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
					size_t reserved_size, size_t dyn_size,
					size_t unit_size, size_t lpage_size,
					const int *unit_map, int nr_units)
{
	char empty_str[] = "--------";
	int upl, lpl;	/* units per lpage, lpage per line */
	int width = 1;
	int rest;
	unsigned int cpu;
	int lpage = 0;
	int unit;

	/* width = number of decimal digits needed for nr_units */
	for (rest = nr_units / 10; rest; rest /= 10)
		width++;

	/* trim the placeholder so that it lines up with the cpu numbers */
	empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';

	upl = max_t(int, lpage_size / unit_size, 1);
	lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));

	printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
	       static_size, reserved_size, dyn_size, unit_size, lpage_size);

	for (unit = 0; unit < nr_units; unit++) {
		/* first unit of a large page: new line or column separator */
		if (unit % upl == 0) {
			if (lpage++ % lpl == 0) {
				printk("\n");
				printk("%spcpu-lpage: ", lvl);
			} else {
				printk("| ");
			}
		}

		if (!pcpul_unit_to_cpu(unit, unit_map, &cpu))
			printk("%s ", empty_str);
		else
			printk("%0*d ", width, cpu);
	}
	printk("\n");
}
/**
* pcpu_lpage_first_chunk - remap the first percpu chunk using large page
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes
* @unit_size: unit size in bytes
* @lpage_size: the size of a large page
* @unit_map: cpu -> unit mapping
* @nr_units: the number of units
* @alloc_fn: function to allocate percpu lpage, always called with lpage_size
* @free_fn: function to free percpu memory, @size <= lpage_size
* @map_fn: function to map percpu lpage, always called with lpage_size
*
* This allocator uses large page to build and map the first chunk.
* Unlike other helpers, the caller should always specify @dyn_size
* and @unit_size. These parameters along with @unit_map and
* @nr_units can be determined using pcpu_lpage_build_unit_map().
* This two stage initialization is to allow arch code to evaluate the
* parameters before committing to it.
*
* Large pages are allocated as directed by @unit_map and other
* parameters and mapped to vmalloc space. Unused holes are returned
* to the page allocator. Note that these holes end up being actively
* mapped twice - once to the physical mapping and to the vmalloc area
* for the first percpu chunk. Depending on architecture, this might
* cause problem when changing page attributes of the returned area.
* These double mapped areas can be detected using
* pcpu_lpage_remapped().
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
				      size_t dyn_size, size_t unit_size,
				      size_t lpage_size, const int *unit_map,
				      int nr_units,
				      pcpu_fc_alloc_fn_t alloc_fn,
				      pcpu_fc_free_fn_t free_fn,
				      pcpu_fc_map_fn_t map_fn)
{
	static struct vm_struct vm;
	size_t chunk_size = unit_size * nr_units;
	size_t map_size;
	unsigned int cpu;
	ssize_t ret;
	int i, j, unit;

	pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
			     unit_size, lpage_size, unit_map, nr_units);

	/* the chunk must consist of whole large pages */
	BUG_ON(chunk_size % lpage_size);

	pcpul_size = static_size + reserved_size + dyn_size;
	pcpul_lpage_size = lpage_size;
	pcpul_nr_lpages = chunk_size / lpage_size;

	/* allocate pointer array and alloc large pages */
	map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
	pcpul_map = alloc_bootmem(map_size);

	/* allocate all pages */
	for (i = 0; i < pcpul_nr_lpages; i++) {
		size_t offset = i * lpage_size;
		int first_unit = offset / unit_size;
		int last_unit = (offset + lpage_size - 1) / unit_size;
		void *ptr;

		/* find out which cpu is mapped to this unit */
		for (unit = first_unit; unit <= last_unit; unit++)
			if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
				goto found;
		/* no cpu uses any unit of this large page, leave it out */
		continue;
	found:
		ptr = alloc_fn(cpu, lpage_size);
		if (!ptr) {
			pr_warning("PERCPU: failed to allocate large page "
				   "for cpu%u\n", cpu);
			goto enomem;
		}

		pcpul_map[i].ptr = ptr;
	}

	/* return unused holes */
	for (unit = 0; unit < nr_units; unit++) {
		size_t start = unit * unit_size;
		size_t end = start + unit_size;
		size_t off, next;

		/* don't free used part of occupied unit */
		if (pcpul_unit_to_cpu(unit, unit_map, NULL))
			start += pcpul_size;

		/* unit can span more than one page, punch the holes */
		for (off = start; off < end; off = next) {
			void *ptr = pcpul_map[off / lpage_size].ptr;
			next = min(roundup(off + 1, lpage_size), end);
			if (ptr)
				free_fn(ptr + off % lpage_size, next - off);
		}
	}

	/* allocate address, map and copy */
	vm.flags = VM_ALLOC;
	vm.size = chunk_size;
	vm_area_register_early(&vm, unit_size);

	for (i = 0; i < pcpul_nr_lpages; i++) {
		if (!pcpul_map[i].ptr)
			continue;
		pcpul_map[i].map_addr = vm.addr + i * lpage_size;
		map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
	}

	for_each_possible_cpu(cpu)
		memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
		       static_size);

	/* we're ready, commit */
	pr_info("PERCPU: Remapped at %p with large pages, static data "
		"%zu bytes\n", vm.addr, static_size);

	ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
				     unit_size, vm.addr, unit_map);

	/*
	 * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
	 * lpages are pushed to the end and trimmed.
	 */
	for (i = 0; i < pcpul_nr_lpages - 1; i++)
		for (j = i + 1; j < pcpul_nr_lpages; j++) {
			struct pcpul_ent tmp;

			if (!pcpul_map[j].ptr)
				continue;
			if (pcpul_map[i].ptr &&
			    pcpul_map[i].ptr < pcpul_map[j].ptr)
				continue;

			tmp = pcpul_map[i];
			pcpul_map[i] = pcpul_map[j];
			pcpul_map[j] = tmp;
		}

	while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
		pcpul_nr_lpages--;

	return ret;

enomem:
	for (i = 0; i < pcpul_nr_lpages; i++)
		if (pcpul_map[i].ptr)
			free_fn(pcpul_map[i].ptr, lpage_size);
	free_bootmem(__pa(pcpul_map), map_size);
	/* pcpu_lpage_remapped() tests pcpul_map; don't leave it dangling */
	pcpul_map = NULL;
	return -ENOMEM;
}
/**
* pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
* @kaddr: the kernel address in question
*
* Determine whether @kaddr falls in the pcpul recycled area. This is
* used by pageattr to detect VM aliases and break up the pcpu large
* page mapping such that the same physical page is not mapped under
* different attributes.
*
* The recycled area is always at the tail of a partially used large
* page.
*
* RETURNS:
* Address of corresponding remapped pcpu address if match is found;
* otherwise, NULL.
*/
void *pcpu_lpage_remapped(void *kaddr)
{
	const unsigned long in_page_mask = pcpul_lpage_size - 1;
	const unsigned long addr = (unsigned long)kaddr;
	/* start of the large page containing @kaddr */
	void *page_base = (void *)(addr & ~in_page_mask);
	int lo, hi;

	/* pcpul in use at all? */
	if (pcpul_map == NULL)
		return NULL;

	/* okay, perform binary search over the entries, keyed by ->ptr */
	lo = 0;
	hi = pcpul_nr_lpages - 1;
	while (lo <= hi) {
		int mid = (lo + hi) / 2;
		void *cand = pcpul_map[mid].ptr;

		if (cand == page_base)
			return pcpul_map[mid].map_addr + (addr & in_page_mask);

		if (cand < page_base)
			lo = mid + 1;
		else
			hi = mid - 1;
	}

	return NULL;
}
#endif
/*
* Generic percpu area setup.
*
* The embedding helper is used because its behavior closely resembles
* the original non-dynamic generic percpu area setup. This is
* important because many archs have addressing restrictions and might
* fail if the percpu area is located far away from the previous
* location. As an added bonus, in non-NUMA cases, embedding is
* generally a good idea TLB-wise because percpu area can piggy back
* on the physical linear memory mapping which uses large page
* mappings on applicable archs.
*/
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
/*
 * Table of unsigned long values with NR_CPUS entries, exported to modules.
 * NOTE(review): presumably holds each cpu's percpu offset and is filled in
 * by setup_per_cpu_areas() below - the body is not visible here, confirm.
 */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
void __init setup_per_cpu_areas(void)
{
size_t static_size = __per_cpu_end - __per_cpu_start;