Newer
Older
chunk->map = pcpu_mem_alloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0]));
chunk->map_alloc = PCPU_DFL_MAP_ALLOC;
chunk->map[chunk->map_used++] = pcpu_unit_size;
chunk->vm = get_vm_area(pcpu_chunk_size, VM_ALLOC);
if (!chunk->vm) {
free_pcpu_chunk(chunk);
return NULL;
}
INIT_LIST_HEAD(&chunk->list);
chunk->free_size = pcpu_unit_size;
chunk->contig_hint = pcpu_unit_size;
return chunk;
}
/**
Tejun Heo
committed
* pcpu_alloc - the percpu allocator
* @align: alignment of area (max PAGE_SIZE)
Tejun Heo
committed
* @reserved: allocate from the reserved chunk if available
* Allocate percpu area of @size bytes aligned at @align.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
Tejun Heo
committed
static void *pcpu_alloc(size_t size, size_t align, bool reserved)
{
struct pcpu_chunk *chunk;
int slot, off;
if (unlikely(!size || size > PCPU_MIN_UNIT_SIZE || align > PAGE_SIZE)) {
WARN(true, "illegal size (%zu) or align (%zu) for "
"percpu allocation\n", size, align);
return NULL;
}
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
Tejun Heo
committed
/* serve reserved allocations from the reserved chunk if available */
if (reserved && pcpu_reserved_chunk) {
chunk = pcpu_reserved_chunk;
if (size > chunk->contig_hint ||
pcpu_extend_area_map(chunk) < 0)
goto fail_unlock;
Tejun Heo
committed
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
goto fail_unlock;
Tejun Heo
committed
}
restart:
Tejun Heo
committed
/* search through normal chunks */
for (slot = pcpu_size_to_slot(size); slot < pcpu_nr_slots; slot++) {
list_for_each_entry(chunk, &pcpu_slot[slot], list) {
if (size > chunk->contig_hint)
continue;
switch (pcpu_extend_area_map(chunk)) {
case 0:
break;
case 1:
goto restart; /* pcpu_lock dropped, restart */
default:
goto fail_unlock;
}
off = pcpu_alloc_area(chunk, size, align);
if (off >= 0)
goto area_found;
}
}
/* hmmm... no space left, create a new chunk */
spin_unlock_irq(&pcpu_lock);
chunk = alloc_pcpu_chunk();
if (!chunk)
goto fail_unlock_mutex;
spin_lock_irq(&pcpu_lock);
pcpu_chunk_relocate(chunk, -1);
goto restart;
spin_unlock_irq(&pcpu_lock);
/* populate, map and clear the area */
if (pcpu_populate_chunk(chunk, off, size)) {
spin_lock_irq(&pcpu_lock);
goto fail_unlock;
mutex_unlock(&pcpu_alloc_mutex);
/* return address relative to unit0 */
return __addr_to_pcpu_ptr(chunk->vm->addr + off);
fail_unlock:
spin_unlock_irq(&pcpu_lock);
fail_unlock_mutex:
mutex_unlock(&pcpu_alloc_mutex);
return NULL;
Tejun Heo
committed
/**
* __alloc_percpu - allocate dynamic percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Allocate percpu area of @size bytes aligned at @align. Might
* sleep. Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
Tejun Heo
committed
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void *__alloc_percpu(size_t size, size_t align)
{
return pcpu_alloc(size, align, false);
}
EXPORT_SYMBOL_GPL(__alloc_percpu);
Tejun Heo
committed
/**
* __alloc_reserved_percpu - allocate reserved percpu area
* @size: size of area to allocate in bytes
* @align: alignment of area (max PAGE_SIZE)
*
* Allocate percpu area of @size bytes aligned at @align from reserved
* percpu area if arch has set it up; otherwise, allocation is served
* from the same dynamic area. Might sleep. Might trigger writeouts.
*
* CONTEXT:
* Does GFP_KERNEL allocation.
*
Tejun Heo
committed
* RETURNS:
* Percpu pointer to the allocated area on success, NULL on failure.
*/
void *__alloc_reserved_percpu(size_t size, size_t align)
{
return pcpu_alloc(size, align, true);
}
/**
* pcpu_reclaim - reclaim fully free chunks, workqueue function
* @work: unused
*
* Reclaim all fully free chunks except for the first one.
*
* CONTEXT:
* workqueue context.
*/
static void pcpu_reclaim(struct work_struct *work)
LIST_HEAD(todo);
struct list_head *head = &pcpu_slot[pcpu_nr_slots - 1];
struct pcpu_chunk *chunk, *next;
mutex_lock(&pcpu_alloc_mutex);
spin_lock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, head, list) {
WARN_ON(chunk->immutable);
/* spare the first one */
if (chunk == list_first_entry(head, struct pcpu_chunk, list))
continue;
list_move(&chunk->list, &todo);
}
spin_unlock_irq(&pcpu_lock);
list_for_each_entry_safe(chunk, next, &todo, list) {
pcpu_depopulate_chunk(chunk, 0, pcpu_unit_size);
free_pcpu_chunk(chunk);
}
}
/**
* free_percpu - free percpu area
* @ptr: pointer to area to free
*
* Free percpu area @ptr.
*
* CONTEXT:
* Can be called from atomic context.
*/
void free_percpu(void *ptr)
{
void *addr = __pcpu_ptr_to_addr(ptr);
struct pcpu_chunk *chunk;
unsigned long flags;
int off;
if (!ptr)
return;
spin_lock_irqsave(&pcpu_lock, flags);
chunk = pcpu_chunk_addr_search(addr);
off = addr - chunk->vm->addr;
pcpu_free_area(chunk, off);
/* if there are more than one fully free chunks, wake up grim reaper */
if (chunk->free_size == pcpu_unit_size) {
struct pcpu_chunk *pos;
list_for_each_entry(pos, &pcpu_slot[pcpu_nr_slots - 1], list)
schedule_work(&pcpu_reclaim_work);
spin_unlock_irqrestore(&pcpu_lock, flags);
}
EXPORT_SYMBOL_GPL(free_percpu);
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
static inline size_t pcpu_calc_fc_sizes(size_t static_size,
size_t reserved_size,
ssize_t *dyn_sizep)
{
size_t size_sum;
size_sum = PFN_ALIGN(static_size + reserved_size +
(*dyn_sizep >= 0 ? *dyn_sizep : 0));
if (*dyn_sizep != 0)
*dyn_sizep = size_sum - static_size - reserved_size;
return size_sum;
}
#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
/**
* pcpu_lpage_build_unit_map - build unit_map for large page remapping
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_sizep: in/out parameter for dynamic size, -1 for auto
* @unit_sizep: out parameter for unit size
* @unit_map: unit_map to be filled
* @cpu_distance_fn: callback to determine distance between cpus
*
* This function builds cpu -> unit map and determine other parameters
* considering needed percpu size, large page size and distances
* between CPUs in NUMA.
*
* CPUs which are of LOCAL_DISTANCE both ways are grouped together and
* may share units in the same large page. The returned configuration
* is guaranteed to have CPUs on different nodes on different large
* pages and >=75% usage of allocated virtual address space.
*
* RETURNS:
* On success, fills in @unit_map, sets *@dyn_sizep, *@unit_sizep and
* returns the number of units to be allocated. -errno on failure.
*/
int __init pcpu_lpage_build_unit_map(size_t reserved_size, ssize_t *dyn_sizep,
size_t *unit_sizep, size_t lpage_size,
int *unit_map,
pcpu_fc_cpu_distance_fn_t cpu_distance_fn)
{
static int group_map[NR_CPUS] __initdata;
static int group_cnt[NR_CPUS] __initdata;
const size_t static_size = __per_cpu_end - __per_cpu_start;
int group_cnt_max = 0;
size_t size_sum, min_unit_size, alloc_size;
int upa, max_upa, uninitialized_var(best_upa); /* units_per_alloc */
int last_allocs;
unsigned int cpu, tcpu;
int group, unit;
/*
* Determine min_unit_size, alloc_size and max_upa such that
* alloc_size is multiple of lpage_size and is the smallest
* which can accomodate 4k aligned segments which are equal to
* or larger than min_unit_size.
*/
size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, dyn_sizep);
min_unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
alloc_size = roundup(min_unit_size, lpage_size);
upa = alloc_size / min_unit_size;
while (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
upa--;
max_upa = upa;
/* group cpus according to their proximity */
for_each_possible_cpu(cpu) {
group = 0;
next_group:
for_each_possible_cpu(tcpu) {
if (cpu == tcpu)
break;
if (group_map[tcpu] == group &&
(cpu_distance_fn(cpu, tcpu) > LOCAL_DISTANCE ||
cpu_distance_fn(tcpu, cpu) > LOCAL_DISTANCE)) {
group++;
goto next_group;
}
}
group_map[cpu] = group;
group_cnt[group]++;
group_cnt_max = max(group_cnt_max, group_cnt[group]);
}
/*
* Expand unit size until address space usage goes over 75%
* and then as much as possible without using more address
* space.
*/
last_allocs = INT_MAX;
for (upa = max_upa; upa; upa--) {
int allocs = 0, wasted = 0;
if (alloc_size % upa || ((alloc_size / upa) & ~PAGE_MASK))
continue;
for (group = 0; group_cnt[group]; group++) {
int this_allocs = DIV_ROUND_UP(group_cnt[group], upa);
allocs += this_allocs;
wasted += this_allocs * upa - group_cnt[group];
}
/*
* Don't accept if wastage is over 25%. The
* greater-than comparison ensures upa==1 always
* passes the following check.
*/
if (wasted > num_possible_cpus() / 3)
continue;
/* and then don't consume more memory */
if (allocs > last_allocs)
break;
last_allocs = allocs;
best_upa = upa;
}
*unit_sizep = alloc_size / best_upa;
/* assign units to cpus accordingly */
unit = 0;
for (group = 0; group_cnt[group]; group++) {
for_each_possible_cpu(cpu)
if (group_map[cpu] == group)
unit_map[cpu] = unit++;
unit = roundup(unit, best_upa);
}
return unit; /* unit contains aligned number of units */
}
static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
unsigned int *cpup);
static void __init pcpul_lpage_dump_cfg(const char *lvl, size_t static_size,
size_t reserved_size, size_t dyn_size,
size_t unit_size, size_t lpage_size,
const int *unit_map, int nr_units)
{
int width = 1, v = nr_units;
char empty_str[] = "--------";
int upl, lpl; /* units per lpage, lpage per line */
unsigned int cpu;
int lpage, unit;
while (v /= 10)
width++;
empty_str[min_t(int, width, sizeof(empty_str) - 1)] = '\0';
upl = max_t(int, lpage_size / unit_size, 1);
lpl = rounddown_pow_of_two(max_t(int, 60 / (upl * (width + 1) + 2), 1));
printk("%spcpu-lpage: sta/res/dyn=%zu/%zu/%zu unit=%zu lpage=%zu", lvl,
static_size, reserved_size, dyn_size, unit_size, lpage_size);
for (lpage = 0, unit = 0; unit < nr_units; unit++) {
if (!(unit % upl)) {
if (!(lpage++ % lpl)) {
printk("\n");
printk("%spcpu-lpage: ", lvl);
} else
printk("| ");
}
if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
printk("%0*d ", width, cpu);
else
printk("%s ", empty_str);
}
printk("\n");
}
#endif
* pcpu_setup_first_chunk - initialize the first percpu chunk
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes, 0 for none
* @dyn_size: free size for dynamic allocation in bytes
* @unit_size: unit size in bytes, must be multiple of PAGE_SIZE
* @base_addr: mapped address
* @unit_map: cpu -> unit map, NULL for sequential mapping
*
* Initialize the first percpu chunk which contains the kernel static
* perpcu area. This function is to be called from arch percpu area
Tejun Heo
committed
* @reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves
* the first chunk such that it's available only through reserved
* percpu allocation. This is primarily used to serve module percpu
* static areas on architectures where the addressing model has
* limited offset range for symbol relocations to guarantee module
* percpu symbols fall inside the relocatable range.
*
* @dyn_size determines the number of bytes available for dynamic
* allocation in the first chunk. The area between @static_size +
* @reserved_size + @dyn_size and @unit_size is unused.
* @unit_size specifies unit size and must be aligned to PAGE_SIZE and
* equal to or larger than @static_size + @reserved_size + if
* non-negative, @dyn_size.
* The caller should have mapped the first chunk at @base_addr and
* copied static data to each unit.
Tejun Heo
committed
* If the first chunk ends up with both reserved and dynamic areas, it
* is served by two chunks - one to serve the core static and reserved
* areas and the other for the dynamic area. They share the same vm
* and page map but uses different area allocation map to stay away
* from each other. The latter chunk is circulated in the chunk slots
* and available for dynamic allocation like any other chunks.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access.
*/
size_t __init pcpu_setup_first_chunk(size_t static_size, size_t reserved_size,
size_t dyn_size, size_t unit_size,
void *base_addr, const int *unit_map)
static struct vm_struct first_vm;
Tejun Heo
committed
static int smap[2], dmap[2];
size_t size_sum = static_size + reserved_size + dyn_size;
Tejun Heo
committed
struct pcpu_chunk *schunk, *dchunk = NULL;
unsigned int cpu, tcpu;
Tejun Heo
committed
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(!static_size);
BUG_ON(!base_addr);
BUG_ON(unit_size < size_sum);
BUG_ON(unit_size & ~PAGE_MASK);
BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
/* determine number of units and verify and initialize pcpu_unit_map */
if (unit_map) {
int first_unit = INT_MAX, last_unit = INT_MIN;
for_each_possible_cpu(cpu) {
int unit = unit_map[cpu];
BUG_ON(unit < 0);
for_each_possible_cpu(tcpu) {
if (tcpu == cpu)
break;
/* the mapping should be one-to-one */
BUG_ON(unit_map[tcpu] == unit);
}
if (unit < first_unit) {
pcpu_first_unit_cpu = cpu;
first_unit = unit;
}
if (unit > last_unit) {
pcpu_last_unit_cpu = cpu;
last_unit = unit;
}
}
pcpu_nr_units = last_unit + 1;
pcpu_unit_map = unit_map;
} else {
int *identity_map;
/* #units == #cpus, identity mapped */
identity_map = alloc_bootmem(nr_cpu_ids *
sizeof(identity_map[0]));
for_each_possible_cpu(cpu)
identity_map[cpu] = cpu;
pcpu_first_unit_cpu = 0;
pcpu_last_unit_cpu = pcpu_nr_units - 1;
pcpu_nr_units = nr_cpu_ids;
pcpu_unit_map = identity_map;
}
/* determine basic parameters */
pcpu_unit_pages = unit_size >> PAGE_SHIFT;
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_chunk_size = pcpu_nr_units * pcpu_unit_size;
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
first_vm.flags = VM_ALLOC;
first_vm.size = pcpu_chunk_size;
first_vm.addr = base_addr;
/*
* Allocate chunk slots. The additional last slot is for
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
Tejun Heo
committed
/*
* Initialize static chunk. If reserved_size is zero, the
* static chunk covers static area + dynamic allocation area
* in the first chunk. If reserved_size is not zero, it
* covers static area + reserved area (mostly used for module
* static percpu allocation).
*/
schunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&schunk->list);
schunk->vm = &first_vm;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
bitmap_fill(schunk->populated, pcpu_unit_pages);
Tejun Heo
committed
if (reserved_size) {
schunk->free_size = reserved_size;
pcpu_reserved_chunk = schunk;
pcpu_reserved_chunk_limit = static_size + reserved_size;
Tejun Heo
committed
} else {
schunk->free_size = dyn_size;
dyn_size = 0; /* dynamic area covered */
}
schunk->contig_hint = schunk->free_size;
schunk->map[schunk->map_used++] = -static_size;
if (schunk->free_size)
schunk->map[schunk->map_used++] = schunk->free_size;
Tejun Heo
committed
/* init dynamic chunk if necessary */
if (dyn_size) {
dchunk = alloc_bootmem(pcpu_chunk_struct_size);
Tejun Heo
committed
INIT_LIST_HEAD(&dchunk->list);
dchunk->vm = &first_vm;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
bitmap_fill(dchunk->populated, pcpu_unit_pages);
Tejun Heo
committed
dchunk->contig_hint = dchunk->free_size = dyn_size;
dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
dchunk->map[dchunk->map_used++] = dchunk->free_size;
}
/* link the first chunk in */
pcpu_first_chunk = dchunk ?: schunk;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
pcpu_base_addr = schunk->vm->addr;
return pcpu_unit_size;
}
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
const char *pcpu_fc_names[PCPU_FC_NR] __initdata = {
[PCPU_FC_AUTO] = "auto",
[PCPU_FC_EMBED] = "embed",
[PCPU_FC_PAGE] = "page",
[PCPU_FC_LPAGE] = "lpage",
};
enum pcpu_fc pcpu_chosen_fc __initdata = PCPU_FC_AUTO;
static int __init percpu_alloc_setup(char *str)
{
if (0)
/* nada */;
#ifdef CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK
else if (!strcmp(str, "embed"))
pcpu_chosen_fc = PCPU_FC_EMBED;
#endif
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
else if (!strcmp(str, "page"))
pcpu_chosen_fc = PCPU_FC_PAGE;
#endif
#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
else if (!strcmp(str, "lpage"))
pcpu_chosen_fc = PCPU_FC_LPAGE;
#endif
else
pr_warning("PERCPU: unknown allocator %s specified\n", str);
return 0;
}
early_param("percpu_alloc", percpu_alloc_setup);
#if defined(CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK) || \
!defined(CONFIG_HAVE_SETUP_PER_CPU_AREA)
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
* as a contiguous area using bootmem allocator and used as-is without
* being mapped into vmalloc area. This enables the first chunk to
* piggy back on the linear physical mapping which often uses larger
* page size.
*
* When @dyn_size is positive, dynamic area might be larger than
* specified to fill page alignment. When @dyn_size is auto,
* @dyn_size is just big enough to fill page alignment after static
* and reserved areas.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned to the bootmem allocator.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_embed_first_chunk(size_t reserved_size, ssize_t dyn_size)
const size_t static_size = __per_cpu_end - __per_cpu_start;
size_t size_sum, unit_size, chunk_size;
void *base;
unsigned int cpu;
/* determine parameters and allocate */
size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
chunk_size = unit_size * nr_cpu_ids;
base = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
__pa(MAX_DMA_ADDRESS));
if (!base) {
pr_warning("PERCPU: failed to allocate %zu bytes for "
"embedding\n", chunk_size);
/* return the leftover and copy */
for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
if (cpu_possible(cpu)) {
free_bootmem(__pa(ptr + size_sum),
unit_size - size_sum);
memcpy(ptr, __per_cpu_load, static_size);
} else
free_bootmem(__pa(ptr), unit_size);
}
/* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages/cpu @%p s%zu r%zu d%zu u%zu\n",
PFN_DOWN(size_sum), base, static_size, reserved_size, dyn_size,
unit_size);
return pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
unit_size, base, NULL);
#endif /* CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK ||
!CONFIG_HAVE_SETUP_PER_CPU_AREA */
#ifdef CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK
* pcpu_page_first_chunk - map the first chunk using PAGE_SIZE pages
* @reserved_size: the size of reserved percpu area in bytes
* @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
* @free_fn: funtion to free percpu page, always called with PAGE_SIZE
* @populate_pte_fn: function to populate pte
*
* This is a helper to ease setting up page-remapped first percpu
* chunk and can be called where pcpu_setup_first_chunk() is expected.
*
* This is the basic allocator. Static percpu area is allocated
* page-by-page into vmalloc area.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_page_first_chunk(size_t reserved_size,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_populate_pte_fn_t populate_pte_fn)
static struct vm_struct vm;
const size_t static_size = __per_cpu_end - __per_cpu_start;
ssize_t dyn_size = -1;
size_t size_sum, unit_size;
unsigned int cpu;
int i, j;
ssize_t ret;
snprintf(psize_str, sizeof(psize_str), "%luK", PAGE_SIZE >> 10);
size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
unit_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
unit_pages = unit_size >> PAGE_SHIFT;
/* unaligned allocations can't be freed, round up to page size */
pages_size = PFN_ALIGN(unit_pages * nr_cpu_ids * sizeof(pages[0]));
j = 0;
for_each_possible_cpu(cpu)
ptr = alloc_fn(cpu, PAGE_SIZE, PAGE_SIZE);
pr_warning("PERCPU: failed to allocate %s page "
"for cpu%u\n", psize_str, cpu);
/* allocate vm area, map the pages and copy static data */
vm.flags = VM_ALLOC;
vm.size = nr_cpu_ids * unit_size;
vm_area_register_early(&vm, PAGE_SIZE);
for_each_possible_cpu(cpu) {
unsigned long unit_addr =
(unsigned long)vm.addr + cpu * unit_size;
populate_pte_fn(unit_addr + (i << PAGE_SHIFT));
/* pte already populated, the following shouldn't fail */
ret = __pcpu_map_pages(unit_addr, &pages[cpu * unit_pages],
unit_pages);
if (ret < 0)
panic("failed to map percpu area, err=%zd\n", ret);
/*
* FIXME: Archs with virtual cache should flush local
* cache for the linear mapping here - something
* equivalent to flush_cache_vmap() on the local cpu.
* flush_cache_vmap() can't be used as most supporting
* data structures are not set up yet.
*/
/* copy static data */
memcpy((void *)unit_addr, __per_cpu_load, static_size);
}
pr_info("PERCPU: %d %s pages/cpu @%p s%zu r%zu d%zu\n",
unit_pages, psize_str, vm.addr, static_size, reserved_size,
dyn_size);
ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
unit_size, vm.addr, NULL);
goto out_free_ar;
enomem:
while (--j >= 0)
free_fn(page_address(pages[j]), PAGE_SIZE);
ret = -ENOMEM;
out_free_ar:
#endif /* CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK */
#ifdef CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK
struct pcpul_ent {
void *ptr;
};
static size_t pcpul_size;
static size_t pcpul_lpage_size;
static int pcpul_nr_lpages;
static struct pcpul_ent *pcpul_map;
static bool __init pcpul_unit_to_cpu(int unit, const int *unit_map,
unsigned int *cpup)
{
unsigned int cpu;
for_each_possible_cpu(cpu)
if (unit_map[cpu] == unit) {
if (cpup)
*cpup = cpu;
return true;
}
return false;
}
/**
* pcpu_lpage_first_chunk - remap the first percpu chunk using large page
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes
* @unit_size: unit size in bytes
* @lpage_size: the size of a large page
* @unit_map: cpu -> unit mapping
* @nr_units: the number of units
* @alloc_fn: function to allocate percpu lpage, always called with lpage_size
* @free_fn: function to free percpu memory, @size <= lpage_size
* @map_fn: function to map percpu lpage, always called with lpage_size
*
* This allocator uses large page to build and map the first chunk.
* Unlike other helpers, the caller should always specify @dyn_size
* and @unit_size. These parameters along with @unit_map and
* @nr_units can be determined using pcpu_lpage_build_unit_map().
* This two stage initialization is to allow arch code to evaluate the
* parameters before committing to it.
*
* Large pages are allocated as directed by @unit_map and other
* parameters and mapped to vmalloc space. Unused holes are returned
* to the page allocator. Note that these holes end up being actively
* mapped twice - once to the physical mapping and to the vmalloc area
* for the first percpu chunk. Depending on architecture, this might
* cause problem when changing page attributes of the returned area.
* These double mapped areas can be detected using
* pcpu_lpage_remapped().
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
ssize_t __init pcpu_lpage_first_chunk(size_t reserved_size, size_t dyn_size,
size_t unit_size, size_t lpage_size,
const int *unit_map, int nr_units,
pcpu_fc_alloc_fn_t alloc_fn,
pcpu_fc_free_fn_t free_fn,
pcpu_fc_map_fn_t map_fn)
{
const size_t static_size = __per_cpu_end - __per_cpu_start;
size_t chunk_size = unit_size * nr_units;
size_t map_size;
unsigned int cpu;
ssize_t ret;
pcpul_lpage_dump_cfg(KERN_DEBUG, static_size, reserved_size, dyn_size,
unit_size, lpage_size, unit_map, nr_units);
BUG_ON(chunk_size % lpage_size);
pcpul_size = static_size + reserved_size + dyn_size;
pcpul_lpage_size = lpage_size;
pcpul_nr_lpages = chunk_size / lpage_size;
/* allocate pointer array and alloc large pages */
map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
pcpul_map = alloc_bootmem(map_size);
/* allocate all pages */
for (i = 0; i < pcpul_nr_lpages; i++) {
size_t offset = i * lpage_size;
int first_unit = offset / unit_size;
int last_unit = (offset + lpage_size - 1) / unit_size;
/* find out which cpu is mapped to this unit */
for (unit = first_unit; unit <= last_unit; unit++)
if (pcpul_unit_to_cpu(unit, unit_map, &cpu))
goto found;
continue;
found:
ptr = alloc_fn(cpu, lpage_size, lpage_size);
if (!ptr) {
pr_warning("PERCPU: failed to allocate large page "
"for cpu%u\n", cpu);
goto enomem;
}
pcpul_map[i].ptr = ptr;
}
/* return unused holes */
for (unit = 0; unit < nr_units; unit++) {
size_t start = unit * unit_size;
size_t end = start + unit_size;
size_t off, next;
/* don't free used part of occupied unit */
if (pcpul_unit_to_cpu(unit, unit_map, NULL))
start += pcpul_size;
/* unit can span more than one page, punch the holes */
for (off = start; off < end; off = next) {
void *ptr = pcpul_map[off / lpage_size].ptr;
next = min(roundup(off + 1, lpage_size), end);
if (ptr)
free_fn(ptr + off % lpage_size, next - off);
}
/* allocate address, map and copy */
vm.flags = VM_ALLOC;
vm.size = chunk_size;
vm_area_register_early(&vm, unit_size);
for (i = 0; i < pcpul_nr_lpages; i++) {
if (!pcpul_map[i].ptr)
continue;
pcpul_map[i].map_addr = vm.addr + i * lpage_size;
map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
}
for_each_possible_cpu(cpu)
memcpy(vm.addr + unit_map[cpu] * unit_size, __per_cpu_load,
static_size);
/* we're ready, commit */
pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n",
vm.addr, static_size, reserved_size, dyn_size, unit_size);
ret = pcpu_setup_first_chunk(static_size, reserved_size, dyn_size,
1946
1947
1948
1949
1950
1951
1952
1953
1954
1955
1956
1957
1958
1959
1960
1961
1962
1963
1964
1965
1966
1967
1968
unit_size, vm.addr, unit_map);
/*
* Sort pcpul_map array for pcpu_lpage_remapped(). Unmapped
* lpages are pushed to the end and trimmed.
*/
for (i = 0; i < pcpul_nr_lpages - 1; i++)
for (j = i + 1; j < pcpul_nr_lpages; j++) {
struct pcpul_ent tmp;
if (!pcpul_map[j].ptr)
continue;
if (pcpul_map[i].ptr &&
pcpul_map[i].ptr < pcpul_map[j].ptr)
continue;
tmp = pcpul_map[i];
pcpul_map[i] = pcpul_map[j];
pcpul_map[j] = tmp;
}
while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
pcpul_nr_lpages--;
return ret;
enomem:
for (i = 0; i < pcpul_nr_lpages; i++)
if (pcpul_map[i].ptr)
free_fn(pcpul_map[i].ptr, lpage_size);
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994
1995
1996
1997
free_bootmem(__pa(pcpul_map), map_size);
return -ENOMEM;
}
/**
* pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
* @kaddr: the kernel address in question
*
* Determine whether @kaddr falls in the pcpul recycled area. This is
* used by pageattr to detect VM aliases and break up the pcpu large
* page mapping such that the same physical page is not mapped under
* different attributes.
*
* The recycled area is always at the tail of a partially used large
* page.
*
* RETURNS:
* Address of corresponding remapped pcpu address if match is found;
* otherwise, NULL.
*/
void *pcpu_lpage_remapped(void *kaddr)
{
unsigned long lpage_mask = pcpul_lpage_size - 1;
void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
unsigned long offset = (unsigned long)kaddr & lpage_mask;