Newer
Older
* it can also return pages after the static area. NULL return
* indicates end of pages for the cpu. Note that @get_page_fn() must
* return the same number of pages for all cpus.
*
* @reserved_size, if non-zero, specifies the amount of bytes to
* reserve after the static area in the first chunk. This reserves
* the first chunk such that it's available only through reserved
* percpu allocation. This is primarily used to serve module percpu
* static areas on architectures where the addressing model has
* limited offset range for symbol relocations to guarantee module
* percpu symbols fall inside the relocatable range.
*
* @dyn_size, if non-negative, determines the number of bytes
* available for dynamic allocation in the first chunk. Specifying
* non-negative value makes percpu leave alone the area beyond
* @static_size + @reserved_size + @dyn_size.
*
* @unit_size, if non-negative, specifies unit size and must be
* aligned to PAGE_SIZE and equal to or larger than @static_size +
* @reserved_size + if non-negative, @dyn_size.
*
* Non-NULL @base_addr means that the caller already allocated virtual
* region for the first chunk and mapped it. percpu must not mess
* with the chunk. Note that @base_addr with negative @unit_size or
* non-NULL @populate_pte_fn doesn't make any sense.
*
* @populate_pte_fn is used to populate the pagetable. NULL means the
* caller already populated the pagetable.
*
* If the first chunk ends up with both reserved and dynamic areas, it
* is served by two chunks - one to serve the core static and reserved
* areas and the other for the dynamic area. They share the same vm
* and page map but use different area allocation maps to stay away
* from each other. The latter chunk is circulated in the chunk slots
* and available for dynamic allocation like any other chunk.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access.
*/
/*
 * NOTE(review): this copy of the function looks damaged - the opening
 * brace, the declarations of i, cpu and err, and the closing braces of
 * the page-assignment loops are not visible, and the bare "Tejun Heo" /
 * "committed" lines are not C. TODO confirm against the canonical source
 * before building.
 */
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
Tejun Heo
committed
size_t static_size, size_t reserved_size,
ssize_t dyn_size, ssize_t unit_size,
void *base_addr,
pcpu_fc_populate_pte_fn_t populate_pte_fn)
/* statically allocated vm area and two-entry area maps used by the first chunk(s) */
static struct vm_struct first_vm;
Tejun Heo
committed
static int smap[2], dmap[2];
/* bytes that must fit in one unit; a negative (auto) dyn_size adds nothing here */
size_t size_sum = static_size + reserved_size +
(dyn_size >= 0 ? dyn_size : 0);
Tejun Heo
committed
struct pcpu_chunk *schunk, *dchunk = NULL;
int nr_pages;
/* sanity checks */
Tejun Heo
committed
BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
BUG_ON(!static_size);
/* an explicit unit size must hold every area, be page aligned and reach the minimum */
if (unit_size >= 0) {
BUG_ON(unit_size < size_sum);
BUG_ON(unit_size & ~PAGE_MASK);
BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
} else
/* base_addr is only accepted together with an explicit unit_size */
BUG_ON(base_addr);
/* a caller-mapped chunk and a pte populator are mutually exclusive */
BUG_ON(base_addr && populate_pte_fn);
/* derive the per-unit and per-chunk geometry */
if (unit_size >= 0)
pcpu_unit_pages = unit_size >> PAGE_SHIFT;
else
pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
PFN_UP(size_sum));
pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
/* chunk struct is sized with room for one page pointer per cpu per unit page */
pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);
/* auto dyn_size: whatever is left of the unit after static and reserved areas */
if (dyn_size < 0)
Tejun Heo
committed
dyn_size = pcpu_unit_size - static_size - reserved_size;
/*
* Allocate chunk slots. The additional last slot is for
* empty chunks.
*/
pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
for (i = 0; i < pcpu_nr_slots; i++)
INIT_LIST_HEAD(&pcpu_slot[i]);
Tejun Heo
committed
/*
* Initialize static chunk. If reserved_size is zero, the
* static chunk covers static area + dynamic allocation area
* in the first chunk. If reserved_size is not zero, it
* covers static area + reserved area (mostly used for module
* static percpu allocation).
*/
schunk = alloc_bootmem(pcpu_chunk_struct_size);
INIT_LIST_HEAD(&schunk->list);
schunk->vm = &first_vm;
schunk->map = smap;
schunk->map_alloc = ARRAY_SIZE(smap);
schunk->page = schunk->page_ar;
Tejun Heo
committed
if (reserved_size) {
/* schunk becomes the reserved chunk; its free space is the reserved area */
schunk->free_size = reserved_size;
pcpu_reserved_chunk = schunk;
pcpu_reserved_chunk_limit = static_size + reserved_size;
Tejun Heo
committed
} else {
schunk->free_size = dyn_size;
dyn_size = 0; /* dynamic area covered */
}
schunk->contig_hint = schunk->free_size;
/* area map: the static area is recorded negated, followed by the free area if any */
schunk->map[schunk->map_used++] = -static_size;
if (schunk->free_size)
schunk->map[schunk->map_used++] = schunk->free_size;
Tejun Heo
committed
/* init dynamic chunk if necessary */
if (dyn_size) {
/* only the bare struct is allocated: dchunk reuses schunk's page array below */
dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
INIT_LIST_HEAD(&dchunk->list);
dchunk->vm = &first_vm;
dchunk->map = dmap;
dchunk->map_alloc = ARRAY_SIZE(dmap);
dchunk->page = schunk->page_ar; /* share page map with schunk */
dchunk->contig_hint = dchunk->free_size = dyn_size;
/* first entry (negated) spans static + reserved, second is the dynamic free area */
dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
dchunk->map[dchunk->map_used++] = dchunk->free_size;
}
/* allocate vm address */
first_vm.flags = VM_ALLOC;
first_vm.size = pcpu_chunk_size;
if (!base_addr)
vm_area_register_early(&first_vm, PAGE_SIZE);
else {
/*
* Pages already mapped. No need to remap into
* vmalloc area. In this case the first chunks can't
* be mapped or unmapped by percpu and are marked
* immutable.
*/
first_vm.addr = base_addr;
schunk->immutable = true;
Tejun Heo
committed
if (dchunk)
dchunk->immutable = true;
}
/* assign pages */
/* nr_pages stays -1 until the first cpu has reported its page count */
nr_pages = -1;
for_each_possible_cpu(cpu) {
for (i = 0; i < pcpu_unit_pages; i++) {
struct page *page = get_page_fn(cpu, i);
/* NULL from get_page_fn ends the page list for this cpu */
if (!page)
break;
*pcpu_chunk_pagep(schunk, cpu, i) = page;
/* the pages handed out must at least cover the static area */
BUG_ON(i < PFN_UP(static_size));
/* every cpu has to report the same number of pages as the first one */
if (nr_pages < 0)
nr_pages = i;
else
BUG_ON(nr_pages != i);
/* map them */
if (populate_pte_fn) {
for_each_possible_cpu(cpu)
for (i = 0; i < nr_pages; i++)
populate_pte_fn(pcpu_chunk_addr(schunk,
cpu, i));
err = pcpu_map(schunk, 0, nr_pages);
if (err)
panic("failed to setup static percpu area, err=%d\n",
err);
}
/* link the first chunk in */
/* dchunk, when it exists, is the one placed in the slots; otherwise schunk */
pcpu_first_chunk = dchunk ?: schunk;
pcpu_chunk_relocate(pcpu_first_chunk, -1);
/* percpu base address is the address of cpu 0, page 0 of schunk */
pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
return pcpu_unit_size;
}
/*
 * Compute the page-aligned size of the first chunk.
 *
 * @static_size and @reserved_size are taken as-is; *@dyn_sizep is added
 * only when it is non-negative.  The sum is rounded up with PFN_ALIGN()
 * and returned.  Unless *@dyn_sizep was exactly zero on entry, it is
 * rewritten to the space left after the static and reserved areas, so
 * an "auto" (negative) or positive request absorbs the alignment slack.
 */
static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
				 ssize_t *dyn_sizep)
{
	const size_t fixed = static_size + reserved_size;
	const ssize_t dyn = *dyn_sizep;
	size_t total;

	if (dyn < 0)
		total = PFN_ALIGN(fixed);
	else
		total = PFN_ALIGN(fixed + dyn);

	/* a request of exactly zero dynamic bytes is left untouched */
	if (dyn)
		*dyn_sizep = total - fixed;

	return total;
}
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
/*
 * Embedding first chunk setup helper.
 *
 * State shared between pcpu_embed_first_chunk() and the page lookup
 * callback below:
 *   pcpue_ptr       - start of the bootmem area holding all units
 *   pcpue_size      - bytes of each unit that are actually in use
 *   pcpue_unit_size - distance in bytes between consecutive units
 */
static void *pcpue_ptr __initdata;
static size_t pcpue_size __initdata;
static size_t pcpue_unit_size __initdata;

/*
 * Page lookup callback: return the page backing page number @pageno of
 * @cpu's unit, or NULL once @pageno is past the used part of the unit.
 */
static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
	const size_t page_off = (size_t)pageno << PAGE_SHIFT;
	void *unit_base;

	if (page_off < pcpue_size) {
		unit_base = pcpue_ptr + cpu * pcpue_unit_size;
		return virt_to_page(unit_base + page_off);
	}

	return NULL;
}
/**
* pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
* @static_size: the size of static percpu area in bytes
* @reserved_size: the size of reserved percpu area in bytes
* @dyn_size: free size for dynamic allocation in bytes, -1 for auto
*
* This is a helper to ease setting up embedded first percpu chunk and
* can be called where pcpu_setup_first_chunk() is expected.
*
* If this function is used to setup the first chunk, it is allocated
* as a contiguous area using bootmem allocator and used as-is without
* being mapped into vmalloc area. This enables the first chunk to
* piggy back on the linear physical mapping which often uses larger
* page size.
*
* When @dyn_size is positive, dynamic area might be larger than
* specified to fill page alignment. When @dyn_size is auto,
* @dyn_size is just big enough to fill page alignment after static
* and reserved areas.
*
* If the needed size is smaller than the minimum or specified unit
* size, the leftover is returned to the bootmem allocator.
*
* RETURNS:
* The determined pcpu_unit_size which can be used to initialize
* percpu access on success, -errno on failure.
*/
/*
 * NOTE(review): this copy of the function looks truncated - the end of
 * the parameter list (dyn_size is used below but never declared), the
 * opening brace, the declaration of chunk_size and the end of the
 * allocation-failure branch are not visible. TODO confirm against the
 * canonical source before building.
 */
ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
unsigned int cpu;
/* determine parameters and allocate */
pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);
/* a unit is never smaller than PCPU_MIN_UNIT_SIZE */
pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
/* one contiguous area holding a unit for every possible cpu */
chunk_size = pcpue_unit_size * num_possible_cpus();
pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
__pa(MAX_DMA_ADDRESS));
if (!pcpue_ptr) {
pr_warning("PERCPU: failed to allocate %zu bytes for "
"embedding\n", chunk_size);
/* return the leftover and copy */
for_each_possible_cpu(cpu) {
void *ptr = pcpue_ptr + cpu * pcpue_unit_size;
/* give back the tail of the unit beyond the pcpue_size bytes in use */
free_bootmem(__pa(ptr + pcpue_size),
pcpue_unit_size - pcpue_size);
/* seed this cpu's unit with the initial static percpu data */
memcpy(ptr, __per_cpu_load, static_size);
}
/* we're ready, commit */
pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);
/* pcpue_ptr is passed as base_addr with a NULL populate_pte_fn */
return pcpu_setup_first_chunk(pcpue_get_page, static_size,
reserved_size, dyn_size,
pcpue_unit_size, pcpue_ptr, NULL);
}
/*
 * 4k page first chunk setup helper.
 *
 * State shared between pcpu_4k_first_chunk() and the page lookup
 * callback below:
 *   pcpu4k_pages      - flat array of page pointers, pcpu4k_unit_pages
 *                       consecutive entries per cpu
 *   pcpu4k_unit_pages - number of pages in each cpu's unit
 */
static struct page **pcpu4k_pages __initdata;
static int pcpu4k_unit_pages __initdata;

/*
 * Page lookup callback: return the page recorded for page number
 * @pageno of @cpu's unit, or NULL once @pageno reaches
 * pcpu4k_unit_pages.
 */
static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
	if (pageno < pcpu4k_unit_pages)
		return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno];

	return NULL;
}
/**
 * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
 * @static_size: the size of static percpu area in bytes
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: function to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up a first percpu chunk built from
 * individual PAGE_SIZE pages and can be called where
 * pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * The temporary page pointer array is released before returning on both
 * the success and the failure path.  If a page allocation fails, every
 * page obtained so far is handed back through @free_fn.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access on success, -ENOMEM if a page could not be allocated.
 */
ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
				   pcpu_fc_alloc_fn_t alloc_fn,
				   pcpu_fc_free_fn_t free_fn,
				   pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct vm;
	size_t pages_size;
	unsigned int cpu;
	int i, j;
	ssize_t ret;

	pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
					 PCPU_MIN_UNIT_SIZE));

	/* unaligned allocations can't be freed, round up to page size */
	pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() *
			       sizeof(pcpu4k_pages[0]));
	pcpu4k_pages = alloc_bootmem(pages_size);

	/* allocate the pages; j counts how many have been recorded so far */
	j = 0;
	for_each_possible_cpu(cpu)
		for (i = 0; i < pcpu4k_unit_pages; i++) {
			void *ptr;

			ptr = alloc_fn(cpu, PAGE_SIZE);
			if (!ptr) {
				pr_warning("PERCPU: failed to allocate "
					   "4k page for cpu%u\n", cpu);
				goto enomem;
			}

			pcpu4k_pages[j++] = virt_to_page(ptr);
		}

	/* allocate vm area, map the pages and copy static data */
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT;
	vm_area_register_early(&vm, PAGE_SIZE);

	for_each_possible_cpu(cpu) {
		unsigned long unit_addr = (unsigned long)vm.addr +
			(cpu * pcpu4k_unit_pages << PAGE_SHIFT);

		for (i = 0; i < pcpu4k_unit_pages; i++)
			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

		/* pte already populated, the following shouldn't fail */
		ret = __pcpu_map_pages(unit_addr,
				       &pcpu4k_pages[cpu * pcpu4k_unit_pages],
				       pcpu4k_unit_pages);
		if (ret < 0)
			panic("failed to map percpu area, err=%zd\n", ret);

		/*
		 * FIXME: Archs with virtual cache should flush local
		 * cache for the linear mapping here - something
		 * equivalent to flush_cache_vmap() on the local cpu.
		 * flush_cache_vmap() can't be used as most supporting
		 * data structures are not set up yet.
		 */

		/* copy static data */
		memcpy((void *)unit_addr, __per_cpu_load, static_size);
	}

	/* we're ready, commit */
	pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
		pcpu4k_unit_pages, static_size);

	/* the area is already mapped: pass vm.addr as base and no pte populator */
	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
				     reserved_size, -1,
				     pcpu4k_unit_pages << PAGE_SHIFT, vm.addr,
				     NULL);
	goto out_free_ar;

enomem:
	/* hand back, newest first, every page recorded before the failure */
	while (--j >= 0)
		free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE);
	ret = -ENOMEM;
out_free_ar:
	free_bootmem(__pa(pcpu4k_pages), pages_size);
	return ret;
}
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
/*
* Large page remapping first chunk setup helper
*/
#ifdef CONFIG_NEED_MULTIPLE_NODES
/*
 * One entry per possible cpu, recording which large page backs that
 * cpu's unit. pcpu_lpage_first_chunk() fills the array indexed by cpu
 * and then sorts it by @ptr so pcpu_lpage_remapped() can binary search
 * it; after sorting, @cpu is the only way to tell which cpu an entry
 * belongs to.
 */
struct pcpul_ent {
/* cpu this entry was allocated for */
unsigned int cpu;
/* address returned by alloc_fn for this cpu's large page */
void *ptr;
};
/* bytes at the start of each large page that are used for the first chunk */
static size_t pcpul_size;
/* size of one unit; set to the large page size */
static size_t pcpul_unit_size;
/* bootmem-allocated array of entries; NULL until the lpage allocator has run */
static struct pcpul_ent *pcpul_map;
/* vm area the large pages are remapped into */
static struct vm_struct pcpul_vm;
/*
 * Page lookup callback: return the page backing page number @pageno of
 * the large page recorded in pcpul_map[@cpu], or NULL once @pageno is
 * past the first pcpul_size bytes of that large page.
 */
static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
{
	const size_t byte_off = (size_t)pageno << PAGE_SHIFT;

	if (byte_off < pcpul_size)
		return virt_to_page(pcpul_map[cpu].ptr + byte_off);

	return NULL;
}
/**
 * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
 * @static_size: the size of static percpu area in bytes
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 * @lpage_size: the size of a large page
 * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
 * @free_fn: function to free percpu memory, @size <= lpage_size
 * @map_fn: function to map percpu lpage, always called with lpage_size
 *
 * This allocator uses large page as unit.  A large page is allocated
 * for each cpu and each is remapped into vmalloc area using large
 * page mapping.  As large page can be quite large, only part of it is
 * used for the first chunk.  Unused part is returned to the bootmem
 * allocator.
 *
 * So, the large pages are mapped twice - once to the physical mapping
 * and to the vmalloc area for the first percpu chunk.  The double
 * mapping does add one more large TLB entry pressure but still is
 * much better than only using 4k mappings while still being NUMA
 * friendly.
 *
 * On allocation failure everything obtained so far is released and
 * pcpul_map is reset to NULL, so pcpu_lpage_remapped() keeps reporting
 * that the large page allocator is not in use.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access on success, -EINVAL if the first chunk does not fit in
 * one large page, -ENOMEM if a large page could not be allocated.
 */
ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
				      ssize_t dyn_size, size_t lpage_size,
				      pcpu_fc_alloc_fn_t alloc_fn,
				      pcpu_fc_free_fn_t free_fn,
				      pcpu_fc_map_fn_t map_fn)
{
	size_t size_sum;
	size_t map_size;
	unsigned int cpu;
	int i, j;
	ssize_t ret;

	/*
	 * Currently supports only single page.  Supporting multiple
	 * pages won't be too difficult if it ever becomes necessary.
	 */
	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);

	pcpul_unit_size = lpage_size;
	pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
	if (pcpul_size > pcpul_unit_size) {
		pr_warning("PERCPU: static data is larger than large page, "
			   "can't use large page\n");
		return -EINVAL;
	}

	/* allocate pointer array and alloc large pages */
	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
	pcpul_map = alloc_bootmem(map_size);

	for_each_possible_cpu(cpu) {
		void *ptr;

		ptr = alloc_fn(cpu, lpage_size);
		if (!ptr) {
			pr_warning("PERCPU: failed to allocate large page "
				   "for cpu%u\n", cpu);
			goto enomem;
		}

		/*
		 * Only use pcpul_size bytes and give back the rest.
		 *
		 * Ingo: The lpage_size up-rounding bootmem is needed
		 * to make sure the partial lpage is still fully RAM -
		 * it's not well-specified to have a incompatible area
		 * (unmapped RAM, device memory, etc.) in that hole.
		 */
		free_fn(ptr + pcpul_size, lpage_size - pcpul_size);

		pcpul_map[cpu].cpu = cpu;
		pcpul_map[cpu].ptr = ptr;

		memcpy(ptr, __per_cpu_load, static_size);
	}

	/* allocate address and map */
	pcpul_vm.flags = VM_ALLOC;
	pcpul_vm.size = num_possible_cpus() * pcpul_unit_size;
	vm_area_register_early(&pcpul_vm, pcpul_unit_size);

	for_each_possible_cpu(cpu)
		map_fn(pcpul_map[cpu].ptr, pcpul_unit_size,
		       pcpul_vm.addr + cpu * pcpul_unit_size);

	/* we're ready, commit */
	pr_info("PERCPU: Remapped at %p with large pages, static data "
		"%zu bytes\n", pcpul_vm.addr, static_size);

	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
				     reserved_size, dyn_size, pcpul_unit_size,
				     pcpul_vm.addr, NULL);

	/* sort pcpul_map array for pcpu_lpage_remapped() */
	for (i = 0; i < num_possible_cpus() - 1; i++)
		for (j = i + 1; j < num_possible_cpus(); j++)
			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
				struct pcpul_ent tmp = pcpul_map[i];
				pcpul_map[i] = pcpul_map[j];
				pcpul_map[j] = tmp;
			}

	return ret;

enomem:
	/* the tail of each page was already returned; only pcpul_size is left */
	for_each_possible_cpu(cpu)
		if (pcpul_map[cpu].ptr)
			free_fn(pcpul_map[cpu].ptr, pcpul_size);
	free_bootmem(__pa(pcpul_map), map_size);
	/*
	 * pcpu_lpage_remapped() treats a non-NULL pcpul_map as "allocator in
	 * use"; don't leave it pointing at the array that was just freed.
	 */
	pcpul_map = NULL;
	return -ENOMEM;
}
/**
 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 * @kaddr: the kernel address in question
 *
 * Look @kaddr up among the large pages recorded in pcpul_map.  This is
 * used by pageattr to detect VM aliases and break up the pcpu large
 * page mapping such that the same physical page is not mapped under
 * different attributes.
 *
 * The recycled area is always at the tail of a partially used large
 * page, so a hit inside the first pcpul_size bytes triggers a warning.
 *
 * RETURNS:
 * Address of corresponding remapped pcpu address if match is found;
 * otherwise, NULL.
 */
void *pcpu_lpage_remapped(void *kaddr)
{
	unsigned long addr, in_page_mask, off;
	void *lpage_start;
	int lo, hi, mid;

	/* pcpul in use at all? */
	if (!pcpul_map)
		return NULL;

	/* split the address into large page start and offset within it */
	addr = (unsigned long)kaddr;
	in_page_mask = pcpul_unit_size - 1;
	off = addr & in_page_mask;
	lpage_start = (void *)(addr & ~in_page_mask);

	/* binary search over the entries, which are ordered by ->ptr */
	lo = 0;
	hi = num_possible_cpus() - 1;
	while (lo <= hi) {
		mid = (lo + hi) / 2;

		if (pcpul_map[mid].ptr == lpage_start) {
			/* it shouldn't be in the area for the first chunk */
			WARN_ON(off < pcpul_size);

			return pcpul_vm.addr +
				pcpul_map[mid].cpu * pcpul_unit_size + off;
		}

		if (pcpul_map[mid].ptr < lpage_start)
			lo = mid + 1;
		else
			hi = mid - 1;
	}

	return NULL;
}
#endif
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
/*
* Generic percpu area setup.
*
* The embedding helper is used because its behavior closely resembles
* the original non-dynamic generic percpu area setup. This is
* important because many archs have addressing restrictions and might
* fail if the percpu area is located far away from the previous
* location. As an added bonus, in non-NUMA cases, embedding is
* generally a good idea TLB-wise because percpu area can piggy back
* on the physical linear memory mapping which uses large page
* mappings on applicable archs.
*/
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
/* per-cpu offsets, filled in by setup_per_cpu_areas() for every possible cpu */
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

/*
 * Generic percpu area setup: build the first chunk with the embedding
 * helper and fill in __per_cpu_offset[] for every possible cpu.
 *
 * Panics, reporting the error code, if the first chunk cannot be set up.
 */
void __init setup_per_cpu_areas(void)
{
	size_t static_size = __per_cpu_end - __per_cpu_start;
	ssize_t unit_size;
	unsigned long delta;
	unsigned int cpu;

	/*
	 * Always reserve area for module percpu variables.  That's
	 * what the legacy allocator did.
	 */
	unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
					   PERCPU_DYNAMIC_RESERVE);
	if (unit_size < 0)
		panic("Failed to initialize percpu areas, err=%zd\n",
		      unit_size);

	/* distance from the linked percpu section to cpu 0's unit */
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + cpu * unit_size;
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */