/*
* linux/mm/percpu.c - percpu memory allocator
*
* Copyright (C) 2009 SUSE Linux Products GmbH
* Copyright (C) 2009 Tejun Heo <tj@kernel.org>
*
* This file is released under the GPLv2.
*
* This is percpu allocator which can handle both static and dynamic
* areas. Percpu areas are allocated in chunks in vmalloc area. Each
* chunk consists of a boot-time determined number of units and the
* first chunk is used for static percpu variables in the kernel image
* (special boot time alloc/init handling necessary as these areas
* need to be brought up before allocation services are running).
* Unit grows as necessary and all units grow or shrink in unison.
* When a chunk is filled up, another chunk is allocated. ie. in
* vmalloc area
*
*  c0                           c1                         c2
*  -------------------          -------------------        ------------
* | u0 | u1 | u2 | u3 |        | u0 | u1 | u2 | u3 |      | u0 | u1 | u
*  -------------------  ......  -------------------  ....  ------------
*
* Allocation is done in offset-size areas of single unit space. Ie,
* an area of 512 bytes at 6k in c1 occupies 512 bytes at 6k of c1:u0,
* c1:u1, c1:u2 and c1:u3. On UMA, units corresponds directly to
* cpus. On NUMA, the mapping can be non-linear and even sparse.
* Percpu access can be done by configuring percpu base registers
* according to cpu to unit mapping and pcpu_unit_size.
* There are usually many small percpu allocations many of them being
* as small as 4 bytes. The allocator organizes chunks into lists
* according to free size and tries to allocate from the fullest one.
* Each chunk keeps the maximum contiguous area size hint which is
* guaranteed to be equal to or larger than the maximum contiguous
* area in the chunk. This helps the allocator not to iterate the
* chunk maps unnecessarily.
*
* Allocation state in each chunk is kept using an array of integers
* on chunk->map. A positive value in the map represents a free
* region and negative allocated. Allocation inside a chunk is done
* by scanning this map sequentially and serving the first matching
* entry. This is mostly copied from the percpu_modalloc() allocator.
* Chunks can be determined from the address using the index field
* in the page struct. The index field contains a pointer to the chunk.
*
* To use this allocator, arch code should do the following.
*
* - drop CONFIG_HAVE_LEGACY_PER_CPU_AREA
*
* - define __addr_to_pcpu_ptr() and __pcpu_ptr_to_addr() to translate
* regular address to percpu pointer and back if they need to be
* different from the default
* - use pcpu_setup_first_chunk() during percpu area initialization to
* setup the first chunk containing the kernel static percpu area
*/
#include <linux/bitmap.h>
#include <linux/bootmem.h>
#include <linux/mm.h>
#include <linux/module.h>
#include <linux/mutex.h>
#include <linux/percpu.h>
#include <linux/pfn.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <asm/cacheflush.h>
#include <asm/sections.h>
#include <asm/tlbflush.h>
/*
 * Tunables for chunk bookkeeping: PCPU_SLOT_BASE_SHIFT feeds the
 * size -> slot computation in __pcpu_size_to_slot() and
 * PCPU_DFL_MAP_ALLOC is the initial (and minimum dynamically
 * allocated) number of entries in a chunk's area map.
 */
#define PCPU_SLOT_BASE_SHIFT 5 /* 1-31 shares the same slot */
#define PCPU_DFL_MAP_ALLOC 16 /* start a map with 16 ents */
/*
 * Default addr <-> pcpu_ptr mapping, override in asm/percpu.h if
 * necessary.  Both directions shift the value by the distance between
 * pcpu_base_addr and __per_cpu_start, so the two macros are exact
 * inverses of each other.
 */
#ifndef __addr_to_pcpu_ptr
/* translate a regular address inside the percpu area to a percpu pointer */
#define __addr_to_pcpu_ptr(addr) \
(void *)((unsigned long)(addr) - (unsigned long)pcpu_base_addr \
+ (unsigned long)__per_cpu_start)
#endif
#ifndef __pcpu_ptr_to_addr
/* translate a percpu pointer back to a regular address */
#define __pcpu_ptr_to_addr(ptr) \
(void *)((unsigned long)(ptr) + (unsigned long)pcpu_base_addr \
- (unsigned long)__per_cpu_start)
#endif
/*
 * Bookkeeping for a single percpu chunk.
 *
 * Allocation state lives in @map: each entry is the size in bytes of
 * one area, positive when free and negative when allocated.
 * @populated is a trailing bitmap with one bit per page of a unit and
 * must stay the last member.
 */
struct pcpu_chunk {
	struct list_head	list;		/* linked to pcpu_slot lists */
	int			free_size;	/* free bytes in the chunk */
	int			contig_hint;	/* max contiguous size hint */
	struct vm_struct	*vm;		/* mapped vmalloc region */
	int			map_used;	/* # of map entries used */
	int			map_alloc;	/* # of map entries allocated */
	int			*map;		/* allocation map */
	bool			immutable;	/* no [de]population allowed */
	unsigned long		populated[];	/* populated bitmap */
};
/* unit/chunk geometry; the values are set up outside this excerpt */
static int pcpu_unit_pages __read_mostly; /* pages per unit */
static int pcpu_unit_size __read_mostly; /* bytes per unit */
static int pcpu_nr_units __read_mostly; /* units per chunk */
static int pcpu_chunk_size __read_mostly; /* presumably bytes per chunk - TODO confirm */
static int pcpu_nr_slots __read_mostly; /* # of chunk slots; last one is for free_size == pcpu_unit_size */
static size_t pcpu_chunk_struct_size __read_mostly; /* presumably size of pcpu_chunk incl. populated[] - TODO confirm */
/* cpus with the lowest and highest unit numbers */
static unsigned int pcpu_first_unit_cpu __read_mostly;
static unsigned int pcpu_last_unit_cpu __read_mostly;
/* the address of the first chunk which starts with the kernel static area */
void *pcpu_base_addr __read_mostly;
EXPORT_SYMBOL_GPL(pcpu_base_addr);
/* cpu -> unit map, indexed by cpu number */
const int *pcpu_unit_map __read_mostly;
/*
* The first chunk which always exists. Note that unlike other
* chunks, this one can be allocated and mapped in several different
* ways and thus often doesn't live in the vmalloc area.
*/
static struct pcpu_chunk *pcpu_first_chunk;
/*
 * Optional reserved chunk.  This chunk reserves part of the first
 * chunk and serves it for reserved allocations.  The amount of
 * reserved offset is in pcpu_reserved_chunk_limit.  When reserved
 * area doesn't exist, the following variables contain NULL and 0
 * respectively.
 */
static struct pcpu_chunk *pcpu_reserved_chunk;
static int pcpu_reserved_chunk_limit;
/*
 * Synchronization rules.
 *
 * There are two locks - pcpu_alloc_mutex and pcpu_lock.  The former
 * protects allocation/reclaim paths, chunks, populated bitmap and
 * vmalloc mapping.  The latter is a spinlock and protects the index
 * data structures - chunk slots, chunks and area maps in chunks.
 *
 * During allocation, pcpu_alloc_mutex is kept locked all the time and
 * pcpu_lock is grabbed and released as necessary.  All actual memory
 * allocations are done using GFP_KERNEL with pcpu_lock released.
 *
 * Free path accesses and alters only the index data structures, so it
 * can be safely called from atomic context.  When memory needs to be
 * returned to the system, free path schedules reclaim_work which
 * grabs both pcpu_alloc_mutex and pcpu_lock, unlinks chunks to be
 * reclaimed, releases both locks and frees the chunks.  Note that it's
 * necessary to grab both locks to remove a chunk from circulation as
 * allocation path might be referencing the chunk with only
 * pcpu_alloc_mutex locked.
 */
/* the two allocator locks; pcpu_lock is taken with spin_lock_irq() in this file */
static DEFINE_MUTEX(pcpu_alloc_mutex); /* protects whole alloc and reclaim */
static DEFINE_SPINLOCK(pcpu_lock); /* protects index data structures */
/* array of list heads indexed by slot number, see pcpu_chunk_slot() */
static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
/* reclaim work to release fully free chunks, scheduled from free path */
static void pcpu_reclaim(struct work_struct *work);
static DECLARE_WORK(pcpu_reclaim_work, pcpu_reclaim);
static int __pcpu_size_to_slot(int size)
return max(highbit - PCPU_SLOT_BASE_SHIFT + 2, 1);
}
/*
 * Map @size to a slot index.  A size equal to a whole unit always maps
 * to the last slot; every other size is classified by
 * __pcpu_size_to_slot().
 */
static int pcpu_size_to_slot(int size)
{
	return size == pcpu_unit_size ? pcpu_nr_slots - 1
				      : __pcpu_size_to_slot(size);
}
static int pcpu_chunk_slot(const struct pcpu_chunk *chunk)
{
if (chunk->free_size < sizeof(int) || chunk->contig_hint < sizeof(int))
return 0;
return pcpu_size_to_slot(chunk->free_size);
}
/*
 * Translate (@cpu, @page_idx) into a chunk-wide page index: the unit
 * assigned to @cpu times the pages per unit, plus the index inside
 * the unit.
 */
static int pcpu_page_idx(unsigned int cpu, int page_idx)
{
	int unit = pcpu_unit_map[cpu];

	return unit * pcpu_unit_pages + page_idx;
}
/*
 * Return the address of page @page_idx of @cpu's unit inside @chunk,
 * computed from the start of the chunk's vm area.
 */
static unsigned long pcpu_chunk_addr(struct pcpu_chunk *chunk,
				     unsigned int cpu, int page_idx)
{
	unsigned long base = (unsigned long)chunk->vm->addr;
	unsigned long offset = pcpu_page_idx(cpu, page_idx) << PAGE_SHIFT;

	return base + offset;
}
static struct page *pcpu_chunk_page(struct pcpu_chunk *chunk,
unsigned int cpu, int page_idx)
/* must not be used on pre-mapped chunk */
WARN_ON(chunk->immutable);
return vmalloc_to_page((void *)pcpu_chunk_addr(chunk, cpu, page_idx));
/* record the owning chunk @pcpu in @page by storing it in page->index */
static void pcpu_set_page_chunk(struct page *page, struct pcpu_chunk *pcpu)
{
	unsigned long owner = (unsigned long)pcpu;

	page->index = owner;
}
/* read back the chunk pointer that pcpu_set_page_chunk() stored in @page */
static struct pcpu_chunk *pcpu_get_page_chunk(struct page *page)
{
	unsigned long owner = page->index;

	return (struct pcpu_chunk *)owner;
}
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
static void pcpu_next_unpop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
{
*rs = find_next_zero_bit(chunk->populated, end, *rs);
*re = find_next_bit(chunk->populated, end, *rs + 1);
}
/*
 * Find the next populated page region of @chunk at or after *@rs and
 * before @end.  On return *@rs is the first populated page and *@re
 * the first unpopulated page after it.
 */
static void pcpu_next_pop(struct pcpu_chunk *chunk, int *rs, int *re, int end)
{
	int first_set = find_next_bit(chunk->populated, end, *rs);

	*rs = first_set;
	*re = find_next_zero_bit(chunk->populated, end, first_set + 1);
}
/*
* (Un)populated page region iterators. Iterate over (un)populated
* page regions between @start and @end in @chunk. @rs and @re should
* be integer variables and will be set to start and end page index of
* the current region.
*
* Both macros evaluate their arguments several times, so pass plain
* variables. Iteration stops as soon as the helper reports an empty
* region, i.e. (rs) >= (re).
*/
#define pcpu_for_each_unpop_region(chunk, rs, re, start, end) \
for ((rs) = (start), pcpu_next_unpop((chunk), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, pcpu_next_unpop((chunk), &(rs), &(re), (end)))
#define pcpu_for_each_pop_region(chunk, rs, re, start, end) \
for ((rs) = (start), pcpu_next_pop((chunk), &(rs), &(re), (end)); \
(rs) < (re); \
(rs) = (re) + 1, pcpu_next_pop((chunk), &(rs), &(re), (end)))
* pcpu_mem_alloc - allocate memory
* @size: bytes to allocate
* Allocate @size bytes. If @size is smaller than PAGE_SIZE,
* kzalloc() is used; otherwise, vmalloc() is used. The returned
* memory is always zeroed.
* CONTEXT:
* Does GFP_KERNEL allocation.
*
* Pointer to the allocated area on success, NULL on failure.
static void *pcpu_mem_alloc(size_t size)
if (size <= PAGE_SIZE)
return kzalloc(size, GFP_KERNEL);
else {
void *ptr = vmalloc(size);
if (ptr)
memset(ptr, 0, size);
return ptr;
}
}
/**
 * pcpu_mem_free - free memory
 * @ptr: memory to free
 * @size: size of the area
 *
 * Free @ptr.  @ptr should have been allocated using pcpu_mem_alloc()
 * with the same @size: the size decides which allocator the memory
 * came from and therefore which free routine must be used.
 */
static void pcpu_mem_free(void *ptr, size_t size)
{
	/* mirror the size test in pcpu_mem_alloc(); never call both */
	if (size <= PAGE_SIZE)
		kfree(ptr);
	else
		vfree(ptr);
}
/**
* pcpu_chunk_relocate - put chunk in the appropriate chunk slot
* @chunk: chunk of interest
* @oslot: the previous slot it was on
*
* This function is called after an allocation or free changed @chunk.
* New slot according to the changed state is determined and @chunk is
Tejun Heo
committed
* moved to the slot. Note that the reserved chunk is never put on
* chunk slots.
*
* CONTEXT:
* pcpu_lock.
*/
static void pcpu_chunk_relocate(struct pcpu_chunk *chunk, int oslot)
{
int nslot = pcpu_chunk_slot(chunk);
Tejun Heo
committed
if (chunk != pcpu_reserved_chunk && oslot != nslot) {
if (oslot < nslot)
list_move(&chunk->list, &pcpu_slot[nslot]);
else
list_move_tail(&chunk->list, &pcpu_slot[nslot]);
}
}
/**
* pcpu_chunk_addr_search - determine chunk containing specified address
* @addr: address for which the chunk needs to be determined.
* RETURNS:
* The address of the found chunk.
*/
static struct pcpu_chunk *pcpu_chunk_addr_search(void *addr)
{
void *first_start = pcpu_first_chunk->vm->addr;
/* is it in the first chunk? */
if (addr >= first_start && addr < first_start + pcpu_unit_size) {
/* is it in the reserved area? */
if (addr < first_start + pcpu_reserved_chunk_limit)
Tejun Heo
committed
return pcpu_reserved_chunk;
return pcpu_first_chunk;
Tejun Heo
committed
}
/*
* The address is relative to unit0 which might be unused and
* thus unmapped. Offset the address to the unit space of the
* current processor before looking it up in the vmalloc
* space. Note that any possible cpu id can be used here, so
* there's no need to worry about preemption or cpu hotplug.
*/
addr += pcpu_unit_map[smp_processor_id()] * pcpu_unit_size;
return pcpu_get_page_chunk(vmalloc_to_page(addr));
/**
* pcpu_extend_area_map - extend area map for allocation
* @chunk: target chunk
*
* Extend area map of @chunk so that it can accomodate an allocation.
* A single allocation can split an area into three areas, so this
* function makes sure that @chunk->map has at least two extra slots.
*
* CONTEXT:
* pcpu_alloc_mutex, pcpu_lock. pcpu_lock is released and reacquired
* if area map is extended.
*
* RETURNS:
* 0 if noop, 1 if successfully extended, -errno on failure.
*/
static int pcpu_extend_area_map(struct pcpu_chunk *chunk)
{
int new_alloc;
int *new;
size_t size;
/* has enough? */
if (chunk->map_alloc >= chunk->map_used + 2)
return 0;
spin_unlock_irq(&pcpu_lock);
new_alloc = PCPU_DFL_MAP_ALLOC;
while (new_alloc < chunk->map_used + 2)
new_alloc *= 2;
new = pcpu_mem_alloc(new_alloc * sizeof(new[0]));
if (!new) {
spin_lock_irq(&pcpu_lock);
return -ENOMEM;
}
/*
* Acquire pcpu_lock and switch to new area map. Only free
* could have happened inbetween, so map_used couldn't have
* grown.
*/
spin_lock_irq(&pcpu_lock);
BUG_ON(new_alloc < chunk->map_used + 2);
size = chunk->map_alloc * sizeof(chunk->map[0]);
memcpy(new, chunk->map, size);
/*
* map_alloc < PCPU_DFL_MAP_ALLOC indicates that the chunk is
* one of the first chunks and still using static map.
*/
if (chunk->map_alloc >= PCPU_DFL_MAP_ALLOC)
pcpu_mem_free(chunk->map, size);
chunk->map_alloc = new_alloc;
chunk->map = new;
return 0;
}
/**
* pcpu_split_block - split a map block
* @chunk: chunk of interest
* @i: index of map block to split
* @head: head size in bytes (can be 0)
* @tail: tail size in bytes (can be 0)
*
* Split the @i'th map block into two or three blocks. If @head is
* non-zero, @head bytes block is inserted before block @i moving it
* to @i+1 and reducing its size by @head bytes.
*
* If @tail is non-zero, the target block, which can be @i or @i+1
* depending on @head, is reduced by @tail bytes and @tail byte block
* is inserted after the target block.
*
* @chunk->map must have enough free slots to accomodate the split.
*
* CONTEXT:
* pcpu_lock.
static void pcpu_split_block(struct pcpu_chunk *chunk, int i,
int head, int tail)
{
int nr_extra = !!head + !!tail;
BUG_ON(chunk->map_alloc < chunk->map_used + nr_extra);
/* insert new subblocks */
memmove(&chunk->map[i + nr_extra], &chunk->map[i],
sizeof(chunk->map[0]) * (chunk->map_used - i));
chunk->map_used += nr_extra;
if (head) {
chunk->map[i + 1] = chunk->map[i] - head;
chunk->map[i++] = head;
}
if (tail) {
chunk->map[i++] -= tail;
chunk->map[i] = tail;
}
}
/**
 * pcpu_alloc_area - allocate area from a pcpu_chunk
 * @chunk: chunk of interest
 * @size: wanted size in bytes
 * @align: wanted align
 *
 * Try to allocate @size bytes area aligned at @align from @chunk.
 * Note that this function only allocates the offset.  It doesn't
 * populate or map the area.
 *
 * @chunk->map must have at least two free slots.
 *
 * CONTEXT:
 * pcpu_lock.
 *
 * RETURNS:
 * Allocated offset in @chunk on success, -1 if no matching area is
 * found.
 */
static int pcpu_alloc_area(struct pcpu_chunk *chunk, int size, int align)
{
	int oslot = pcpu_chunk_slot(chunk);
	int max_contig = 0;
	int i, off;

	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++])) {
		bool is_last = i + 1 == chunk->map_used;
		int head, tail;

		/* extra for alignment requirement */
		head = ALIGN(off, align) - off;
		BUG_ON(i == 0 && head != 0);

		/* negative entries are allocated areas */
		if (chunk->map[i] < 0)
			continue;
		if (chunk->map[i] < head + size) {
			max_contig = max(chunk->map[i], max_contig);
			continue;
		}

		/*
		 * If head is small or the previous block is free,
		 * merge'em.  Note that 'small' is defined as smaller
		 * than sizeof(int), which is very small but isn't too
		 * uncommon for percpu allocations.
		 */
		if (head && (head < sizeof(int) || chunk->map[i - 1] > 0)) {
			if (chunk->map[i - 1] > 0)
				chunk->map[i - 1] += head;
			else {
				chunk->map[i - 1] -= head;
				chunk->free_size -= head;
			}
			chunk->map[i] -= head;
			off += head;
			head = 0;
		}

		/* if tail is small, just keep it around */
		tail = chunk->map[i] - head - size;
		if (tail < sizeof(int))
			tail = 0;

		/* split if warranted */
		if (head || tail) {
			pcpu_split_block(chunk, i, head, tail);
			if (head) {
				i++;
				off += head;
				max_contig = max(chunk->map[i - 1], max_contig);
			}
			if (tail)
				max_contig = max(chunk->map[i + 1], max_contig);
		}

		/* update hint and mark allocated */
		if (is_last)
			chunk->contig_hint = max_contig; /* fully scanned */
		else
			chunk->contig_hint = max(chunk->contig_hint,
						 max_contig);

		chunk->free_size -= chunk->map[i];
		chunk->map[i] = -chunk->map[i];

		pcpu_chunk_relocate(chunk, oslot);
		return off;
	}

	chunk->contig_hint = max_contig;	/* fully scanned */
	pcpu_chunk_relocate(chunk, oslot);

	/* tell the upper layer that this chunk has no matching area */
	return -1;
}
/**
 * pcpu_free_area - free area to a pcpu_chunk
 * @chunk: chunk of interest
 * @freeme: offset of area to free
 *
 * Free area starting from @freeme to @chunk.  Note that this function
 * only modifies the allocation map.  It doesn't depopulate or unmap
 * the area.
 *
 * CONTEXT:
 * pcpu_lock.
 */
static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme)
{
	int oslot = pcpu_chunk_slot(chunk);
	int i, off;

	/* walk the map until the entry starting at @freeme is reached */
	for (i = 0, off = 0; i < chunk->map_used; off += abs(chunk->map[i++]))
		if (off == freeme)
			break;
	BUG_ON(off != freeme);
	BUG_ON(chunk->map[i] > 0);

	/* flip the entry to free and account for it */
	chunk->map[i] = -chunk->map[i];
	chunk->free_size += chunk->map[i];

	/* merge with previous? */
	if (i > 0 && chunk->map[i - 1] >= 0) {
		chunk->map[i - 1] += chunk->map[i];
		chunk->map_used--;
		memmove(&chunk->map[i], &chunk->map[i + 1],
			(chunk->map_used - i) * sizeof(chunk->map[0]));
		i--;
	}
	/* merge with next? */
	if (i + 1 < chunk->map_used && chunk->map[i + 1] >= 0) {
		chunk->map[i] += chunk->map[i + 1];
		chunk->map_used--;
		memmove(&chunk->map[i + 1], &chunk->map[i + 2],
			(chunk->map_used - (i + 1)) * sizeof(chunk->map[0]));
	}

	chunk->contig_hint = max(chunk->map[i], chunk->contig_hint);
	pcpu_chunk_relocate(chunk, oslot);
}
/**
 * pcpu_get_pages_and_bitmap - get temp pages array and bitmap
 * @chunk: chunk of interest
 * @bitmapp: output parameter for bitmap
 * @may_alloc: may allocate the array
 *
 * Returns pointer to array of pointers to struct page and bitmap,
 * both of which can be indexed with pcpu_page_idx().  The returned
 * array is cleared to zero and *@bitmapp is copied from
 * @chunk->populated.  Note that there is only one array and bitmap
 * and access exclusion is the caller's responsibility.
 *
 * CONTEXT:
 * pcpu_alloc_mutex and does GFP_KERNEL allocation if @may_alloc.
 * Otherwise, don't care.
 *
 * RETURNS:
 * Pointer to temp pages array on success, NULL on failure.
 */
static struct page **pcpu_get_pages_and_bitmap(struct pcpu_chunk *chunk,
					       unsigned long **bitmapp,
					       bool may_alloc)
{
	/* allocated on first use and kept for the rest of the uptime */
	static struct page **pages;
	static unsigned long *bitmap;
	size_t pages_size = pcpu_nr_units * pcpu_unit_pages * sizeof(pages[0]);
	size_t bitmap_size = BITS_TO_LONGS(pcpu_unit_pages) *
			     sizeof(unsigned long);

	if (!pages || !bitmap) {
		if (may_alloc && !pages)
			pages = pcpu_mem_alloc(pages_size);
		if (may_alloc && !bitmap)
			bitmap = pcpu_mem_alloc(bitmap_size);
		if (!pages || !bitmap)
			return NULL;
	}

	memset(pages, 0, pages_size);
	bitmap_copy(bitmap, chunk->populated, pcpu_unit_pages);

	*bitmapp = bitmap;
	return pages;
}
/**
 * pcpu_free_pages - free pages which were allocated for @chunk
 * @chunk: chunk pages were allocated for
 * @pages: array of pages to be freed, indexed by pcpu_page_idx()
 * @populated: populated bitmap
 * @page_start: page index of the first page to be freed
 * @page_end: page index of the last page to be freed + 1
 *
 * Walk every possible cpu and release each non-NULL entry of @pages
 * in [@page_start,@page_end).  @chunk and @populated are not touched
 * here.
 */
static void pcpu_free_pages(struct pcpu_chunk *chunk,
			    struct page **pages, unsigned long *populated,
			    int page_start, int page_end)
{
	unsigned int cpu;
	int pg;

	for_each_possible_cpu(cpu) {
		for (pg = page_start; pg < page_end; pg++) {
			struct page *victim = pages[pcpu_page_idx(cpu, pg)];

			/* slots which were never filled stay NULL */
			if (!victim)
				continue;
			__free_page(victim);
		}
	}
}
/**
 * pcpu_alloc_pages - allocates pages for @chunk
 * @chunk: target chunk
 * @pages: array to put the allocated pages into, indexed by pcpu_page_idx()
 * @populated: populated bitmap
 * @page_start: page index of the first page to be allocated
 * @page_end: page index of the last page to be allocated + 1
 *
 * For every possible cpu, allocate one page on that cpu's node for
 * each index in [@page_start,@page_end) and store it in @pages.  On
 * the first failure everything recorded in @pages for the range is
 * released again through pcpu_free_pages().
 *
 * RETURNS:
 * 0 on success, -ENOMEM if a page could not be allocated.
 */
static int pcpu_alloc_pages(struct pcpu_chunk *chunk,
			    struct page **pages, unsigned long *populated,
			    int page_start, int page_end)
{
	const gfp_t gfp = GFP_KERNEL | __GFP_HIGHMEM | __GFP_COLD;
	unsigned int cpu;
	int pg;

	for_each_possible_cpu(cpu) {
		for (pg = page_start; pg < page_end; pg++) {
			struct page **slot = &pages[pcpu_page_idx(cpu, pg)];

			*slot = alloc_pages_node(cpu_to_node(cpu), gfp, 0);
			if (*slot)
				continue;

			/* roll back whatever has been allocated so far */
			pcpu_free_pages(chunk, pages, populated,
					page_start, page_end);
			return -ENOMEM;
		}
	}
	return 0;
}
/**
 * pcpu_pre_unmap_flush - flush cache prior to unmapping
 * @chunk: chunk the regions to be flushed belongs to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages in [@page_start,@page_end) of @chunk are about to be
 * unmapped.  A single flush_cache_vunmap() is issued over the span
 * from the first unit's @page_start to the last unit's @page_end
 * instead of one flush per cpu; this may cover more than strictly
 * needed but keeps the number of flushes constant.
 */
static void pcpu_pre_unmap_flush(struct pcpu_chunk *chunk,
				 int page_start, int page_end)
{
	unsigned long span_start =
		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start);
	unsigned long span_end =
		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end);

	flush_cache_vunmap(span_start, span_end);
}
/* unmap @nr_pages pages starting at @addr without flushing anything */
static void __pcpu_unmap_pages(unsigned long addr, int nr_pages)
{
	int nr_bytes = nr_pages << PAGE_SHIFT;

	unmap_kernel_range_noflush(addr, nr_bytes);
}
/**
* pcpu_unmap_pages - unmap pages out of a pcpu_chunk
* @pages: pages array which can be used to pass information to free
* @populated: populated bitmap
* @page_start: page index of the first page to unmap
* @page_end: page index of the last page to unmap + 1
*
* For each cpu, unmap pages [@page_start,@page_end) out of @chunk.
* Corresponding elements in @pages were cleared by the caller and can
* be used to carry information to pcpu_free_pages() which will be
* called after all unmaps are finished. The caller should call
* proper pre/post flush functions.
static void pcpu_unmap_pages(struct pcpu_chunk *chunk,
struct page **pages, unsigned long *populated,
int page_start, int page_end)
for_each_possible_cpu(cpu) {
for (i = page_start; i < page_end; i++) {
struct page *page;
page = pcpu_chunk_page(chunk, cpu, i);
WARN_ON(!page);
pages[pcpu_page_idx(cpu, i)] = page;
}
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, cpu, page_start),
page_end - page_start);
}
for (i = page_start; i < page_end; i++)
__clear_bit(i, populated);
}
/**
 * pcpu_post_unmap_tlb_flush - flush TLB after unmapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been unmapped.  Flush
 * TLB for the regions.  This can be skipped if the area is to be
 * returned to vmalloc as vmalloc will handle TLB flushing lazily.
 *
 * As with pcpu_pre_unmap_flush(), TLB flushing also is done at once
 * for the whole region.
 */
static void pcpu_post_unmap_tlb_flush(struct pcpu_chunk *chunk,
				      int page_start, int page_end)
{
	flush_tlb_kernel_range(
		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
}
/*
 * Map @nr_pages pages from @pages at @addr with PAGE_KERNEL protection
 * without flushing; returns whatever map_kernel_range_noflush() returns.
 */
static int __pcpu_map_pages(unsigned long addr, struct page **pages,
			    int nr_pages)
{
	int nr_bytes = nr_pages << PAGE_SHIFT;

	return map_kernel_range_noflush(addr, nr_bytes, PAGE_KERNEL, pages);
}
/**
* pcpu_map_pages - map pages into a pcpu_chunk
* @pages: pages array containing pages to be mapped
* @populated: populated bitmap
* @page_start: page index of the first page to map
* @page_end: page index of the last page to map + 1
*
* For each cpu, map pages [@page_start,@page_end) into @chunk. The
* caller is responsible for calling pcpu_post_map_flush() after all
* mappings are complete.
*
* This function is responsible for setting corresponding bits in
* @chunk->populated bitmap and whatever is necessary for reverse
* lookup (addr -> chunk).
static int pcpu_map_pages(struct pcpu_chunk *chunk,
struct page **pages, unsigned long *populated,
int page_start, int page_end)
for_each_possible_cpu(cpu) {
err = __pcpu_map_pages(pcpu_chunk_addr(chunk, cpu, page_start),
page_end - page_start);
if (err < 0)
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
/* mapping successful, link chunk and mark populated */
for (i = page_start; i < page_end; i++) {
for_each_possible_cpu(cpu)
pcpu_set_page_chunk(pages[pcpu_page_idx(cpu, i)],
chunk);
__set_bit(i, populated);
}
return 0;
err:
for_each_possible_cpu(tcpu) {
if (tcpu == cpu)
break;
__pcpu_unmap_pages(pcpu_chunk_addr(chunk, tcpu, page_start),
page_end - page_start);
}
return err;
}
/**
 * pcpu_post_map_flush - flush cache after mapping
 * @chunk: pcpu_chunk the regions to be flushed belong to
 * @page_start: page index of the first page to be flushed
 * @page_end: page index of the last page to be flushed + 1
 *
 * Pages [@page_start,@page_end) of @chunk have been mapped.  Flush
 * cache.
 *
 * As with pcpu_pre_unmap_flush(), the cache flush is done at once
 * for the whole region.
 */
static void pcpu_post_map_flush(struct pcpu_chunk *chunk,
				int page_start, int page_end)
{
	flush_cache_vmap(
		pcpu_chunk_addr(chunk, pcpu_first_unit_cpu, page_start),
		pcpu_chunk_addr(chunk, pcpu_last_unit_cpu, page_end));
}
/**
* pcpu_depopulate_chunk - depopulate and unmap an area of a pcpu_chunk
* @chunk: chunk to depopulate
* @off: offset to the area to depopulate
* @size: size of the area to depopulate in bytes
* @flush: whether to flush cache and tlb or not
*
* For each cpu, depopulate and unmap pages [@page_start,@page_end)
* from @chunk. If @flush is true, vcache is flushed before unmapping
* and tlb after.
*
* CONTEXT:
* pcpu_alloc_mutex.
static void pcpu_depopulate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
struct page **pages;
unsigned long *populated;
int rs, re;
/* quick path, check whether it's empty already */
pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
if (rs == page_start && re == page_end)
return;
break;
}
/* immutable chunks can't be depopulated */
WARN_ON(chunk->immutable);
/*
* If control reaches here, there must have been at least one
* successful population attempt so the temp pages array must
* be available now.
*/
pages = pcpu_get_pages_and_bitmap(chunk, &populated, false);
BUG_ON(!pages);
/* unmap and free */
pcpu_pre_unmap_flush(chunk, page_start, page_end);
pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
pcpu_unmap_pages(chunk, pages, populated, rs, re);
/* no need to flush tlb, vmalloc will handle it lazily */
pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end)
pcpu_free_pages(chunk, pages, populated, rs, re);
/* commit new bitmap */
bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
}
/**
* pcpu_populate_chunk - populate and map an area of a pcpu_chunk
* @chunk: chunk of interest
* @off: offset to the area to populate
*
* For each cpu, populate and map pages [@page_start,@page_end) into
* @chunk. The area is cleared on return.
*
* CONTEXT:
* pcpu_alloc_mutex, does GFP_KERNEL allocation.
*/
static int pcpu_populate_chunk(struct pcpu_chunk *chunk, int off, int size)
{
int page_start = PFN_DOWN(off);
int page_end = PFN_UP(off + size);
int free_end = page_start, unmap_end = page_start;
struct page **pages;
unsigned long *populated;
/* quick path, check whether all pages are already there */
pcpu_for_each_pop_region(chunk, rs, re, page_start, page_end) {
if (rs == page_start && re == page_end)
goto clear;
break;
}
/* need to allocate and map pages, this chunk can't be immutable */
WARN_ON(chunk->immutable);
pages = pcpu_get_pages_and_bitmap(chunk, &populated, true);
if (!pages)
return -ENOMEM;
/* alloc and map */
pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
rc = pcpu_alloc_pages(chunk, pages, populated, rs, re);
if (rc)
goto err_free;
free_end = re;
pcpu_for_each_unpop_region(chunk, rs, re, page_start, page_end) {
rc = pcpu_map_pages(chunk, pages, populated, rs, re);
if (rc)
goto err_unmap;
unmap_end = re;
}
pcpu_post_map_flush(chunk, page_start, page_end);
/* commit new bitmap */
bitmap_copy(chunk->populated, populated, pcpu_unit_pages);
clear:
memset((void *)pcpu_chunk_addr(chunk, cpu, 0) + off, 0, size);
err_unmap:
pcpu_pre_unmap_flush(chunk, page_start, unmap_end);
pcpu_for_each_unpop_region(chunk, rs, re, page_start, unmap_end)
pcpu_unmap_pages(chunk, pages, populated, rs, re);
pcpu_post_unmap_tlb_flush(chunk, page_start, unmap_end);
err_free:
pcpu_for_each_unpop_region(chunk, rs, re, page_start, free_end)
pcpu_free_pages(chunk, pages, populated, rs, re);
return rc;
}
static void free_pcpu_chunk(struct pcpu_chunk *chunk)
{
if (!chunk)
return;
if (chunk->vm)
free_vm_area(chunk->vm);
pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0]));