percpu.c

 * it can also return pages after the static area.  NULL return
 * indicates end of pages for the cpu.  Note that @get_page_fn() must
 * return the same number of pages for all cpus.
 *
 * @reserved_size, if non-zero, specifies the amount of bytes to
 * reserve after the static area in the first chunk.  This reserves
 * the first chunk such that it's available only through reserved
 * percpu allocation.  This is primarily used to serve module percpu
 * static areas on architectures where the addressing model has
 * limited offset range for symbol relocations to guarantee module
 * percpu symbols fall inside the relocatable range.
 *
 * @dyn_size, if non-negative, determines the number of bytes
 * available for dynamic allocation in the first chunk.  Specifying
 * non-negative value makes percpu leave alone the area beyond
 * @static_size + @reserved_size + @dyn_size.
 *
 * @unit_size, if non-negative, specifies unit size and must be
 * aligned to PAGE_SIZE and equal to or larger than @static_size +
 * @reserved_size + if non-negative, @dyn_size.
 *
 * Non-null @base_addr means that the caller already allocated virtual
 * region for the first chunk and mapped it.  percpu must not mess
 * with the chunk.  Note that @base_addr with 0 @unit_size or non-NULL
 * @populate_pte_fn doesn't make any sense.
 *
 * @populate_pte_fn is used to populate the pagetable.  NULL means the
 * caller already populated the pagetable.
 *
 * If the first chunk ends up with both reserved and dynamic areas, it
 * is served by two chunks - one to serve the core static and reserved
 * areas and the other for the dynamic area.  They share the same vm
 * and page map but uses different area allocation map to stay away
 * from each other.  The latter chunk is circulated in the chunk slots
 * and available for dynamic allocation like any other chunks.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access.
 */
size_t __init pcpu_setup_first_chunk(pcpu_get_page_fn_t get_page_fn,
				     size_t static_size, size_t reserved_size,
				     ssize_t dyn_size, ssize_t unit_size,
				     void *base_addr,
				     pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct first_vm;
	static int smap[2], dmap[2];
	size_t size_sum = static_size + reserved_size +
			  (dyn_size >= 0 ? dyn_size : 0);
	struct pcpu_chunk *schunk, *dchunk = NULL;
	unsigned int cpu;
	int nr_pages;
	int err, i;

	/* santiy checks */
	BUILD_BUG_ON(ARRAY_SIZE(smap) >= PCPU_DFL_MAP_ALLOC ||
		     ARRAY_SIZE(dmap) >= PCPU_DFL_MAP_ALLOC);
	BUG_ON(!static_size);
	if (unit_size >= 0) {
		BUG_ON(unit_size < size_sum);
		BUG_ON(unit_size & ~PAGE_MASK);
		BUG_ON(unit_size < PCPU_MIN_UNIT_SIZE);
	} else
		BUG_ON(base_addr);
	BUG_ON(base_addr && populate_pte_fn);

	if (unit_size >= 0)
		pcpu_unit_pages = unit_size >> PAGE_SHIFT;
	else
		pcpu_unit_pages = max_t(int, PCPU_MIN_UNIT_SIZE >> PAGE_SHIFT,
					PFN_UP(size_sum));

	pcpu_unit_size = pcpu_unit_pages << PAGE_SHIFT;
	pcpu_chunk_size = num_possible_cpus() * pcpu_unit_size;
	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk)
		+ num_possible_cpus() * pcpu_unit_pages * sizeof(struct page *);

	if (dyn_size < 0)
		dyn_size = pcpu_unit_size - static_size - reserved_size;

	/*
	 * Allocate chunk slots.  The additional last slot is for
	 * empty chunks.
	 */
	pcpu_nr_slots = __pcpu_size_to_slot(pcpu_unit_size) + 2;
	pcpu_slot = alloc_bootmem(pcpu_nr_slots * sizeof(pcpu_slot[0]));
	for (i = 0; i < pcpu_nr_slots; i++)
		INIT_LIST_HEAD(&pcpu_slot[i]);

	/*
	 * Initialize static chunk.  If reserved_size is zero, the
	 * static chunk covers static area + dynamic allocation area
	 * in the first chunk.  If reserved_size is not zero, it
	 * covers static area + reserved area (mostly used for module
	 * static percpu allocation).
	 */
	schunk = alloc_bootmem(pcpu_chunk_struct_size);
	INIT_LIST_HEAD(&schunk->list);
	schunk->vm = &first_vm;
	schunk->map = smap;
	schunk->map_alloc = ARRAY_SIZE(smap);
	schunk->page = schunk->page_ar;

	if (reserved_size) {
		schunk->free_size = reserved_size;
		pcpu_reserved_chunk = schunk;
		pcpu_reserved_chunk_limit = static_size + reserved_size;
	} else {
		schunk->free_size = dyn_size;
		dyn_size = 0;			/* dynamic area covered */
	}
	schunk->contig_hint = schunk->free_size;

	schunk->map[schunk->map_used++] = -static_size;
	if (schunk->free_size)
		schunk->map[schunk->map_used++] = schunk->free_size;

	/* init dynamic chunk if necessary */
	if (dyn_size) {
		dchunk = alloc_bootmem(sizeof(struct pcpu_chunk));
		INIT_LIST_HEAD(&dchunk->list);
		dchunk->vm = &first_vm;
		dchunk->map = dmap;
		dchunk->map_alloc = ARRAY_SIZE(dmap);
		dchunk->page = schunk->page_ar;	/* share page map with schunk */

		dchunk->contig_hint = dchunk->free_size = dyn_size;
		dchunk->map[dchunk->map_used++] = -pcpu_reserved_chunk_limit;
		dchunk->map[dchunk->map_used++] = dchunk->free_size;
	}

	/* allocate vm address */
	first_vm.flags = VM_ALLOC;
	first_vm.size = pcpu_chunk_size;

	if (!base_addr)
		vm_area_register_early(&first_vm, PAGE_SIZE);
	else {
		/*
		 * Pages already mapped.  No need to remap into
		 * vmalloc area.  In this case the first chunks can't
		 * be mapped or unmapped by percpu and are marked
		 * immutable.
		 */
		first_vm.addr = base_addr;
		schunk->immutable = true;
		if (dchunk)
			dchunk->immutable = true;
	}

	/* assign pages */
	nr_pages = -1;
	for_each_possible_cpu(cpu) {
		for (i = 0; i < pcpu_unit_pages; i++) {
			struct page *page = get_page_fn(cpu, i);

			if (!page)
				break;
			*pcpu_chunk_pagep(schunk, cpu, i) = page;
		}

		BUG_ON(i < PFN_UP(static_size));

		if (nr_pages < 0)
			nr_pages = i;
		else
			BUG_ON(nr_pages != i);
	}

	/* map them */
	if (populate_pte_fn) {
		for_each_possible_cpu(cpu)
			for (i = 0; i < nr_pages; i++)
				populate_pte_fn(pcpu_chunk_addr(schunk,
								cpu, i));

		err = pcpu_map(schunk, 0, nr_pages);
		if (err)
			panic("failed to setup static percpu area, err=%d\n",
			      err);
	}

	/* link the first chunk in */
	pcpu_first_chunk = dchunk ?: schunk;
	pcpu_chunk_relocate(pcpu_first_chunk, -1);

	/* we're done */
	pcpu_base_addr = (void *)pcpu_chunk_addr(schunk, 0, 0);
	return pcpu_unit_size;
}

static size_t pcpu_calc_fc_sizes(size_t static_size, size_t reserved_size,
				 ssize_t *dyn_sizep)
{
	size_t size_sum;

	size_sum = PFN_ALIGN(static_size + reserved_size +
			     (*dyn_sizep >= 0 ? *dyn_sizep : 0));
	if (*dyn_sizep != 0)
		*dyn_sizep = size_sum - static_size - reserved_size;

	return size_sum;
}

/*
 * Embedding first chunk setup helper.
 */
static void *pcpue_ptr __initdata;
static size_t pcpue_size __initdata;
static size_t pcpue_unit_size __initdata;

static struct page * __init pcpue_get_page(unsigned int cpu, int pageno)
{
	size_t off = (size_t)pageno << PAGE_SHIFT;

	if (off >= pcpue_size)
		return NULL;

	return virt_to_page(pcpue_ptr + cpu * pcpue_unit_size + off);
}

/**
 * pcpu_embed_first_chunk - embed the first percpu chunk into bootmem
 * @static_size: the size of static percpu area in bytes
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * If this function is used to setup the first chunk, it is allocated
 * as a contiguous area using bootmem allocator and used as-is without
 * being mapped into vmalloc area.  This enables the first chunk to
 * piggy back on the linear physical mapping which often uses larger
 * page size.
 *
 * When @dyn_size is positive, dynamic area might be larger than
 * specified to fill page alignment.  When @dyn_size is auto,
 * @dyn_size is just big enough to fill page alignment after static
 * and reserved areas.
 *
 * If the needed size is smaller than the minimum or specified unit
 * size, the leftover is returned to the bootmem allocator.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access on success, -errno on failure.
 */
ssize_t __init pcpu_embed_first_chunk(size_t static_size, size_t reserved_size,
				      ssize_t dyn_size)
{
	size_t chunk_size;
	unsigned int cpu;

	/* determine parameters and allocate */
	pcpue_size = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);

	pcpue_unit_size = max_t(size_t, pcpue_size, PCPU_MIN_UNIT_SIZE);
	chunk_size = pcpue_unit_size * num_possible_cpus();

	pcpue_ptr = __alloc_bootmem_nopanic(chunk_size, PAGE_SIZE,
					    __pa(MAX_DMA_ADDRESS));
	if (!pcpue_ptr) {
		pr_warning("PERCPU: failed to allocate %zu bytes for "
			   "embedding\n", chunk_size);
		return -ENOMEM;
	}

	/* return the leftover and copy */
	for_each_possible_cpu(cpu) {
		void *ptr = pcpue_ptr + cpu * pcpue_unit_size;

		free_bootmem(__pa(ptr + pcpue_size),
			     pcpue_unit_size - pcpue_size);
		memcpy(ptr, __per_cpu_load, static_size);
	}

	/* we're ready, commit */
	pr_info("PERCPU: Embedded %zu pages at %p, static data %zu bytes\n",
		pcpue_size >> PAGE_SHIFT, pcpue_ptr, static_size);

	return pcpu_setup_first_chunk(pcpue_get_page, static_size,
				      reserved_size, dyn_size,
				      pcpue_unit_size, pcpue_ptr, NULL);
}

/*
 * 4k page first chunk setup helper.
 */
static struct page **pcpu4k_pages __initdata;
static int pcpu4k_unit_pages __initdata;

static struct page * __init pcpu4k_get_page(unsigned int cpu, int pageno)
{
	if (pageno < pcpu4k_unit_pages)
		return pcpu4k_pages[cpu * pcpu4k_unit_pages + pageno];
	return NULL;
}

/**
 * pcpu_4k_first_chunk - map the first chunk using PAGE_SIZE pages
 * @static_size: the size of static percpu area in bytes
 * @reserved_size: the size of reserved percpu area in bytes
 * @alloc_fn: function to allocate percpu page, always called with PAGE_SIZE
 * @free_fn: funtion to free percpu page, always called with PAGE_SIZE
 * @populate_pte_fn: function to populate pte
 *
 * This is a helper to ease setting up embedded first percpu chunk and
 * can be called where pcpu_setup_first_chunk() is expected.
 *
 * This is the basic allocator.  Static percpu area is allocated
 * page-by-page into vmalloc area.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access on success, -errno on failure.
 */
ssize_t __init pcpu_4k_first_chunk(size_t static_size, size_t reserved_size,
				   pcpu_fc_alloc_fn_t alloc_fn,
				   pcpu_fc_free_fn_t free_fn,
				   pcpu_fc_populate_pte_fn_t populate_pte_fn)
{
	static struct vm_struct vm;
	size_t pages_size;
	unsigned int cpu;
	int i, j;
	ssize_t ret;

	pcpu4k_unit_pages = PFN_UP(max_t(size_t, static_size + reserved_size,
					 PCPU_MIN_UNIT_SIZE));

	/* unaligned allocations can't be freed, round up to page size */
	pages_size = PFN_ALIGN(pcpu4k_unit_pages * num_possible_cpus() *
			       sizeof(pcpu4k_pages[0]));
	pcpu4k_pages = alloc_bootmem(pages_size);

	/* allocate pages */
	j = 0;
	for_each_possible_cpu(cpu)
		for (i = 0; i < pcpu4k_unit_pages; i++) {
			void *ptr;

			ptr = alloc_fn(cpu, PAGE_SIZE);
			if (!ptr) {
				pr_warning("PERCPU: failed to allocate "
					   "4k page for cpu%u\n", cpu);
				goto enomem;
			}
			pcpu4k_pages[j++] = virt_to_page(ptr);
		}

	/* allocate vm area, map the pages and copy static data */
	vm.flags = VM_ALLOC;
	vm.size = num_possible_cpus() * pcpu4k_unit_pages << PAGE_SHIFT;
	vm_area_register_early(&vm, PAGE_SIZE);

	for_each_possible_cpu(cpu) {
		unsigned long unit_addr = (unsigned long)vm.addr +
			(cpu * pcpu4k_unit_pages << PAGE_SHIFT);

		for (i = 0; i < pcpu4k_unit_pages; i++)
			populate_pte_fn(unit_addr + (i << PAGE_SHIFT));

		/* pte already populated, the following shouldn't fail */
		ret = __pcpu_map_pages(unit_addr,
				       &pcpu4k_pages[cpu * pcpu4k_unit_pages],
				       pcpu4k_unit_pages);
		if (ret < 0)
			panic("failed to map percpu area, err=%zd\n", ret);

		/*
		 * FIXME: Archs with virtual cache should flush local
		 * cache for the linear mapping here - something
		 * equivalent to flush_cache_vmap() on the local cpu.
		 * flush_cache_vmap() can't be used as most supporting
		 * data structures are not set up yet.
		 */

		/* copy static data */
		memcpy((void *)unit_addr, __per_cpu_load, static_size);
	}

	/* we're ready, commit */
	pr_info("PERCPU: %d 4k pages per cpu, static data %zu bytes\n",
		pcpu4k_unit_pages, static_size);

	ret = pcpu_setup_first_chunk(pcpu4k_get_page, static_size,
				     reserved_size, -1,
				     pcpu4k_unit_pages << PAGE_SHIFT, vm.addr,
				     NULL);
	goto out_free_ar;

enomem:
	while (--j >= 0)
		free_fn(page_address(pcpu4k_pages[j]), PAGE_SIZE);
	ret = -ENOMEM;
out_free_ar:
	free_bootmem(__pa(pcpu4k_pages), pages_size);
	return ret;
}

/*
 * Large page remapping first chunk setup helper
 */
#ifdef CONFIG_NEED_MULTIPLE_NODES
struct pcpul_ent {
	unsigned int	cpu;
	void		*ptr;
};

static size_t pcpul_size;
static size_t pcpul_unit_size;
static struct pcpul_ent *pcpul_map;
static struct vm_struct pcpul_vm;

static struct page * __init pcpul_get_page(unsigned int cpu, int pageno)
{
	size_t off = (size_t)pageno << PAGE_SHIFT;

	if (off >= pcpul_size)
		return NULL;

	return virt_to_page(pcpul_map[cpu].ptr + off);
}

/**
 * pcpu_lpage_first_chunk - remap the first percpu chunk using large page
 * @static_size: the size of static percpu area in bytes
 * @reserved_size: the size of reserved percpu area in bytes
 * @dyn_size: free size for dynamic allocation in bytes, -1 for auto
 * @lpage_size: the size of a large page
 * @alloc_fn: function to allocate percpu lpage, always called with lpage_size
 * @free_fn: function to free percpu memory, @size <= lpage_size
 * @map_fn: function to map percpu lpage, always called with lpage_size
 *
 * This allocator uses large page as unit.  A large page is allocated
 * for each cpu and each is remapped into vmalloc area using large
 * page mapping.  As large page can be quite large, only part of it is
 * used for the first chunk.  Unused part is returned to the bootmem
 * allocator.
 *
 * So, the large pages are mapped twice - once to the physical mapping
 * and to the vmalloc area for the first percpu chunk.  The double
 * mapping does add one more large TLB entry pressure but still is
 * much better than only using 4k mappings while still being NUMA
 * friendly.
 *
 * RETURNS:
 * The determined pcpu_unit_size which can be used to initialize
 * percpu access on success, -errno on failure.
 */
ssize_t __init pcpu_lpage_first_chunk(size_t static_size, size_t reserved_size,
				      ssize_t dyn_size, size_t lpage_size,
				      pcpu_fc_alloc_fn_t alloc_fn,
				      pcpu_fc_free_fn_t free_fn,
				      pcpu_fc_map_fn_t map_fn)
{
	size_t size_sum;
	size_t map_size;
	unsigned int cpu;
	int i, j;
	ssize_t ret;

	/*
	 * Currently supports only single page.  Supporting multiple
	 * pages won't be too difficult if it ever becomes necessary.
	 */
	size_sum = pcpu_calc_fc_sizes(static_size, reserved_size, &dyn_size);

	pcpul_unit_size = lpage_size;
	pcpul_size = max_t(size_t, size_sum, PCPU_MIN_UNIT_SIZE);
	if (pcpul_size > pcpul_unit_size) {
		pr_warning("PERCPU: static data is larger than large page, "
			   "can't use large page\n");
		return -EINVAL;
	}

	/* allocate pointer array and alloc large pages */
	map_size = PFN_ALIGN(num_possible_cpus() * sizeof(pcpul_map[0]));
	pcpul_map = alloc_bootmem(map_size);

	for_each_possible_cpu(cpu) {
		void *ptr;

		ptr = alloc_fn(cpu, lpage_size);
		if (!ptr) {
			pr_warning("PERCPU: failed to allocate large page "
				   "for cpu%u\n", cpu);
			goto enomem;
		}

		/*
		 * Only use pcpul_size bytes and give back the rest.
		 *
		 * Ingo: The lpage_size up-rounding bootmem is needed
		 * to make sure the partial lpage is still fully RAM -
		 * it's not well-specified to have a incompatible area
		 * (unmapped RAM, device memory, etc.) in that hole.
		 */
		free_fn(ptr + pcpul_size, lpage_size - pcpul_size);

		pcpul_map[cpu].cpu = cpu;
		pcpul_map[cpu].ptr = ptr;

		memcpy(ptr, __per_cpu_load, static_size);
	}

	/* allocate address and map */
	pcpul_vm.flags = VM_ALLOC;
	pcpul_vm.size = num_possible_cpus() * pcpul_unit_size;
	vm_area_register_early(&pcpul_vm, pcpul_unit_size);

	for_each_possible_cpu(cpu)
		map_fn(pcpul_map[cpu].ptr, pcpul_unit_size,
		       pcpul_vm.addr + cpu * pcpul_unit_size);

	/* we're ready, commit */
	pr_info("PERCPU: Remapped at %p with large pages, static data "
		"%zu bytes\n", pcpul_vm.addr, static_size);

	ret = pcpu_setup_first_chunk(pcpul_get_page, static_size,
				     reserved_size, dyn_size, pcpul_unit_size,
				     pcpul_vm.addr, NULL);

	/* sort pcpul_map array for pcpu_lpage_remapped() */
	for (i = 0; i < num_possible_cpus() - 1; i++)
		for (j = i + 1; j < num_possible_cpus(); j++)
			if (pcpul_map[i].ptr > pcpul_map[j].ptr) {
				struct pcpul_ent tmp = pcpul_map[i];
				pcpul_map[i] = pcpul_map[j];
				pcpul_map[j] = tmp;
			}

	return ret;

enomem:
	for_each_possible_cpu(cpu)
		if (pcpul_map[cpu].ptr)
			free_fn(pcpul_map[cpu].ptr, pcpul_size);
	free_bootmem(__pa(pcpul_map), map_size);
	return -ENOMEM;
}

/**
 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 * @kaddr: the kernel address in question
 *
 * Determine whether @kaddr falls in the pcpul recycled area.  This is
 * used by pageattr to detect VM aliases and break up the pcpu large
 * page mapping such that the same physical page is not mapped under
 * different attributes.
 *
 * The recycled area is always at the tail of a partially used large
 * page.
 *
 * RETURNS:
 * Address of corresponding remapped pcpu address if match is found;
 * otherwise, NULL.
 */
void *pcpu_lpage_remapped(void *kaddr)
{
	unsigned long unit_mask = pcpul_unit_size - 1;
	void *lpage_addr = (void *)((unsigned long)kaddr & ~unit_mask);
	unsigned long offset = (unsigned long)kaddr & unit_mask;
	int left = 0, right = num_possible_cpus() - 1;
	int pos;

	/* pcpul in use at all? */
	if (!pcpul_map)
		return NULL;

	/* okay, perform binary search */
	while (left <= right) {
		pos = (left + right) / 2;

		if (pcpul_map[pos].ptr < lpage_addr)
			left = pos + 1;
		else if (pcpul_map[pos].ptr > lpage_addr)
			right = pos - 1;
		else {
			/* it shouldn't be in the area for the first chunk */
			WARN_ON(offset < pcpul_size);

			return pcpul_vm.addr +
				pcpul_map[pos].cpu * pcpul_unit_size + offset;
		}
	}

	return NULL;
}
#endif

/*
 * Generic percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because percpu area can piggy back
 * on the physical linear memory mapping which uses large page
 * mappings on applicable archs.
 */
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);

void __init setup_per_cpu_areas(void)
{
	size_t static_size = __per_cpu_end - __per_cpu_start;
	ssize_t unit_size;
	unsigned long delta;
	unsigned int cpu;

	/*
	 * Always reserve area for module percpu variables.  That's
	 * what the legacy allocator did.
	 */
	unit_size = pcpu_embed_first_chunk(static_size, PERCPU_MODULE_RESERVE,
					   PERCPU_DYNAMIC_RESERVE);
	if (unit_size < 0)
		panic("Failed to initialized percpu areas.");

	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + cpu * unit_size;
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */