	static struct vm_struct vm;
	const size_t lpage_size = ai->atom_size;
	size_t chunk_size, map_size;
	unsigned int cpu;
	int i, j, unit, nr_units, rc;

	nr_units = 0;
	for (i = 0; i < ai->nr_groups; i++)
		nr_units += ai->groups[i].nr_units;

	chunk_size = ai->unit_size * nr_units;
	BUG_ON(chunk_size % lpage_size);	/* chunk must cover whole lpages */

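	/*
	 * Record the first chunk geometry: pcpul_size is the part of each
	 * unit actually used (static + reserved + dynamic); the unused
	 * remainder of every unit is handed back further below.
	 */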
	pcpul_size = ai->static_size + ai->reserved_size + ai->dyn_size;
	pcpul_lpage_size = lpage_size;
	pcpul_nr_lpages = chunk_size / lpage_size;

	/* allocate pointer array and alloc large pages */
	map_size = pcpul_nr_lpages * sizeof(pcpul_map[0]);
	pcpul_map = alloc_bootmem(map_size);

	/*
	 * Allocate all large pages.  An lpage is only allocated if at
	 * least one in-use unit overlaps it; the allocation is made on
	 * behalf of the first such cpu so alloc_fn can honour that cpu's
	 * locality.
	 */
	for (i = 0; i < pcpul_nr_lpages; i++) {
		size_t offset = i * lpage_size;
		int first_unit = offset / ai->unit_size;
		int last_unit = (offset + lpage_size - 1) / ai->unit_size;
		void *ptr;

		/* find out which cpu, if any, is mapped to a unit in this lpage */
		for (unit = first_unit; unit <= last_unit; unit++)
			if (pcpul_unit_to_cpu(unit, ai, &cpu))
				goto found;
		continue;	/* no cpu uses this lpage, leave it unallocated */
	found:
		ptr = alloc_fn(cpu, lpage_size, lpage_size);
		if (!ptr) {
			pr_warning("PERCPU: failed to allocate large page "
				   "for cpu%u\n", cpu);
			goto enomem;
		}

		pcpul_map[i].ptr = ptr;
	}

	/*
	 * Return unused holes: each in-use unit needs only pcpul_size
	 * bytes of its unit_size slot; hand the unused tail (and wholly
	 * unused units) back through free_fn.
	 */
	for (unit = 0; unit < nr_units; unit++) {
		size_t start = unit * ai->unit_size;
		size_t end = start + ai->unit_size;
		size_t off, next;

		/* don't free used part of occupied unit */
		if (pcpul_unit_to_cpu(unit, ai, NULL))
			start += pcpul_size;

		/* unit can span more than one page, punch the holes */
		for (off = start; off < end; off = next) {
			void *ptr = pcpul_map[off / lpage_size].ptr;
			/* stop at the next lpage boundary or the end of the unit */
			next = min(roundup(off + 1, lpage_size), end);
			if (ptr)
				free_fn(ptr + off % lpage_size, next - off);
		}
	}

	/* allocate address, map and copy */
	vm.flags = VM_ALLOC;
	vm.size = chunk_size;
	vm_area_register_early(&vm, ai->unit_size);

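	/*
	 * vm_area_register_early() reserved chunk_size bytes of virtual
	 * address space above; map every allocated lpage at its slot in
	 * that area via the arch-supplied map_fn.
	 */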
	for (i = 0; i < pcpul_nr_lpages; i++) {
		if (!pcpul_map[i].ptr)
			continue;
		pcpul_map[i].map_addr = vm.addr + i * lpage_size;
		map_fn(pcpul_map[i].ptr, lpage_size, pcpul_map[i].map_addr);
	}

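	/* copy the static percpu section into each cpu's unit */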
	for_each_possible_cpu(cpu)
		memcpy(vm.addr + pcpul_cpu_to_unit(cpu, ai) * ai->unit_size,
		       __per_cpu_load, ai->static_size);

	/* we're ready, commit */
	pr_info("PERCPU: large pages @%p s%zu r%zu d%zu u%zu\n",
		vm.addr, ai->static_size, ai->reserved_size, ai->dyn_size,
		ai->unit_size);

	rc = pcpu_setup_first_chunk(ai, vm.addr);

	/*
	 * Sort pcpul_map array for pcpu_lpage_remapped().  Unmapped
	 * lpages are pushed to the end and trimmed.
	 */
	for (i = 0; i < pcpul_nr_lpages - 1; i++)
		for (j = i + 1; j < pcpul_nr_lpages; j++) {
			struct pcpul_ent tmp;

			if (!pcpul_map[j].ptr)
				continue;
			if (pcpul_map[i].ptr &&
			    pcpul_map[i].ptr < pcpul_map[j].ptr)
				continue;

			tmp = pcpul_map[i];
			pcpul_map[i] = pcpul_map[j];
			pcpul_map[j] = tmp;
		}

	while (pcpul_nr_lpages && !pcpul_map[pcpul_nr_lpages - 1].ptr)
		pcpul_nr_lpages--;

	return rc;

enomem:
	for (i = 0; i < pcpul_nr_lpages; i++)
		if (pcpul_map[i].ptr)
			free_fn(pcpul_map[i].ptr, lpage_size);
	free_bootmem(__pa(pcpul_map), map_size);
	return -ENOMEM;
}
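/*
 * Note: alloc_fn, free_fn and map_fn above are supplied by the arch
 * caller (e.g. x86's setup_pcpu_lpage()); alloc_fn is expected to return
 * lpage_size bytes local to @cpu, and map_fn to install the large page
 * mapping at the given target address.
 */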

/**
 * pcpu_lpage_remapped - determine whether a kaddr is in pcpul recycled area
 * @kaddr: the kernel address in question
 *
 * Determine whether @kaddr falls in the pcpul recycled area.  This is
 * used by pageattr to detect VM aliases and break up the pcpu large
 * page mapping such that the same physical page is not mapped under
 * different attributes.
 *
 * The recycled area is always at the tail of a partially used large
 * page.
 *
 * RETURNS:
 * The corresponding remapped pcpu address if a match is found;
 * otherwise, NULL.
 */
void *pcpu_lpage_remapped(void *kaddr)
{
	unsigned long lpage_mask = pcpul_lpage_size - 1;
	void *lpage_addr = (void *)((unsigned long)kaddr & ~lpage_mask);
	unsigned long offset = (unsigned long)kaddr & lpage_mask;
	int left = 0, right = pcpul_nr_lpages - 1;
	int pos;

	/* pcpul in use at all? */
	if (!pcpul_map)
		return NULL;

	/*
	 * pcpul_map[] was sorted by ->ptr during first chunk setup, so a
	 * binary search on the lpage base address is sufficient.
	 */
	while (left <= right) {
		pos = (left + right) / 2;

		if (pcpul_map[pos].ptr < lpage_addr)
			left = pos + 1;
		else if (pcpul_map[pos].ptr > lpage_addr)
			right = pos - 1;
		else
			return pcpul_map[pos].map_addr + offset;
	}

	return NULL;
}
#endif /* CONFIG_NEED_PER_CPU_LPAGE_FIRST_CHUNK */

/*
 * Generic percpu area setup.
 *
 * The embedding helper is used because its behavior closely resembles
 * the original non-dynamic generic percpu area setup.  This is
 * important because many archs have addressing restrictions and might
 * fail if the percpu area is located far away from the previous
 * location.  As an added bonus, in non-NUMA cases, embedding is
 * generally a good idea TLB-wise because the percpu area can piggyback
 * on the physical linear memory mapping, which uses large page
 * mappings on applicable archs.
 */
#ifndef CONFIG_HAVE_SETUP_PER_CPU_AREA
unsigned long __per_cpu_offset[NR_CPUS] __read_mostly;
EXPORT_SYMBOL(__per_cpu_offset);
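/*
 * For reference, a rough sketch of how these offsets are consumed by the
 * generic accessors (arch details vary): &per_cpu(var, cpu) is
 * effectively
 *
 *	(typeof(var) *)((char *)&var + __per_cpu_offset[cpu])
 *
 * so the value computed below must translate a static percpu symbol to
 * that cpu's copy inside the first chunk.
 */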

void __init setup_per_cpu_areas(void)
{
	unsigned long delta;
	unsigned int cpu;
	int rc;

	/*
	 * Always reserve area for module percpu variables.  That's
	 * what the legacy allocator did.
	 */
	rc = pcpu_embed_first_chunk(PERCPU_MODULE_RESERVE,
				    PERCPU_DYNAMIC_RESERVE);
	if (rc < 0)
		panic("Failed to initialize percpu areas.");

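	/*
	 * pcpu_base_addr is where the first chunk actually ended up;
	 * delta converts a link-time address inside the static percpu
	 * section into that chunk, and each cpu then adds its own unit
	 * offset.
	 */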
	delta = (unsigned long)pcpu_base_addr - (unsigned long)__per_cpu_start;
	for_each_possible_cpu(cpu)
		__per_cpu_offset[cpu] = delta + pcpu_unit_offsets[cpu];
}
#endif /* CONFIG_HAVE_SETUP_PER_CPU_AREA */