			 */
			if (node_ar.nid == nid) {
				dbg("reserve_bootmem %lx %lx nid=%d\n",
					physbase, reserve_size, node_ar.nid);
				reserve_bootmem_node(NODE_DATA(node_ar.nid),
						physbase, reserve_size,
						BOOTMEM_DEFAULT);
			}
			/*
			 * if reserved region is contained in the active region
			 * then done.
			 */
			if (end_pfn <= node_ar.end_pfn)
				break;

			/*
			 * reserved region extends past the active region
			 *   get next active region that contains this
			 *   reserved region
			 */
			start_pfn = node_ar.end_pfn;
			physbase = start_pfn << PAGE_SHIFT;
			size = size - reserve_size;
			get_node_active_region(start_pfn, &node_ar);
		}
	}
}


void __init do_init_bootmem(void)
{
	int nid;

	min_low_pfn = 0;
	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_memory_topology();

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		void *bootmem_vaddr;
		unsigned long bootmap_pages;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

		/*
		 * Allocate the node structure node local if possible
		 *
		 * Be careful moving this around, as it relies on all
		 * previous nodes' bootmem to be initialized and have
		 * all reserved areas marked.
		 */
		NODE_DATA(nid) = careful_zallocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_pfn);

  		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
		NODE_DATA(nid)->node_start_pfn = start_pfn;
		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;

		if (NODE_DATA(nid)->node_spanned_pages == 0)
  			continue;

  		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
  		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);

		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
		bootmem_vaddr = careful_zallocation(nid,
					bootmap_pages << PAGE_SHIFT,
					PAGE_SIZE, end_pfn);

		dbg("bootmap_vaddr = %p\n", bootmem_vaddr);

		init_bootmem_node(NODE_DATA(nid),
				  __pa(bootmem_vaddr) >> PAGE_SHIFT,
				  start_pfn, end_pfn);

		free_bootmem_with_active_regions(nid, end_pfn);
		/*
		 * Be very careful about moving this around.  Future
		 * calls to careful_zallocation() depend on this getting
		 * done correctly.
		 */
		mark_reserved_regions_for_nid(nid);
		sparse_memory_present_with_active_regions(nid);
	}

	/*
	 * Now that bootmem is initialised we can create the node to cpumask
	 * lookup tables and setup the cpu callback to populate them.
	 */
	setup_node_to_cpumask_map();

	register_cpu_notifier(&ppc64_numa_nb);
	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
			  (void *)(unsigned long)boot_cpuid);
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
	free_area_init_nodes(max_zone_pfns);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
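/*
 * Illustrative layout sketch (editor's note, values assumed; see
 * read_drconf_cell() for the authoritative format): ibm,dynamic-memory
 * begins with one cell holding the number of LMB entries, followed by
 * one entry per LMB, e.g.
 *
 *	base_addr = 0x0000000010000000	(two cells)
 *	drc_index = 0x80000002		(one cell)
 *	reserved  = 0x00000000		(one cell)
 *	aa_index  = 0x00000001		(one cell)
 *	flags     = DRCONF_MEM_ASSIGNED	(one cell)
 *
 * A hot added section at scn_addr matches this LMB when
 * base_addr <= scn_addr < base_addr + lmb_size.
 */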
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
				     unsigned long scn_addr)
{
	const u32 *dm;
	unsigned int drconf_cell_cnt, rc;
	unsigned long lmb_size;
	struct assoc_arrays aa;
	int nid = -1;

	drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
	if (!drconf_cell_cnt)
		return -1;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return -1;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return -1;

	for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < drmem.base_addr)
		    || (scn_addr >= (drmem.base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(&drmem, &aa);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
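/*
 * Worked example (editor's illustration, addresses assumed): with
 * n_mem_addr_cells = 2 and n_mem_size_cells = 2, a node
 *
 *	memory@100000000 {
 *		device_type = "memory";
 *		reg = <0x1 0x00000000 0x0 0x20000000>;
 *	};
 *
 * describes 512MB starting at 4GB, so a hot added section whose
 * scn_addr lies in [0x100000000, 0x120000000) resolves to this node's
 * nid via of_node_to_nid_single().
 */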
int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid = -1;

	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start, size;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}
		of_node_put(memory);
		if (nid >= 0)
			break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section.  Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK.  It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 */
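/*
 * For instance (sizes assumed): with 16MB SPARSEMEM sections, a 256MB
 * memblock spans 16 sections and every section lies entirely inside one
 * memblock, which is why matching scn_addr against whole memblocks/LMBs
 * above is sufficient.
 */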
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid, found = 0;

	if (!numa_enabled || (min_common_depth < 0))
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_online(nid))
		nid = first_online_node;
	if (NODE_DATA(nid)->node_spanned_pages)
		return nid;
	for_each_online_node(nid) {
		if (NODE_DATA(nid)->node_spanned_pages) {
			found = 1;
			break;
		}
	}

	BUG_ON(!found);
	return nid;
}

static u64 hot_add_drconf_memory_max(void)
{
        struct device_node *memory = NULL;
        unsigned int drconf_cell_cnt = 0;
        u64 lmb_size = 0;
        const u32 *dm = 0;

        memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
        if (memory) {
                drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
                lmb_size = of_get_lmb_size(memory);
                of_node_put(memory);
        }
        return lmb_size * drconf_cell_cnt;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * This is currently only used on systems that support drconfig memory
 * hotplug.
 */
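/*
 * E.g. (numbers assumed): with 16 drconf LMBs of 256MB each the drconf
 * ceiling is 4GB; if memblock_end_of_DRAM() reports only 2GB currently
 * populated, the max() below still returns 4GB so the full hotplug
 * window is usable.
 */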
u64 memory_hotplug_max(void)
{
        return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
static cpumask_t cpu_associativity_changes_mask;
static int vphn_enabled;
static void set_topology_timer(void);

/*
 * Store the current values of the associativity change counters
 * provided by the hypervisor.
 */
static void setup_cpu_associativity_change_counters(void)
{
	int cpu;

	/* The VPHN feature supports a maximum of 8 reference points */
	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);

	for_each_possible_cpu(cpu) {
		int i;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++)
			counts[i] = hypervisor_counts[i];
	}
}

/*
 * The hypervisor maintains a set of 8 associativity change counters in
 * the VPA of each cpu that correspond to the associativity levels in the
 * ibm,associativity-reference-points property. When an associativity
 * level changes, the corresponding counter is incremented.
 *
 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
 * node associativity levels have changed.
 *
 * Returns the number of cpus with unhandled associativity changes.
 */
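/*
 * Example (editor's illustration): if the hypervisor bumps the level-2
 * counter for cpu 5 from 3 to 4, vphn_cpu_change_counts[5][2] no longer
 * matches, so cpu 5's bit is set in cpu_associativity_changes_mask and
 * it is counted in the returned total.
 */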
static int update_cpu_associativity_changes_mask(void)
{
	int cpu, nr_cpus = 0;
	cpumask_t *changes = &cpu_associativity_changes_mask;

	cpumask_clear(changes);

	for_each_possible_cpu(cpu) {
		int i, changed = 0;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++) {
			if (hypervisor_counts[i] != counts[i]) {
				counts[i] = hypervisor_counts[i];
				changed = 1;
			}
		}
		if (changed) {
			cpumask_set_cpu(cpu, changes);
			nr_cpus++;
		}
	}

	return nr_cpus;
}

/*
 * 6 64-bit registers unpacked into 12 32-bit associativity values. To form
 * the complete property we have to add the length in the first cell.
 */
#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1)
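/*
 * Packed format sketch (editor's illustration of the decode below):
 * read the returned registers as a stream of 16-bit fields, e.g.
 *
 *	0x8001 0x8000 0x8003 0x8002 0xffff ...
 *
 * A field with VPHN_FIELD_MSB set carries one 15-bit domain number
 * (here 1, 0, 3, 2); a field without the MSB is concatenated with the
 * following 16 bits to form one 32-bit value; 0xffff marks the unused
 * remainder of the buffer.
 */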

/*
 * Convert the associativity domain numbers returned from the hypervisor
 * to the sequence they would appear in the ibm,associativity property.
 */
static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
{
	int i, nr_assoc_doms = 0;
	const u16 *field = (const u16*) packed;

#define VPHN_FIELD_UNUSED	(0xffff)
#define VPHN_FIELD_MSB		(0x8000)
#define VPHN_FIELD_MASK		(~VPHN_FIELD_MSB)

	for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
		if (*field == VPHN_FIELD_UNUSED) {
			/* All significant fields processed, and remaining
			 * fields contain the reserved value of all 1's.
			 * Just store them.
			 */
			unpacked[i] = *((u32*)field);
			field += 2;
		} else if (*field & VPHN_FIELD_MSB) {
			/* Data is in the lower 15 bits of this field */
			unpacked[i] = *field & VPHN_FIELD_MASK;
			field++;
			nr_assoc_doms++;
		} else {
			/* Data is in the lower 15 bits of this field
			 * concatenated with the next 16 bit field
			 */
			unpacked[i] = *((u32*)field);
			field += 2;
			nr_assoc_doms++;
		}
	}

	/* The first cell contains the length of the property */
	unpacked[0] = nr_assoc_doms;

	return nr_assoc_doms;
}

/*
 * Retrieve the new associativity information for a virtual processor's
 * home node.
 */
static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
{
	long rc;
	long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
	u64 flags = 1;
	int hwcpu = get_hard_smp_processor_id(cpu);

	rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
	vphn_unpack_associativity(retbuf, associativity);

	return rc;
}

static long vphn_get_associativity(unsigned long cpu,
					unsigned int *associativity)
{
	long rc;

	rc = hcall_vphn(cpu, associativity);

	switch (rc) {
	case H_FUNCTION:
		printk(KERN_INFO
			"VPHN is not supported. Disabling polling...\n");
		stop_topology_update();
		break;
	case H_HARDWARE:
		printk(KERN_ERR
			"hcall_vphn() experienced a hardware fault "
			"preventing VPHN. Disabling polling...\n");
		stop_topology_update();
	}

	return rc;
}

/*
 * Update the node maps and sysfs entries for each cpu whose home node
 * has changed.
 */
int arch_update_cpu_topology(void)
{
	int cpu, nid, old_nid;
	unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
	struct sys_device *sysdev;

	for_each_cpu_mask(cpu, cpu_associativity_changes_mask) {
		vphn_get_associativity(cpu, associativity);
		nid = associativity_to_nid(associativity);

		if (nid < 0 || !node_online(nid))
			nid = first_online_node;

		old_nid = numa_cpu_lookup_table[cpu];

		/* Disable hotplug while we update the cpu
		 * masks and sysfs.
		 */
		get_online_cpus();
		unregister_cpu_under_node(cpu, old_nid);
		unmap_cpu_from_node(cpu);
		map_cpu_to_node(cpu, nid);
		register_cpu_under_node(cpu, nid);
		put_online_cpus();

		sysdev = get_cpu_sysdev(cpu);
		if (sysdev)
			kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
	}

	return 1;
}

static void topology_work_fn(struct work_struct *work)
{
	rebuild_sched_domains();
}
static DECLARE_WORK(topology_work, topology_work_fn);

void topology_schedule_update(void)
{
	schedule_work(&topology_work);
}

static void topology_timer_fn(unsigned long ignored)
{
	if (!vphn_enabled)
		return;
	if (update_cpu_associativity_changes_mask() > 0)
		topology_schedule_update();
	set_topology_timer();
}
static struct timer_list topology_timer =
	TIMER_INITIALIZER(topology_timer_fn, 0, 0);

static void set_topology_timer(void)
{
	topology_timer.data = 0;
	topology_timer.expires = jiffies + 60 * HZ;
	add_timer(&topology_timer);
}

/*
 * Start polling for VPHN associativity changes.
 */
int start_topology_update(void)
{
	int rc = 0;

	/* Disabled until races with load balancing are fixed */
	if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
		vphn_enabled = 1;
		setup_cpu_associativity_change_counters();
		init_timer_deferrable(&topology_timer);
		set_topology_timer();
		rc = 1;
	}

	return rc;
}
__initcall(start_topology_update);

/*
 * Disable polling for VPHN associativity changes.
 */
int stop_topology_update(void)
{
	vphn_enabled = 0;
	return del_timer_sync(&topology_timer);
}
#endif /* CONFIG_PPC_SPLPAR */