numa.c

		 * Check to make sure that this memblock.reserved area is
		 * within the bounds of the node that we care about.
		 * Checking the nid of the start and end points is not
		 * sufficient because the reserved area could span the
		 * entire node.
		 */
		if (end_pfn <= node->node_start_pfn ||
		    start_pfn >= node_end_pfn)
			continue;

		get_node_active_region(start_pfn, &node_ar);
		while (start_pfn < end_pfn &&
			node_ar.start_pfn < node_ar.end_pfn) {
			unsigned long reserve_size = size;
			/*
			 * if reserved region extends past active region
			 * then trim size to active region
			 */
			if (end_pfn > node_ar.end_pfn)
				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
					- physbase;
			/*
			 * Only worry about *this* node, others may not
			 * yet have valid NODE_DATA().
			 */
			if (node_ar.nid == nid) {
				dbg("reserve_bootmem %lx %lx nid=%d\n",
					physbase, reserve_size, node_ar.nid);
				reserve_bootmem_node(NODE_DATA(node_ar.nid),
						physbase, reserve_size,
						BOOTMEM_DEFAULT);
			}
			/*
			 * if reserved region is contained in the active region
			 * then done.
			 */
			if (end_pfn <= node_ar.end_pfn)
				break;

			/*
			 * reserved region extends past the active region
			 *   get next active region that contains this
			 *   reserved region
			 */
			start_pfn = node_ar.end_pfn;
			physbase = start_pfn << PAGE_SHIFT;
			size = size - reserve_size;
			get_node_active_region(start_pfn, &node_ar);
		}
	}
}


/*
 * Set up the boot-time memory allocator for every online node and then
 * prepare the CPU-to-node lookup state.
 *
 * For each online node this allocates the node's pglist_data and bootmem
 * bitmap via careful_zallocation(), initialises the node's bootmem, frees
 * the node's active regions into it, marks the reserved regions and
 * registers the memory with sparsemem. The order of those steps matters;
 * see the comments inside the loop.
 */
void __init do_init_bootmem(void)
{
	int nid, cpu;

	/* Low memory spans from pfn 0 to the end of DRAM. */
	min_low_pfn = 0;
	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	/* A non-zero return selects the non-NUMA setup path. */
	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_memory_topology();

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		void *bootmem_vaddr;
		unsigned long bootmap_pages;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

		/*
		 * Allocate the node structure node local if possible
		 *
		 * Be careful moving this around, as it relies on all
		 * previous nodes' bootmem to be initialized and have
		 * all reserved areas marked.
		 */
		NODE_DATA(nid) = careful_zallocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_pfn);

  		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
		NODE_DATA(nid)->node_start_pfn = start_pfn;
		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;

		/* A node spanning no pages needs no bootmem bitmap. */
		if (NODE_DATA(nid)->node_spanned_pages == 0)
  			continue;

  		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
  		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);

		/* Size and allocate the bootmem bitmap covering this node. */
		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
		bootmem_vaddr = careful_zallocation(nid,
					bootmap_pages << PAGE_SHIFT,
					PAGE_SIZE, end_pfn);

		dbg("bootmap_vaddr = %p\n", bootmem_vaddr);

		init_bootmem_node(NODE_DATA(nid),
				  __pa(bootmem_vaddr) >> PAGE_SHIFT,
				  start_pfn, end_pfn);

		free_bootmem_with_active_regions(nid, end_pfn);
		/*
		 * Be very careful about moving this around.  Future
		 * calls to careful_zallocation() depend on this getting
		 * done correctly.
		 */
		mark_reserved_regions_for_nid(nid);
		sparse_memory_present_with_active_regions(nid);
	}

	/* Record that per-node bootmem setup has completed. */
	init_bootmem_done = 1;

	/*
	 * Now bootmem is initialised we can create the node to cpumask
	 * lookup tables and setup the cpu callback to populate them.
	 */
	setup_node_to_cpumask_map();

	reset_numa_cpu_lookup_table();
	register_cpu_notifier(&ppc64_numa_nb);
	/*
	 * We need the numa_cpu_lookup_table to be accurate for all CPUs,
	 * even before we online them, so that we can use cpu_to_{node,mem}
	 * early in boot, cf. smp_prepare_cpus().
	 */
	for_each_present_cpu(cpu) {
		numa_setup_cpu((unsigned long)cpu);
	}
}

/*
 * Handle the "numa" early boot parameter.
 *
 * The argument is searched for three substrings: "off" clears
 * numa_enabled, "debug" sets numa_debug, and "fake=" stores a pointer to
 * the text following it in cmdline. Always returns 0.
 */
static int __init early_numa(char *p)
{
	char *fake;

	if (p == NULL)
		return 0;

	if (strstr(p, "off") != NULL)
		numa_enabled = 0;

	if (strstr(p, "debug") != NULL)
		numa_debug = 1;

	fake = strstr(p, "fake=");
	if (fake != NULL)
		cmdline = fake + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);

static bool topology_updates_enabled = true;

/*
 * Handle the "topology_updates" early boot parameter. An argument of
 * exactly "off" clears topology_updates_enabled; anything else is
 * ignored. Always returns 0.
 */
static int __init early_topology_updates(char *p)
{
	if (p && strcmp(p, "off") == 0) {
		pr_info("Disabling topology updates\n");
		topology_updates_enabled = false;
	}

	return 0;
}
early_param("topology_updates", early_topology_updates);

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Look up the node for a hot added memory section when memory is
 * described by the device tree property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 *
 * Returns the node id of the first usable LMB that contains scn_addr,
 * or -1 if the property data cannot be read or no LMB matches.
 */
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
				     unsigned long scn_addr)
{
	const __be32 *dm;
	unsigned int cells_left, rc;
	unsigned long lmb_size;
	struct assoc_arrays aa;

	cells_left = of_get_drconf_memory(memory, &dm);
	if (cells_left == 0)
		return -1;

	lmb_size = of_get_lmb_size(memory);
	if (lmb_size == 0)
		return -1;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc != 0)
		return -1;

	while (cells_left-- != 0) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* Reserved LMBs are never candidates. */
		if (drmem.flags & DRCONF_MEM_RESERVED)
			continue;

		/* Neither are LMBs not assigned to this partition. */
		if (!(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		/* First LMB whose [base, base + lmb_size) holds the address. */
		if (scn_addr >= drmem.base_addr &&
		    scn_addr < drmem.base_addr + lmb_size)
			return of_drconf_to_nid_single(&drmem, &aa);
	}

	return -1;
}

/*
 * Look up the node for a hot added memory section when memory is
 * described by individual device tree nodes (i.e. memory@XXXX), one per
 * memblock.
 *
 * Returns the node id of the first "memory" node with a "reg" range
 * containing scn_addr that maps to a non-negative node id, or -1 if
 * there is none.
 */
static int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory;
	int nid = -1;

	for_each_node_by_type(memory, "memory") {
		const __be32 *reg;
		unsigned int len;
		int remaining;

		reg = of_get_property(memory, "reg", &len);
		if (!reg || len <= 0)
			continue;

		/* Number of (address, size) pairs held in "reg". */
		remaining = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		for (; remaining != 0; remaining--) {
			unsigned long start, size;

			start = read_n_cells(n_mem_addr_cells, &reg);
			size = read_n_cells(n_mem_size_cells, &reg);

			if (scn_addr >= start && scn_addr < start + size) {
				nid = of_node_to_nid_single(memory);
				break;
			}
		}

		if (nid >= 0)
			break;
	}

	/* Called on both the early-exit and the loop-exhausted path. */
	of_node_put(memory);

	return nid;
}

/*
 * Find the node associated with a hot added memory section.  Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK.  It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 *
 * Falls back to first_online_node when NUMA is disabled, when
 * min_common_depth is negative, or when the lookup yields no online
 * node. If the chosen node spans no pages, the first online node that
 * does span pages is returned instead; it is a BUG if none exists.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory;
	int nid, found;

	if (!numa_enabled || (min_common_depth < 0))
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!memory) {
		nid = hot_add_node_scn_to_nid(scn_addr);
	} else {
		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
		of_node_put(memory);
	}

	if (nid < 0 || !node_online(nid))
		nid = first_online_node;

	if (NODE_DATA(nid)->node_spanned_pages)
		return nid;

	/* Chosen node is empty: settle for any online node with memory. */
	found = 0;
	for_each_online_node(nid) {
		if (!NODE_DATA(nid)->node_spanned_pages)
			continue;

		found = 1;
		break;
	}

	BUG_ON(!found);
	return nid;
}

/*
 * Return the LMB size multiplied by the number of drconf memory cells
 * found under /ibm,dynamic-reconfiguration-memory, or 0 when that node
 * does not exist.
 */
static u64 hot_add_drconf_memory_max(void)
{
	struct device_node *memory;
	unsigned int drconf_cell_cnt;
	u64 lmb_size;
	const __be32 *dm = NULL;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (!memory)
		return 0;

	drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
	lmb_size = of_get_lmb_size(memory);
	of_node_put(memory);

	return lmb_size * drconf_cell_cnt;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * The result is the larger of the drconf memory maximum and the current
 * end of DRAM. This is currently only used on systems that support
 * drconfig memory hotplug.
 */
u64 memory_hotplug_max(void)
{
	u64 drconf_max = hot_add_drconf_memory_max();
	u64 dram_end = memblock_end_of_DRAM();

	return max(drconf_max, dram_end);
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
/*
 * One pending node reassignment for a single logical CPU. Entries are
 * chained through ->next and walked until a NULL link is reached.
 */
struct topology_update_data {
	struct topology_update_data *next;	/* next entry; NULL ends the list */
	unsigned int cpu;			/* logical CPU being moved */
	int old_nid;				/* node the CPU is currently mapped to */
	int new_nid;				/* node the CPU is moving to */
};

/* Last-seen copy of each CPU's hypervisor associativity change counters. */
static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
/* CPUs flagged as having associativity changes still to be handled. */
static cpumask_t cpu_associativity_changes_mask;
/* Non-zero while VPHN polling / PRRN notification is active, respectively. */
static int vphn_enabled;
static int prrn_enabled;
/* Forward declaration; used by the timer callback before its definition. */
static void reset_topology_timer(void);

/*
 * Snapshot the hypervisor's current associativity change counters for
 * every possible cpu into vphn_cpu_change_counts, so that later polls
 * have a baseline to compare against.
 */
static void setup_cpu_associativity_change_counters(void)
{
	int cpu, level;

	/* The VPHN feature supports a maximum of 8 reference points */
	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);

	for_each_possible_cpu(cpu) {
		volatile u8 *hv_counts = lppaca[cpu].vphn_assoc_counts;
		u8 *snapshot = vphn_cpu_change_counts[cpu];

		for (level = 0; level < distance_ref_points_depth; level++)
			snapshot[level] = hv_counts[level];
	}
}

/*
 * The hypervisor keeps a set of 8 associativity change counters in the
 * VPA of each cpu, one per level of the
 * ibm,associativity-reference-points property, and increments a counter
 * whenever the matching level changes.
 *
 * Compare those counters with our saved copies. For every cpu where any
 * counter differs, refresh the saved copy and set the bits of that cpu
 * and its thread siblings in cpu_associativity_changes_mask.
 *
 * Returns the number of cpus with unhandled associativity changes.
 */
static int update_cpu_associativity_changes_mask(void)
{
	cpumask_t *changes = &cpu_associativity_changes_mask;
	int cpu;

	for_each_possible_cpu(cpu) {
		volatile u8 *hv_counts = lppaca[cpu].vphn_assoc_counts;
		u8 *seen = vphn_cpu_change_counts[cpu];
		int level, changed = 0;

		for (level = 0; level < distance_ref_points_depth; level++) {
			if (hv_counts[level] == seen[level])
				continue;

			seen[level] = hv_counts[level];
			changed = 1;
		}

		if (!changed)
			continue;

		cpumask_or(changes, changes, cpu_sibling_mask(cpu));
		/* Siblings are flagged already; resume after this core. */
		cpu = cpu_last_thread_sibling(cpu);
	}

	return cpumask_weight(changes);
}

/*
 * 6 64-bit registers unpacked into 12 32-bit associativity values. To form
 * the complete property we have to add the length in the first cell.
 */
#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1)

/*
 * Convert the associativity domain numbers returned from the hypervisor
 * to the sequence they would appear in the ibm,associativity property.
 *
 * @packed:   hypervisor return values, read here as a stream of
 *            big-endian 16-bit fields.
 * @unpacked: output array; cell 0 receives the domain count and cells
 *            1 .. VPHN_ASSOC_BUFSIZE - 1 receive one value each.
 *
 * Returns the number of output cells that were filled from a field other
 * than VPHN_FIELD_UNUSED.
 */
static int vphn_unpack_associativity(const long *packed, __be32 *unpacked)
{
	int i, nr_assoc_doms = 0;
	const __be16 *field = (const __be16 *) packed;

#define VPHN_FIELD_UNUSED	(0xffff)
#define VPHN_FIELD_MSB		(0x8000)
#define VPHN_FIELD_MASK		(~VPHN_FIELD_MSB)

	/* Index 0 is reserved for the length cell written after the loop. */
	for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
		if (be16_to_cpup(field) == VPHN_FIELD_UNUSED) {
			/* All significant fields processed, and remaining
			 * fields contain the reserved value of all 1's.
			 * Just store them.
			 */
			/* NOTE(review): 32-bit load through a 16-bit-aligned
			 * pointer; presumably fine on this platform - confirm.
			 */
			unpacked[i] = *((__be32 *)field);
			field += 2;
		} else if (be16_to_cpup(field) & VPHN_FIELD_MSB) {
			/* Data is in the lower 15 bits of this field */
			unpacked[i] = cpu_to_be32(
				be16_to_cpup(field) & VPHN_FIELD_MASK);
			field++;
			nr_assoc_doms++;
		} else {
			/* Data is in the lower 15 bits of this field
			 * concatenated with the next 16 bit field
			 */
			unpacked[i] = *((__be32 *)field);
			field += 2;
			nr_assoc_doms++;
		}
	}

	/* The first cell contains the length of the property */
	unpacked[0] = cpu_to_be32(nr_assoc_doms);

	return nr_assoc_doms;
}

/*
 * Ask the hypervisor for the home node associativity of a virtual
 * processor and unpack the answer into @associativity.
 *
 * The first six return values are converted to big-endian and unpacked
 * regardless of the hcall status. Returns the hcall status.
 */
static long hcall_vphn(unsigned long cpu, __be32 *associativity)
{
	long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
	int hwcpu = get_hard_smp_processor_id(cpu);
	u64 flags = 1;
	long rc;
	int reg;

	rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);

	for (reg = 0; reg < 6; reg++)
		retbuf[reg] = cpu_to_be64(retbuf[reg]);

	vphn_unpack_associativity(retbuf, associativity);

	return rc;
}

/*
 * Fetch the associativity of @cpu via hcall_vphn(). If the hcall reports
 * H_FUNCTION or H_HARDWARE, log the reason and stop topology updates.
 * Returns the hcall status unchanged.
 */
static long vphn_get_associativity(unsigned long cpu,
					__be32 *associativity)
{
	long rc = hcall_vphn(cpu, associativity);

	if (rc == H_FUNCTION) {
		printk(KERN_INFO
			"VPHN is not supported. Disabling polling...\n");
		stop_topology_update();
	} else if (rc == H_HARDWARE) {
		printk(KERN_ERR
			"hcall_vphn() experienced a hardware fault "
			"preventing VPHN. Disabling polling...\n");
		stop_topology_update();
	}

	return rc;
}

/*
 * Apply the entries of a topology update list that refer to the CPU this
 * function is running on: remap the CPU to its new node and refresh the
 * per-cpu NUMA node/memory settings. This function doesn't perform any
 * locking and is only safe to call from stop_machine().
 *
 * Returns -EINVAL when @data is NULL, 0 otherwise.
 */
static int update_cpu_topology(void *data)
{
	struct topology_update_data *update = data;
	unsigned long cpu;

	if (!update)
		return -EINVAL;

	cpu = smp_processor_id();

	while (update) {
		if (cpu == update->cpu) {
			int new_nid = update->new_nid;

			unmap_cpu_from_node(cpu);
			map_cpu_to_node(cpu, new_nid);
			set_cpu_numa_node(cpu, new_nid);
			set_cpu_numa_mem(cpu, local_memory_node(new_nid));
			vdso_getcpu_init();
		}

		update = update->next;
	}

	return 0;
}

/*
 * Write the new node of every entry in a topology update list into the
 * numa-cpu lookup table. Returns -EINVAL when @data is NULL, 0 otherwise.
 */
static int update_lookup_table(void *data)
{
	struct topology_update_data *update = data;

	if (!update)
		return -EINVAL;

	/*
	 * Upon topology update, the numa-cpu lookup table needs to be updated
	 * for all threads in the core, including offline CPUs, to ensure that
	 * future hotplug operations respect the cpu-to-node associativity
	 * properly.
	 */
	while (update) {
		int nid = update->new_nid;
		int first = cpu_first_thread_sibling(update->cpu);
		int thread;

		for (thread = 0; thread < threads_per_core; thread++)
			update_numa_cpu_lookup_table(first + thread, nid);

		update = update->next;
	}

	return 0;
}

/*
 * Update the node maps and sysfs entries for each cpu whose home node
 * has changed. Returns 1 when the topology has changed, and 0 otherwise.
 *
 * The cpus to examine are taken from cpu_associativity_changes_mask. For
 * each flagged core the new associativity is fetched once (for the first
 * thread) and, if the node differs from the current lookup table entry,
 * one update entry per thread sibling is queued. The queued entries are
 * then applied under stop_machine() and reflected in sysfs.
 */
int arch_update_cpu_topology(void)
{
	unsigned int cpu, sibling, changed = 0;
	struct topology_update_data *updates, *ud;
	__be32 associativity[VPHN_ASSOC_BUFSIZE] = {0};
	cpumask_t updated_cpus;
	struct device *dev;
	int weight, new_nid, i = 0;

	if (!prrn_enabled && !vphn_enabled)
		return 0;

	weight = cpumask_weight(&cpu_associativity_changes_mask);
	if (!weight)
		return 0;

	/* One slot per flagged cpu; not every slot is necessarily used. */
	updates = kzalloc(weight * (sizeof(*updates)), GFP_KERNEL);
	if (!updates)
		return 0;

	cpumask_clear(&updated_cpus);

	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
		/*
		 * If siblings aren't flagged for changes, updates list
		 * will be too short. Skip on this update and set for next
		 * update.
		 */
		if (!cpumask_subset(cpu_sibling_mask(cpu),
					&cpu_associativity_changes_mask)) {
			pr_info("Sibling bits not set for associativity "
					"change, cpu%d\n", cpu);
			cpumask_or(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		/* Use associativity from first thread for all siblings */
		vphn_get_associativity(cpu, associativity);
		new_nid = associativity_to_nid(associativity);
		if (new_nid < 0 || !node_online(new_nid))
			new_nid = first_online_node;

		/* Node unchanged: drop the whole core from the pending mask. */
		if (new_nid == numa_cpu_lookup_table[cpu]) {
			cpumask_andnot(&cpu_associativity_changes_mask,
					&cpu_associativity_changes_mask,
					cpu_sibling_mask(cpu));
			cpu = cpu_last_thread_sibling(cpu);
			continue;
		}

		for_each_cpu(sibling, cpu_sibling_mask(cpu)) {
			ud = &updates[i++];
			ud->cpu = sibling;
			ud->new_nid = new_nid;
			ud->old_nid = numa_cpu_lookup_table[sibling];
			cpumask_set_cpu(sibling, &updated_cpus);
			if (i < weight)
				ud->next = &updates[i];
		}
		cpu = cpu_last_thread_sibling(cpu);
	}

	/*
	 * Terminate the list at the last entry that was actually filled in.
	 * When some flagged cores were skipped above, fewer than 'weight'
	 * slots are used and the last used entry would otherwise link to a
	 * zero-filled slot, which every list walker below would treat as a
	 * bogus "cpu 0 -> node 0" update.
	 */
	if (i)
		updates[i - 1].next = NULL;

	pr_debug("Topology update for the following CPUs:\n");
	if (cpumask_weight(&updated_cpus)) {
		for (ud = &updates[0]; ud; ud = ud->next) {
			pr_debug("cpu %d moving from node %d "
					  "to %d\n", ud->cpu,
					  ud->old_nid, ud->new_nid);
		}
	}

	/*
	 * In cases where we have nothing to update (because the updates list
	 * is too short or because the new topology is same as the old one),
	 * skip invoking update_cpu_topology() via stop-machine(). This is
	 * necessary (and not just a fast-path optimization) since stop-machine
	 * can end up electing a random CPU to run update_cpu_topology(), and
	 * thus trick us into setting up incorrect cpu-node mappings (since
	 * 'updates' is kzalloc()'ed).
	 *
	 * And for the similar reason, we will skip all the following updating.
	 */
	if (!cpumask_weight(&updated_cpus))
		goto out;

	stop_machine(update_cpu_topology, &updates[0], &updated_cpus);

	/*
	 * Update the numa-cpu lookup table with the new mappings, even for
	 * offline CPUs. It is best to perform this update from the stop-
	 * machine context.
	 */
	stop_machine(update_lookup_table, &updates[0],
					cpumask_of(raw_smp_processor_id()));

	/* Move each cpu's sysfs node link and announce the change. */
	for (ud = &updates[0]; ud; ud = ud->next) {
		unregister_cpu_under_node(ud->cpu, ud->old_nid);
		register_cpu_under_node(ud->cpu, ud->new_nid);

		dev = get_cpu_device(ud->cpu);
		if (dev)
			kobject_uevent(&dev->kobj, KOBJ_CHANGE);
		cpumask_clear_cpu(ud->cpu, &cpu_associativity_changes_mask);
		changed = 1;
	}

out:
	kfree(updates);
	return changed;
}

/* Work item body: rebuild the scheduler domains. @work is not used. */
static void topology_work_fn(struct work_struct *work)
{
	rebuild_sched_domains();
}
static DECLARE_WORK(topology_work, topology_work_fn);

/* Queue topology_work so the update runs from a workqueue. */
static void topology_schedule_update(void)
{
	schedule_work(&topology_work);
}

/*
 * Timer callback. With PRRN enabled and changes pending, schedule a
 * topology update (the timer is not re-armed here). Otherwise, with VPHN
 * enabled, poll the change counters, schedule an update if any cpu
 * changed, and re-arm the timer.
 */
static void topology_timer_fn(unsigned long ignored)
{
	if (prrn_enabled && cpumask_weight(&cpu_associativity_changes_mask)) {
		topology_schedule_update();
		return;
	}

	if (!vphn_enabled)
		return;

	if (update_cpu_associativity_changes_mask() > 0)
		topology_schedule_update();

	reset_topology_timer();
}
static struct timer_list topology_timer =
	TIMER_INITIALIZER(topology_timer_fn, 0, 0);

/*
 * (Re)arm the topology timer to fire 60 seconds from now.
 *
 * The new expiry is handed to mod_timer() only. ->expires is deliberately
 * not written directly: this function can be called while the timer is
 * still pending (e.g. from stage_topology_update()), and changing the
 * expiry of a pending timer behind the timer core's back can make
 * mod_timer() see an unchanged expiry and leave the timer queued for its
 * old deadline.
 */
static void reset_topology_timer(void)
{
	topology_timer.data = 0;
	mod_timer(&topology_timer, jiffies + 60 * HZ);
}

#ifdef CONFIG_SMP

static void stage_topology_update(int core_id)
{
	cpumask_or(&cpu_associativity_changes_mask,
		&cpu_associativity_changes_mask, cpu_sibling_mask(core_id));
	reset_topology_timer();
}

/*
 * Device tree reconfiguration notifier.
 *
 * Reacts only to OF_RECONFIG_UPDATE_PROPERTY events that update the
 * "ibm,associativity" property of a node of type "cpu": the node's "reg"
 * value is read and a topology update is staged for it.
 *
 * Returns NOTIFY_OK when an update was staged, NOTIFY_DONE otherwise
 * (including when "reg" cannot be read, so that an uninitialised core id
 * is never passed on).
 */
static int dt_update_callback(struct notifier_block *nb,
				unsigned long action, void *data)
{
	struct of_prop_reconfig *update = data;
	u32 core_id;

	if (action != OF_RECONFIG_UPDATE_PROPERTY)
		return NOTIFY_DONE;

	if (of_prop_cmp(update->dn->type, "cpu") ||
	    of_prop_cmp(update->prop->name, "ibm,associativity"))
		return NOTIFY_DONE;

	/* Without a valid "reg" there is no core to stage an update for. */
	if (of_property_read_u32(update->dn, "reg", &core_id))
		return NOTIFY_DONE;

	stage_topology_update(core_id);

	return NOTIFY_OK;
}

/* Notifier block that routes device tree reconfig events to dt_update_callback(). */
static struct notifier_block dt_update_nb = {
	.notifier_call = dt_update_callback,
};

#endif

/*
 * Start watching for associativity changes.
 *
 * If the firmware has the PRRN feature, PRRN mode is enabled and (on
 * SMP) the device tree reconfig notifier is registered. Otherwise, if
 * the firmware has VPHN and the partition uses shared processors, VPHN
 * polling is enabled and the topology timer is armed. A mode that is
 * already enabled is left untouched.
 *
 * Returns the notifier registration result in the PRRN case, else 0.
 */
int start_topology_update(void)
{
	int rc = 0;

	if (firmware_has_feature(FW_FEATURE_PRRN)) {
		if (prrn_enabled)
			return rc;

		prrn_enabled = 1;
		vphn_enabled = 0;
#ifdef CONFIG_SMP
		rc = of_reconfig_notifier_register(&dt_update_nb);
#endif
		return rc;
	}

	if (!firmware_has_feature(FW_FEATURE_VPHN) ||
	    !lppaca_shared_proc(get_lppaca()))
		return rc;

	if (vphn_enabled)
		return rc;

	prrn_enabled = 0;
	vphn_enabled = 1;
	setup_cpu_associativity_change_counters();
	init_timer_deferrable(&topology_timer);
	reset_topology_timer();

	return rc;
}

/*
 * Stop watching for associativity changes.
 *
 * In PRRN mode the device tree reconfig notifier is unregistered (on
 * SMP); in VPHN mode the topology timer is deleted. Returns the result
 * of that call, or 0 when neither mode was enabled.
 */
int stop_topology_update(void)
{
	int rc = 0;

	if (prrn_enabled) {
		prrn_enabled = 0;
#ifdef CONFIG_SMP
		rc = of_reconfig_notifier_unregister(&dt_update_nb);
#endif
		return rc;
	}

	if (!vphn_enabled)
		return rc;

	vphn_enabled = 0;
	rc = del_timer_sync(&topology_timer);

	return rc;
}

/* Return the current value of prrn_enabled (non-zero when PRRN mode is on). */
int prrn_is_enabled(void)
{
	return prrn_enabled;
}

/*
 * seq_file show routine: emit "on" when either VPHN or PRRN updates are
 * enabled, "off" otherwise. Always returns 0.
 */
static int topology_read(struct seq_file *file, void *v)
{
	int active = vphn_enabled || prrn_enabled;

	seq_puts(file, active ? "on\n" : "off\n");

	return 0;
}

/* Open handler: bind the file to topology_read() via single_open(). */
static int topology_open(struct inode *inode, struct file *file)
{
	return single_open(file, topology_read, NULL);
}

/*
 * Write handler: at most the first three bytes of the user buffer are
 * examined. Input beginning with "on" starts topology updates, input
 * beginning with "off" stops them.
 *
 * Returns @count on success, -EINVAL if the copy from user space fails
 * or the input matches neither keyword.
 */
static ssize_t topology_write(struct file *file, const char __user *buf,
			      size_t count, loff_t *off)
{
	char kbuf[4]; /* "on" or "off" plus null. */
	int read_len = count < 3 ? count : 3;

	if (copy_from_user(kbuf, buf, read_len))
		return -EINVAL;

	kbuf[read_len] = '\0';

	if (strncmp(kbuf, "on", 2) == 0)
		start_topology_update();
	else if (strncmp(kbuf, "off", 3) == 0)
		stop_topology_update();
	else
		return -EINVAL;

	return count;
}

/* File operations for the "powerpc/topology_updates" proc entry. */
static const struct file_operations topology_ops = {
	.read = seq_read,
	.write = topology_write,
	.open = topology_open,
	.release = single_release
};

/*
 * Initcall: start topology updates unless they were disabled at boot,
 * then create the "powerpc/topology_updates" proc entry.
 *
 * Returns 0 on success or -ENOMEM if the proc entry cannot be created.
 */
static int topology_update_init(void)
{
	/* Do not poll for changes if disabled at boot */
	if (topology_updates_enabled)
		start_topology_update();

	return proc_create("powerpc/topology_updates", 0644, NULL,
			   &topology_ops) ? 0 : -ENOMEM;
}
device_initcall(topology_update_init);
#endif /* CONFIG_PPC_SPLPAR */