GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/powerpc/mm/numa.c
/*
 * pSeries NUMA support
 *
 * Copyright (C) 2002 Anton Blanchard <[email protected]>, IBM
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 */
#include <linux/threads.h>
#include <linux/bootmem.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/mmzone.h>
#include <linux/module.h>
#include <linux/nodemask.h>
#include <linux/cpu.h>
#include <linux/notifier.h>
#include <linux/memblock.h>
#include <linux/of.h>
#include <linux/pfn.h>
#include <linux/cpuset.h>
#include <linux/node.h>
#include <asm/sparsemem.h>
#include <asm/prom.h>
#include <asm/system.h>
#include <asm/smp.h>
#include <asm/firmware.h>
#include <asm/paca.h>
#include <asm/hvcall.h>

static int numa_enabled = 1;

static char *cmdline __initdata;

static int numa_debug;
#define dbg(args...) if (numa_debug) { printk(KERN_INFO args); }

int numa_cpu_lookup_table[NR_CPUS];
cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
struct pglist_data *node_data[MAX_NUMNODES];

EXPORT_SYMBOL(numa_cpu_lookup_table);
EXPORT_SYMBOL(node_to_cpumask_map);
EXPORT_SYMBOL(node_data);

static int min_common_depth;
static int n_mem_addr_cells, n_mem_size_cells;
static int form1_affinity;

#define MAX_DISTANCE_REF_POINTS 4
static int distance_ref_points_depth;
static const unsigned int *distance_ref_points;
static int distance_lookup_table[MAX_NUMNODES][MAX_DISTANCE_REF_POINTS];

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: node_to_cpumask() is not valid until after this is done.
 */
static void __init setup_node_to_cpumask_map(void)
{
	unsigned int node, num = 0;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES) {
		for_each_node_mask(node, node_possible_map)
			num = node;
		nr_node_ids = num + 1;
	}

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	dbg("Node to cpumask map for %d nodes\n", nr_node_ids);
}

static int __cpuinit fake_numa_create_new_node(unsigned long end_pfn,
						unsigned int *nid)
{
	unsigned long long mem;
	char *p = cmdline;
	static unsigned int fake_nid;
	static unsigned long long curr_boundary;

	/*
	 * Modify node id, iff we started creating NUMA nodes.
	 * We want to continue from where we left off last time.
	 */
	if (fake_nid)
		*nid = fake_nid;
	/*
	 * In case there are no more arguments to parse, the
	 * node_id should be the same as the last fake node id
	 * (we've handled this above).
	 */
	if (!p)
		return 0;

	mem = memparse(p, &p);
	if (!mem)
		return 0;

	if (mem < curr_boundary)
		return 0;

	curr_boundary = mem;

	if ((end_pfn << PAGE_SHIFT) > mem) {
		/*
		 * Skip commas and spaces
		 */
		while (*p == ',' || *p == ' ' || *p == '\t')
			p++;

		cmdline = p;
		fake_nid++;
		*nid = fake_nid;
		dbg("created new fake_node with id %d\n", fake_nid);
		return 1;
	}
	return 0;
}
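
/*
 * Example (illustrative command line): booting with "numa=fake=512M,1G,2G"
 * leaves cmdline pointing at "512M,1G,2G".  As the memory scan moves
 * upward, regions below 512M keep the caller's node id; once end_pfn
 * crosses 512M a new fake node is created, then another at 1G, and so on,
 * with cmdline advanced past each boundary as it is consumed.
 */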

/*
 * get_active_region_work_fn - A helper function for get_node_active_region
 *	Returns datax set to the start_pfn and end_pfn if they contain
 *	the initial value of datax->start_pfn between them
 * @start_pfn: start page(inclusive) of region to check
 * @end_pfn: end page(exclusive) of region to check
 * @datax: comes in with ->start_pfn set to value to search for and
 *	goes out with active range if it contains it
 * Returns 1 if search value is in range else 0
 */
static int __init get_active_region_work_fn(unsigned long start_pfn,
					unsigned long end_pfn, void *datax)
{
	struct node_active_region *data;
	data = (struct node_active_region *)datax;

	if (start_pfn <= data->start_pfn && end_pfn > data->start_pfn) {
		data->start_pfn = start_pfn;
		data->end_pfn = end_pfn;
		return 1;
	}
	return 0;
}

/*
 * get_node_active_region - Return active region containing start_pfn
 * Active range returned is empty if none found.
 * @start_pfn: The page to return the region for.
 * @node_ar: Returned set to the active region containing start_pfn
 */
static void __init get_node_active_region(unsigned long start_pfn,
				struct node_active_region *node_ar)
{
	int nid = early_pfn_to_nid(start_pfn);

	node_ar->nid = nid;
	node_ar->start_pfn = start_pfn;
	node_ar->end_pfn = start_pfn;
	work_with_active_regions(nid, get_active_region_work_fn, node_ar);
}

static void map_cpu_to_node(int cpu, int node)
{
	numa_cpu_lookup_table[cpu] = node;

	dbg("adding cpu %d to node %d\n", cpu, node);

	if (!(cpumask_test_cpu(cpu, node_to_cpumask_map[node])))
		cpumask_set_cpu(cpu, node_to_cpumask_map[node]);
}

#if defined(CONFIG_HOTPLUG_CPU) || defined(CONFIG_PPC_SPLPAR)
static void unmap_cpu_from_node(unsigned long cpu)
{
	int node = numa_cpu_lookup_table[cpu];

	dbg("removing cpu %lu from node %d\n", cpu, node);

	if (cpumask_test_cpu(cpu, node_to_cpumask_map[node])) {
		cpumask_clear_cpu(cpu, node_to_cpumask_map[node]);
	} else {
		printk(KERN_ERR "WARNING: cpu %lu not found in node %d\n",
		       cpu, node);
	}
}
#endif /* CONFIG_HOTPLUG_CPU || CONFIG_PPC_SPLPAR */

/* must hold reference to node during call */
static const int *of_get_associativity(struct device_node *dev)
{
	return of_get_property(dev, "ibm,associativity", NULL);
}

/*
 * Returns the property linux,drconf-usable-memory if
 * it exists (the property exists only in kexec/kdump kernels,
 * added by kexec-tools)
 */
static const u32 *of_get_usable_memory(struct device_node *memory)
{
	const u32 *prop;
	u32 len;
	prop = of_get_property(memory, "linux,drconf-usable-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;
	return prop;
}

int __node_distance(int a, int b)
{
	int i;
	int distance = LOCAL_DISTANCE;

	if (!form1_affinity)
		return distance;

	for (i = 0; i < distance_ref_points_depth; i++) {
		if (distance_lookup_table[a][i] == distance_lookup_table[b][i])
			break;

		/* Double the distance for each NUMA level */
		distance *= 2;
	}

	return distance;
}
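
/*
 * Worked example (illustrative): with LOCAL_DISTANCE of 10 and a
 * distance_ref_points_depth of 2, two nodes whose entries match at the
 * first (most significant) reference point report distance 10; nodes
 * that differ there but match at the second report 20; nodes that differ
 * at both report 40, since the distance doubles per mismatched level.
 */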

static void initialize_distance_lookup_table(int nid,
		const unsigned int *associativity)
{
	int i;

	if (!form1_affinity)
		return;

	for (i = 0; i < distance_ref_points_depth; i++) {
		distance_lookup_table[nid][i] =
			associativity[distance_ref_points[i]];
	}
}

/* Returns nid in the range [0..MAX_NUMNODES-1], or -1 if no useful numa
 * info is found.
 */
static int associativity_to_nid(const unsigned int *associativity)
{
	int nid = -1;

	if (min_common_depth == -1)
		goto out;

	if (associativity[0] >= min_common_depth)
		nid = associativity[min_common_depth];

	/* POWER4 LPAR uses 0xffff as invalid node */
	if (nid == 0xffff || nid >= MAX_NUMNODES)
		nid = -1;

	if (nid > 0 && associativity[0] >= distance_ref_points_depth)
		initialize_distance_lookup_table(nid, associativity);

out:
	return nid;
}

/* Returns the nid associated with the given device tree node,
 * or -1 if not found.
 */
static int of_node_to_nid_single(struct device_node *device)
{
	int nid = -1;
	const unsigned int *tmp;

	tmp = of_get_associativity(device);
	if (tmp)
		nid = associativity_to_nid(tmp);
	return nid;
}

/* Walk the device tree upwards, looking for an associativity id */
int of_node_to_nid(struct device_node *device)
{
	struct device_node *tmp;
	int nid = -1;

	of_node_get(device);
	while (device) {
		nid = of_node_to_nid_single(device);
		if (nid != -1)
			break;

		tmp = device;
		device = of_get_parent(tmp);
		of_node_put(tmp);
	}
	of_node_put(device);

	return nid;
}
EXPORT_SYMBOL_GPL(of_node_to_nid);

static int __init find_min_common_depth(void)
{
	int depth;
	struct device_node *chosen;
	struct device_node *root;
	const char *vec5;

	root = of_find_node_by_path("/rtas");
	if (!root)
		root = of_find_node_by_path("/");

	/*
	 * This property is a set of 32-bit integers, each representing
	 * an index into the ibm,associativity nodes.
	 *
	 * With form 0 affinity the first integer is for an SMP configuration
	 * (should be all 0's) and the second is for a normal NUMA
	 * configuration. We have only one level of NUMA.
	 *
	 * With form 1 affinity the first integer is the most significant
	 * NUMA boundary and the following are progressively less significant
	 * boundaries. There can be more than one level of NUMA.
	 */
	distance_ref_points = of_get_property(root,
					"ibm,associativity-reference-points",
					&distance_ref_points_depth);

	if (!distance_ref_points) {
		dbg("NUMA: ibm,associativity-reference-points not found.\n");
		goto err;
	}

	distance_ref_points_depth /= sizeof(int);

#define VEC5_AFFINITY_BYTE	5
#define VEC5_AFFINITY		0x80
	chosen = of_find_node_by_path("/chosen");
	if (chosen) {
		vec5 = of_get_property(chosen, "ibm,architecture-vec-5", NULL);
		if (vec5 && (vec5[VEC5_AFFINITY_BYTE] & VEC5_AFFINITY)) {
			dbg("Using form 1 affinity\n");
			form1_affinity = 1;
		}
	}

	if (form1_affinity) {
		depth = distance_ref_points[0];
	} else {
		if (distance_ref_points_depth < 2) {
			printk(KERN_WARNING "NUMA: "
				"short ibm,associativity-reference-points\n");
			goto err;
		}

		depth = distance_ref_points[1];
	}

	/*
	 * Warn and cap if the hardware supports more than
	 * MAX_DISTANCE_REF_POINTS domains.
	 */
	if (distance_ref_points_depth > MAX_DISTANCE_REF_POINTS) {
		printk(KERN_WARNING "NUMA: distance array capped at "
			"%d entries\n", MAX_DISTANCE_REF_POINTS);
		distance_ref_points_depth = MAX_DISTANCE_REF_POINTS;
	}

	of_node_put(root);
	return depth;

err:
	of_node_put(root);
	return -1;
}

static void __init get_n_mem_cells(int *n_addr_cells, int *n_size_cells)
{
	struct device_node *memory = NULL;

	memory = of_find_node_by_type(memory, "memory");
	if (!memory)
		panic("numa.c: No memory nodes found!");

	*n_addr_cells = of_n_addr_cells(memory);
	*n_size_cells = of_n_size_cells(memory);
	of_node_put(memory);
}

static unsigned long __devinit read_n_cells(int n, const unsigned int **buf)
{
	unsigned long result = 0;

	while (n--) {
		result = (result << 32) | **buf;
		(*buf)++;
	}
	return result;
}
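
/*
 * Example (illustrative cells): with n == 2 and a buffer holding
 * { 0x00000001, 0x80000000 }, read_n_cells() returns 0x180000000 and
 * advances *buf past both cells, so consecutive address/size pairs can
 * be read back to back.
 */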

struct of_drconf_cell {
	u64	base_addr;
	u32	drc_index;
	u32	reserved;
	u32	aa_index;
	u32	flags;
};

#define DRCONF_MEM_ASSIGNED	0x00000008
#define DRCONF_MEM_AI_INVALID	0x00000040
#define DRCONF_MEM_RESERVED	0x00000080

/*
 * Read the next memblock list entry from the ibm,dynamic-memory property
 * and return the information in the provided of_drconf_cell structure.
 */
static void read_drconf_cell(struct of_drconf_cell *drmem, const u32 **cellp)
{
	const u32 *cp;

	drmem->base_addr = read_n_cells(n_mem_addr_cells, cellp);

	cp = *cellp;
	drmem->drc_index = cp[0];
	drmem->reserved = cp[1];
	drmem->aa_index = cp[2];
	drmem->flags = cp[3];

	*cellp = cp + 4;
}

/*
 * Retrieve and validate the ibm,dynamic-memory property of the device tree.
 *
 * The layout of the ibm,dynamic-memory property is a count N followed by
 * N memblock list entries.  Each memblock list entry contains information
 * as laid out in the of_drconf_cell struct above.
 */
static int of_get_drconf_memory(struct device_node *memory, const u32 **dm)
{
	const u32 *prop;
	u32 len, entries;

	prop = of_get_property(memory, "ibm,dynamic-memory", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	entries = *prop++;

	/* Now that we know the number of entries, revalidate the size
	 * of the property read in to ensure we have everything
	 */
	if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
		return 0;

	*dm = prop;
	return entries;
}

/*
 * Retrieve and validate the ibm,lmb-size property for drconf memory
 * from the device tree.
 */
static u64 of_get_lmb_size(struct device_node *memory)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,lmb-size", &len);
	if (!prop || len < sizeof(unsigned int))
		return 0;

	return read_n_cells(n_mem_size_cells, &prop);
}

struct assoc_arrays {
	u32	n_arrays;
	u32	array_sz;
	const u32 *arrays;
};

/*
 * Retrieve and validate the list of associativity arrays for drconf
 * memory from the ibm,associativity-lookup-arrays property of the
 * device tree.
 *
 * The layout of the ibm,associativity-lookup-arrays property is a number N
 * indicating the number of associativity arrays, followed by a number M
 * indicating the size of each associativity array, followed by a list
 * of N associativity arrays.
 */
static int of_get_assoc_arrays(struct device_node *memory,
			       struct assoc_arrays *aa)
{
	const u32 *prop;
	u32 len;

	prop = of_get_property(memory, "ibm,associativity-lookup-arrays", &len);
	if (!prop || len < 2 * sizeof(unsigned int))
		return -1;

	aa->n_arrays = *prop++;
	aa->array_sz = *prop++;

	/* Now that we know the number of arrays and size of each array,
	 * revalidate the size of the property read in.
	 */
	if (len < (aa->n_arrays * aa->array_sz + 2) * sizeof(unsigned int))
		return -1;

	aa->arrays = prop;
	return 0;
}

/*
 * This is like of_node_to_nid_single() for memory represented in the
 * ibm,dynamic-reconfiguration-memory node.
 */
static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
				   struct assoc_arrays *aa)
{
	int default_nid = 0;
	int nid = default_nid;
	int index;

	if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
	    !(drmem->flags & DRCONF_MEM_AI_INVALID) &&
	    drmem->aa_index < aa->n_arrays) {
		index = drmem->aa_index * aa->array_sz + min_common_depth - 1;
		nid = aa->arrays[index];

		if (nid == 0xffff || nid >= MAX_NUMNODES)
			nid = default_nid;
	}

	return nid;
}
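
/*
 * Example (illustrative values): with min_common_depth == 4, an array_sz
 * of 5 and aa_index == 2, the lookup reads aa->arrays[2 * 5 + 4 - 1],
 * i.e. the fourth element of the third associativity array, which holds
 * the node id for that LMB.
 */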

/*
 * Figure out to which domain a cpu belongs and stick it there.
 * Return the id of the domain used.
 */
static int __cpuinit numa_setup_cpu(unsigned long lcpu)
{
	int nid = 0;
	struct device_node *cpu = of_get_cpu_node(lcpu, NULL);

	if (!cpu) {
		WARN_ON(1);
		goto out;
	}

	nid = of_node_to_nid_single(cpu);

	if (nid < 0 || !node_online(nid))
		nid = first_online_node;
out:
	map_cpu_to_node(lcpu, nid);

	of_node_put(cpu);

	return nid;
}

static int __cpuinit cpu_numa_callback(struct notifier_block *nfb,
			     unsigned long action,
			     void *hcpu)
{
	unsigned long lcpu = (unsigned long)hcpu;
	int ret = NOTIFY_DONE;

	switch (action) {
	case CPU_UP_PREPARE:
	case CPU_UP_PREPARE_FROZEN:
		numa_setup_cpu(lcpu);
		ret = NOTIFY_OK;
		break;
#ifdef CONFIG_HOTPLUG_CPU
	case CPU_DEAD:
	case CPU_DEAD_FROZEN:
	case CPU_UP_CANCELED:
	case CPU_UP_CANCELED_FROZEN:
		unmap_cpu_from_node(lcpu);
		ret = NOTIFY_OK;
		break;
#endif
	}
	return ret;
}

/*
 * Check and possibly modify a memory region to enforce the memory limit.
 *
 * Returns the size the region should have to enforce the memory limit.
 * This will either be the original value of size, a truncated value,
 * or zero. If the returned value of size is 0 the region should be
 * discarded as it lies wholly above the memory limit.
 */
static unsigned long __init numa_enforce_memory_limit(unsigned long start,
						      unsigned long size)
{
	/*
	 * We use memblock_end_of_DRAM() in here instead of memory_limit because
	 * we've already adjusted it for the limit and it takes care of
	 * having memory holes below the limit.  Also, in the case of
	 * iommu_is_off, memory_limit is not set but is implicitly enforced.
	 */

	if (start + size <= memblock_end_of_DRAM())
		return size;

	if (start >= memblock_end_of_DRAM())
		return 0;

	return memblock_end_of_DRAM() - start;
}

/*
 * Reads the counter for a given entry in
 * linux,drconf-usable-memory property
 */
static inline int __init read_usm_ranges(const u32 **usm)
{
	/*
	 * For each lmb in ibm,dynamic-memory a corresponding
	 * entry in linux,drconf-usable-memory property contains
	 * a counter followed by that many (base, size) tuples.
	 * Read the counter from linux,drconf-usable-memory.
	 */
	return read_n_cells(n_mem_size_cells, usm);
}

/*
 * Extract NUMA information from the ibm,dynamic-reconfiguration-memory
 * node.  This assumes n_mem_{addr,size}_cells have been set.
 */
static void __init parse_drconf_memory(struct device_node *memory)
{
	const u32 *dm, *usm;
	unsigned int n, rc, ranges, is_kexec_kdump = 0;
	unsigned long lmb_size, base, size, sz;
	int nid;
	struct assoc_arrays aa;

	n = of_get_drconf_memory(memory, &dm);
	if (!n)
		return;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return;

	/* check if this is a kexec/kdump kernel */
	usm = of_get_usable_memory(memory);
	if (usm != NULL)
		is_kexec_kdump = 1;

	for (; n != 0; --n) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if the reserved bit is set in flags (0x80)
		   or if the block is not assigned to this partition (0x8) */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		base = drmem.base_addr;
		size = lmb_size;
		ranges = 1;

		if (is_kexec_kdump) {
			ranges = read_usm_ranges(&usm);
			if (!ranges) /* there are no (base, size) tuples */
				continue;
		}
		do {
			if (is_kexec_kdump) {
				base = read_n_cells(n_mem_addr_cells, &usm);
				size = read_n_cells(n_mem_size_cells, &usm);
			}
			nid = of_drconf_to_nid_single(&drmem, &aa);
			fake_numa_create_new_node(
						((base + size) >> PAGE_SHIFT),
						   &nid);
			node_set_online(nid);
			sz = numa_enforce_memory_limit(base, size);
			if (sz)
				add_active_range(nid, base >> PAGE_SHIFT,
						 (base >> PAGE_SHIFT)
						 + (sz >> PAGE_SHIFT));
		} while (--ranges);
	}
}
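
/*
 * In rough terms (illustrative numbers): with an lmb_size of 256MB and an
 * ibm,dynamic-memory list of 16 entries, the loop above visits 16 LMBs.
 * Each LMB that is assigned and not reserved is mapped to a node via its
 * aa_index and registered as a 256MB active range at its base_addr; on
 * kexec/kdump kernels only the usable sub-ranges listed in
 * linux,drconf-usable-memory are registered instead.
 */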

static int __init parse_numa_properties(void)
{
	struct device_node *cpu = NULL;
	struct device_node *memory = NULL;
	int default_nid = 0;
	unsigned long i;

	if (numa_enabled == 0) {
		printk(KERN_WARNING "NUMA disabled by user\n");
		return -1;
	}

	min_common_depth = find_min_common_depth();

	if (min_common_depth < 0)
		return min_common_depth;

	dbg("NUMA associativity depth for CPU/Memory: %d\n", min_common_depth);

	/*
	 * Even though we connect cpus to numa domains later in SMP
	 * init, we need to know the node ids now. This is because
	 * each node to be onlined must have NODE_DATA etc backing it.
	 */
	for_each_present_cpu(i) {
		int nid;

		cpu = of_get_cpu_node(i, NULL);
		BUG_ON(!cpu);
		nid = of_node_to_nid_single(cpu);
		of_node_put(cpu);

		/*
		 * Don't fall back to default_nid yet -- we will plug
		 * cpus into nodes once the memory scan has discovered
		 * the topology.
		 */
		if (nid < 0)
			continue;
		node_set_online(nid);
	}

	get_n_mem_cells(&n_mem_addr_cells, &n_mem_size_cells);
	memory = NULL;
	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start;
		unsigned long size;
		int nid;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory,
			"linux,usable-memory", &len);
		if (!memcell_buf || len <= 0)
			memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);
new_range:
		/* these are order-sensitive, and modify the buffer pointer */
		start = read_n_cells(n_mem_addr_cells, &memcell_buf);
		size = read_n_cells(n_mem_size_cells, &memcell_buf);

		/*
		 * Assumption: either all memory nodes or none will
		 * have associativity properties.  If none, then
		 * everything goes to default_nid.
		 */
		nid = of_node_to_nid_single(memory);
		if (nid < 0)
			nid = default_nid;

		fake_numa_create_new_node(((start + size) >> PAGE_SHIFT), &nid);
		node_set_online(nid);

		if (!(size = numa_enforce_memory_limit(start, size))) {
			if (--ranges)
				goto new_range;
			else
				continue;
		}

		add_active_range(nid, start >> PAGE_SHIFT,
				(start >> PAGE_SHIFT) + (size >> PAGE_SHIFT));

		if (--ranges)
			goto new_range;
	}

	/*
	 * Now do the same thing for each MEMBLOCK listed in the ibm,dynamic-memory
	 * property in the ibm,dynamic-reconfiguration-memory node.
	 */
	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory)
		parse_drconf_memory(memory);

	return 0;
}

static void __init setup_nonnuma(void)
{
	unsigned long top_of_ram = memblock_end_of_DRAM();
	unsigned long total_ram = memblock_phys_mem_size();
	unsigned long start_pfn, end_pfn;
	unsigned int nid = 0;
	struct memblock_region *reg;

	printk(KERN_DEBUG "Top of RAM: 0x%lx, Total RAM: 0x%lx\n",
	       top_of_ram, total_ram);
	printk(KERN_DEBUG "Memory hole size: %ldMB\n",
	       (top_of_ram - total_ram) >> 20);

	for_each_memblock(memory, reg) {
		start_pfn = memblock_region_memory_base_pfn(reg);
		end_pfn = memblock_region_memory_end_pfn(reg);

		fake_numa_create_new_node(end_pfn, &nid);
		add_active_range(nid, start_pfn, end_pfn);
		node_set_online(nid);
	}
}

void __init dump_numa_cpu_topology(void)
{
	unsigned int node;
	unsigned int cpu, count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		printk(KERN_DEBUG "Node %d CPUs:", node);

		count = 0;
		/*
		 * If we used a CPU iterator here we would miss printing
		 * the holes in the cpumap.
		 */
		for (cpu = 0; cpu < nr_cpu_ids; cpu++) {
			if (cpumask_test_cpu(cpu,
					node_to_cpumask_map[node])) {
				if (count == 0)
					printk(" %u", cpu);
				++count;
			} else {
				if (count > 1)
					printk("-%u", cpu - 1);
				count = 0;
			}
		}

		if (count > 1)
			printk("-%u", nr_cpu_ids - 1);
		printk("\n");
	}
}

static void __init dump_numa_memory_topology(void)
{
	unsigned int node;
	unsigned int count;

	if (min_common_depth == -1 || !numa_enabled)
		return;

	for_each_online_node(node) {
		unsigned long i;

		printk(KERN_DEBUG "Node %d Memory:", node);

		count = 0;

		for (i = 0; i < memblock_end_of_DRAM();
		     i += (1 << SECTION_SIZE_BITS)) {
			if (early_pfn_to_nid(i >> PAGE_SHIFT) == node) {
				if (count == 0)
					printk(" 0x%lx", i);
				++count;
			} else {
				if (count > 0)
					printk("-0x%lx", i);
				count = 0;
			}
		}

		if (count > 0)
			printk("-0x%lx", i);
		printk("\n");
	}
}

/*
 * Allocate some memory, satisfying the memblock or bootmem allocator where
 * required. nid is the preferred node and end is the physical address of
 * the highest address in the node.
 *
 * Returns the virtual address of the memory.
 */
static void __init *careful_zallocation(int nid, unsigned long size,
				       unsigned long align,
				       unsigned long end_pfn)
{
	void *ret;
	int new_nid;
	unsigned long ret_paddr;

	ret_paddr = __memblock_alloc_base(size, align, end_pfn << PAGE_SHIFT);

	/* retry over all memory */
	if (!ret_paddr)
		ret_paddr = __memblock_alloc_base(size, align, memblock_end_of_DRAM());

	if (!ret_paddr)
		panic("numa.c: cannot allocate %lu bytes for node %d",
		      size, nid);

	ret = __va(ret_paddr);

	/*
	 * We initialize the nodes in numeric order: 0, 1, 2...
	 * and hand over control from the MEMBLOCK allocator to the
	 * bootmem allocator.  If this function is called for
	 * node 5, then we know that all nodes <5 are using the
	 * bootmem allocator instead of the MEMBLOCK allocator.
	 *
	 * So, check the nid from which this allocation came
	 * and double check to see if we need to use bootmem
	 * instead of the MEMBLOCK.  We don't free the MEMBLOCK memory
	 * since it would be useless.
	 */
	new_nid = early_pfn_to_nid(ret_paddr >> PAGE_SHIFT);
	if (new_nid < nid) {
		ret = __alloc_bootmem_node(NODE_DATA(new_nid),
				size, align, 0);

		dbg("alloc_bootmem %p %lx\n", ret, size);
	}

	memset(ret, 0, size);
	return ret;
}

static struct notifier_block __cpuinitdata ppc64_numa_nb = {
	.notifier_call = cpu_numa_callback,
	.priority = 1 /* Must run before sched domains notifier. */
};

static void mark_reserved_regions_for_nid(int nid)
{
	struct pglist_data *node = NODE_DATA(nid);
	struct memblock_region *reg;

	for_each_memblock(reserved, reg) {
		unsigned long physbase = reg->base;
		unsigned long size = reg->size;
		unsigned long start_pfn = physbase >> PAGE_SHIFT;
		unsigned long end_pfn = PFN_UP(physbase + size);
		struct node_active_region node_ar;
		unsigned long node_end_pfn = node->node_start_pfn +
					     node->node_spanned_pages;

		/*
		 * Check to make sure that this memblock.reserved area is
		 * within the bounds of the node that we care about.
		 * Checking the nid of the start and end points is not
		 * sufficient because the reserved area could span the
		 * entire node.
		 */
		if (end_pfn <= node->node_start_pfn ||
		    start_pfn >= node_end_pfn)
			continue;

		get_node_active_region(start_pfn, &node_ar);
		while (start_pfn < end_pfn &&
			node_ar.start_pfn < node_ar.end_pfn) {
			unsigned long reserve_size = size;
			/*
			 * if reserved region extends past active region
			 * then trim size to active region
			 */
			if (end_pfn > node_ar.end_pfn)
				reserve_size = (node_ar.end_pfn << PAGE_SHIFT)
					- physbase;
			/*
			 * Only worry about *this* node, others may not
			 * yet have valid NODE_DATA().
			 */
			if (node_ar.nid == nid) {
				dbg("reserve_bootmem %lx %lx nid=%d\n",
					physbase, reserve_size, node_ar.nid);
				reserve_bootmem_node(NODE_DATA(node_ar.nid),
						physbase, reserve_size,
						BOOTMEM_DEFAULT);
			}
			/*
			 * if reserved region is contained in the active region
			 * then done.
			 */
			if (end_pfn <= node_ar.end_pfn)
				break;

			/*
			 * reserved region extends past the active region
			 *   get next active region that contains this
			 *   reserved region
			 */
			start_pfn = node_ar.end_pfn;
			physbase = start_pfn << PAGE_SHIFT;
			size = size - reserve_size;
			get_node_active_region(start_pfn, &node_ar);
		}
	}
}

void __init do_init_bootmem(void)
{
	int nid;

	min_low_pfn = 0;
	max_low_pfn = memblock_end_of_DRAM() >> PAGE_SHIFT;
	max_pfn = max_low_pfn;

	if (parse_numa_properties())
		setup_nonnuma();
	else
		dump_numa_memory_topology();

	for_each_online_node(nid) {
		unsigned long start_pfn, end_pfn;
		void *bootmem_vaddr;
		unsigned long bootmap_pages;

		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);

		/*
		 * Allocate the node structure node local if possible
		 *
		 * Be careful moving this around, as it relies on all
		 * previous nodes' bootmem to be initialized and have
		 * all reserved areas marked.
		 */
		NODE_DATA(nid) = careful_zallocation(nid,
					sizeof(struct pglist_data),
					SMP_CACHE_BYTES, end_pfn);

		dbg("node %d\n", nid);
		dbg("NODE_DATA() = %p\n", NODE_DATA(nid));

		NODE_DATA(nid)->bdata = &bootmem_node_data[nid];
		NODE_DATA(nid)->node_start_pfn = start_pfn;
		NODE_DATA(nid)->node_spanned_pages = end_pfn - start_pfn;

		if (NODE_DATA(nid)->node_spanned_pages == 0)
			continue;

		dbg("start_paddr = %lx\n", start_pfn << PAGE_SHIFT);
		dbg("end_paddr = %lx\n", end_pfn << PAGE_SHIFT);

		bootmap_pages = bootmem_bootmap_pages(end_pfn - start_pfn);
		bootmem_vaddr = careful_zallocation(nid,
					bootmap_pages << PAGE_SHIFT,
					PAGE_SIZE, end_pfn);

		dbg("bootmap_vaddr = %p\n", bootmem_vaddr);

		init_bootmem_node(NODE_DATA(nid),
				  __pa(bootmem_vaddr) >> PAGE_SHIFT,
				  start_pfn, end_pfn);

		free_bootmem_with_active_regions(nid, end_pfn);
		/*
		 * Be very careful about moving this around.  Future
		 * calls to careful_zallocation() depend on this getting
		 * done correctly.
		 */
		mark_reserved_regions_for_nid(nid);
		sparse_memory_present_with_active_regions(nid);
	}

	init_bootmem_done = 1;

	/*
	 * Now bootmem is initialised we can create the node to cpumask
	 * lookup tables and setup the cpu callback to populate them.
	 */
	setup_node_to_cpumask_map();

	register_cpu_notifier(&ppc64_numa_nb);
	cpu_numa_callback(&ppc64_numa_nb, CPU_UP_PREPARE,
			  (void *)(unsigned long)boot_cpuid);
}

void __init paging_init(void)
{
	unsigned long max_zone_pfns[MAX_NR_ZONES];
	memset(max_zone_pfns, 0, sizeof(max_zone_pfns));
	max_zone_pfns[ZONE_DMA] = memblock_end_of_DRAM() >> PAGE_SHIFT;
	free_area_init_nodes(max_zone_pfns);
}

static int __init early_numa(char *p)
{
	if (!p)
		return 0;

	if (strstr(p, "off"))
		numa_enabled = 0;

	if (strstr(p, "debug"))
		numa_debug = 1;

	p = strstr(p, "fake=");
	if (p)
		cmdline = p + strlen("fake=");

	return 0;
}
early_param("numa", early_numa);
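
/*
 * Boot-time usage (illustrative values): "numa=off" disables the NUMA
 * parsing above, "numa=debug" turns on the dbg() messages, and
 * "numa=fake=1G,2G,4G" hands the comma-separated boundary list to
 * fake_numa_create_new_node().  The options can be combined, e.g.
 * "numa=debug,fake=1G".
 */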

#ifdef CONFIG_MEMORY_HOTPLUG
/*
 * Find the node associated with a hot added memory section for
 * memory represented in the device tree by the property
 * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory.
 */
static int hot_add_drconf_scn_to_nid(struct device_node *memory,
				     unsigned long scn_addr)
{
	const u32 *dm;
	unsigned int drconf_cell_cnt, rc;
	unsigned long lmb_size;
	struct assoc_arrays aa;
	int nid = -1;

	drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
	if (!drconf_cell_cnt)
		return -1;

	lmb_size = of_get_lmb_size(memory);
	if (!lmb_size)
		return -1;

	rc = of_get_assoc_arrays(memory, &aa);
	if (rc)
		return -1;

	for (; drconf_cell_cnt != 0; --drconf_cell_cnt) {
		struct of_drconf_cell drmem;

		read_drconf_cell(&drmem, &dm);

		/* skip this block if it is reserved or not assigned to
		 * this partition */
		if ((drmem.flags & DRCONF_MEM_RESERVED)
		    || !(drmem.flags & DRCONF_MEM_ASSIGNED))
			continue;

		if ((scn_addr < drmem.base_addr)
		    || (scn_addr >= (drmem.base_addr + lmb_size)))
			continue;

		nid = of_drconf_to_nid_single(&drmem, &aa);
		break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section for memory
 * represented in the device tree as a node (i.e. memory@XXXX) for
 * each memblock.
 */
int hot_add_node_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid = -1;

	while ((memory = of_find_node_by_type(memory, "memory")) != NULL) {
		unsigned long start, size;
		int ranges;
		const unsigned int *memcell_buf;
		unsigned int len;

		memcell_buf = of_get_property(memory, "reg", &len);
		if (!memcell_buf || len <= 0)
			continue;

		/* ranges in cell */
		ranges = (len >> 2) / (n_mem_addr_cells + n_mem_size_cells);

		while (ranges--) {
			start = read_n_cells(n_mem_addr_cells, &memcell_buf);
			size = read_n_cells(n_mem_size_cells, &memcell_buf);

			if ((scn_addr < start) || (scn_addr >= (start + size)))
				continue;

			nid = of_node_to_nid_single(memory);
			break;
		}

		of_node_put(memory);
		if (nid >= 0)
			break;
	}

	return nid;
}

/*
 * Find the node associated with a hot added memory section.  Section
 * corresponds to a SPARSEMEM section, not a MEMBLOCK.  It is assumed that
 * sections are fully contained within a single MEMBLOCK.
 */
int hot_add_scn_to_nid(unsigned long scn_addr)
{
	struct device_node *memory = NULL;
	int nid, found = 0;

	if (!numa_enabled || (min_common_depth < 0))
		return first_online_node;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		nid = hot_add_drconf_scn_to_nid(memory, scn_addr);
		of_node_put(memory);
	} else {
		nid = hot_add_node_scn_to_nid(scn_addr);
	}

	if (nid < 0 || !node_online(nid))
		nid = first_online_node;

	if (NODE_DATA(nid)->node_spanned_pages)
		return nid;

	for_each_online_node(nid) {
		if (NODE_DATA(nid)->node_spanned_pages) {
			found = 1;
			break;
		}
	}

	BUG_ON(!found);
	return nid;
}

static u64 hot_add_drconf_memory_max(void)
{
	struct device_node *memory = NULL;
	unsigned int drconf_cell_cnt = 0;
	u64 lmb_size = 0;
	const u32 *dm = 0;

	memory = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
	if (memory) {
		drconf_cell_cnt = of_get_drconf_memory(memory, &dm);
		lmb_size = of_get_lmb_size(memory);
		of_node_put(memory);
	}
	return lmb_size * drconf_cell_cnt;
}

/*
 * memory_hotplug_max - return max address of memory that may be added
 *
 * This is currently only used on systems that support drconfig memory
 * hotplug.
 */
u64 memory_hotplug_max(void)
{
	return max(hot_add_drconf_memory_max(), memblock_end_of_DRAM());
}
#endif /* CONFIG_MEMORY_HOTPLUG */

/* Virtual Processor Home Node (VPHN) support */
#ifdef CONFIG_PPC_SPLPAR
static u8 vphn_cpu_change_counts[NR_CPUS][MAX_DISTANCE_REF_POINTS];
static cpumask_t cpu_associativity_changes_mask;
static int vphn_enabled;
static void set_topology_timer(void);

/*
 * Store the current values of the associativity change counters in the
 * hypervisor.
 */
static void setup_cpu_associativity_change_counters(void)
{
	int cpu;

	/* The VPHN feature supports a maximum of 8 reference points */
	BUILD_BUG_ON(MAX_DISTANCE_REF_POINTS > 8);

	for_each_possible_cpu(cpu) {
		int i;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++)
			counts[i] = hypervisor_counts[i];
	}
}

/*
 * The hypervisor maintains a set of 8 associativity change counters in
 * the VPA of each cpu that correspond to the associativity levels in the
 * ibm,associativity-reference-points property.  When an associativity
 * level changes, the corresponding counter is incremented.
 *
 * Set a bit in cpu_associativity_changes_mask for each cpu whose home
 * node associativity levels have changed.
 *
 * Returns the number of cpus with unhandled associativity changes.
 */
static int update_cpu_associativity_changes_mask(void)
{
	int cpu, nr_cpus = 0;
	cpumask_t *changes = &cpu_associativity_changes_mask;

	cpumask_clear(changes);

	for_each_possible_cpu(cpu) {
		int i, changed = 0;
		u8 *counts = vphn_cpu_change_counts[cpu];
		volatile u8 *hypervisor_counts = lppaca[cpu].vphn_assoc_counts;

		for (i = 0; i < distance_ref_points_depth; i++) {
			if (hypervisor_counts[i] != counts[i]) {
				counts[i] = hypervisor_counts[i];
				changed = 1;
			}
		}
		if (changed) {
			cpumask_set_cpu(cpu, changes);
			nr_cpus++;
		}
	}

	return nr_cpus;
}

/*
 * 6 64-bit registers unpacked into 12 32-bit associativity values.  To form
 * the complete property we have to add the length in the first cell.
 */
#define VPHN_ASSOC_BUFSIZE (6*sizeof(u64)/sizeof(u32) + 1)

/*
 * Convert the associativity domain numbers returned from the hypervisor
 * to the sequence they would appear in the ibm,associativity property.
 */
static int vphn_unpack_associativity(const long *packed, unsigned int *unpacked)
{
	int i, nr_assoc_doms = 0;
	const u16 *field = (const u16 *) packed;

#define VPHN_FIELD_UNUSED	(0xffff)
#define VPHN_FIELD_MSB		(0x8000)
#define VPHN_FIELD_MASK		(~VPHN_FIELD_MSB)

	for (i = 1; i < VPHN_ASSOC_BUFSIZE; i++) {
		if (*field == VPHN_FIELD_UNUSED) {
			/* All significant fields processed, and remaining
			 * fields contain the reserved value of all 1's.
			 * Just store them.
			 */
			unpacked[i] = *((u32 *)field);
			field += 2;
		} else if (*field & VPHN_FIELD_MSB) {
			/* Data is in the lower 15 bits of this field */
			unpacked[i] = *field & VPHN_FIELD_MASK;
			field++;
			nr_assoc_doms++;
		} else {
			/* Data is in the lower 15 bits of this field
			 * concatenated with the next 16 bit field
			 */
			unpacked[i] = *((u32 *)field);
			field += 2;
			nr_assoc_doms++;
		}
	}

	/* The first cell contains the length of the property */
	unpacked[0] = nr_assoc_doms;

	return nr_assoc_doms;
}
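
/*
 * Example (illustrative field values, big-endian layout): a 16-bit field
 * of 0x8002 has the MSB set and unpacks to the single domain number 2,
 * while fields 0x0000 0x0005 have the MSB clear and are read together as
 * the 32-bit domain number 5; trailing 0xffff fields are stored as-is as
 * padding.
 */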

/*
 * Retrieve the new associativity information for a virtual processor's
 * home node.
 */
static long hcall_vphn(unsigned long cpu, unsigned int *associativity)
{
	long rc;
	long retbuf[PLPAR_HCALL9_BUFSIZE] = {0};
	u64 flags = 1;
	int hwcpu = get_hard_smp_processor_id(cpu);

	rc = plpar_hcall9(H_HOME_NODE_ASSOCIATIVITY, retbuf, flags, hwcpu);
	vphn_unpack_associativity(retbuf, associativity);

	return rc;
}

static long vphn_get_associativity(unsigned long cpu,
					unsigned int *associativity)
{
	long rc;

	rc = hcall_vphn(cpu, associativity);

	switch (rc) {
	case H_FUNCTION:
		printk(KERN_INFO
			"VPHN is not supported. Disabling polling...\n");
		stop_topology_update();
		break;
	case H_HARDWARE:
		printk(KERN_ERR
			"hcall_vphn() experienced a hardware fault "
			"preventing VPHN. Disabling polling...\n");
		stop_topology_update();
	}

	return rc;
}

/*
 * Update the node maps and sysfs entries for each cpu whose home node
 * has changed.
 */
int arch_update_cpu_topology(void)
{
	int cpu, nid, old_nid;
	unsigned int associativity[VPHN_ASSOC_BUFSIZE] = {0};
	struct sys_device *sysdev;

	for_each_cpu(cpu, &cpu_associativity_changes_mask) {
		vphn_get_associativity(cpu, associativity);
		nid = associativity_to_nid(associativity);

		if (nid < 0 || !node_online(nid))
			nid = first_online_node;

		old_nid = numa_cpu_lookup_table[cpu];

		/* Disable hotplug while we update the cpu
		 * masks and sysfs.
		 */
		get_online_cpus();
		unregister_cpu_under_node(cpu, old_nid);
		unmap_cpu_from_node(cpu);
		map_cpu_to_node(cpu, nid);
		register_cpu_under_node(cpu, nid);
		put_online_cpus();

		sysdev = get_cpu_sysdev(cpu);
		if (sysdev)
			kobject_uevent(&sysdev->kobj, KOBJ_CHANGE);
	}

	return 1;
}

static void topology_work_fn(struct work_struct *work)
{
	rebuild_sched_domains();
}
static DECLARE_WORK(topology_work, topology_work_fn);

void topology_schedule_update(void)
{
	schedule_work(&topology_work);
}

static void topology_timer_fn(unsigned long ignored)
{
	if (!vphn_enabled)
		return;
	if (update_cpu_associativity_changes_mask() > 0)
		topology_schedule_update();
	set_topology_timer();
}
static struct timer_list topology_timer =
	TIMER_INITIALIZER(topology_timer_fn, 0, 0);

static void set_topology_timer(void)
{
	topology_timer.data = 0;
	topology_timer.expires = jiffies + 60 * HZ;
	add_timer(&topology_timer);
}

/*
 * Start polling for VPHN associativity changes.
 */
int start_topology_update(void)
{
	int rc = 0;

	/* Disabled until races with load balancing are fixed */
	if (0 && firmware_has_feature(FW_FEATURE_VPHN) &&
	    get_lppaca()->shared_proc) {
		vphn_enabled = 1;
		setup_cpu_associativity_change_counters();
		init_timer_deferrable(&topology_timer);
		set_topology_timer();
		rc = 1;
	}

	return rc;
}
__initcall(start_topology_update);

/*
 * Disable polling for VPHN associativity changes.
 */
int stop_topology_update(void)
{
	vphn_enabled = 0;
	return del_timer_sync(&topology_timer);
}
#endif /* CONFIG_PPC_SPLPAR */