// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>
#include <linux/numa_memblks.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/amd/nb.h>

#include "mm_internal.h"

int numa_off;

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);
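
/*
 * Boot options handled above (as parsed by the strncmp() checks):
 *
 *   numa=off     - disable NUMA and fall through to dummy_numa_init()
 *   numa=fake=8  - emulate 8 NUMA nodes (needs CONFIG_NUMA_EMU)
 *   numa=noacpi  - ignore the ACPI SRAT table
 *   numa=nohmat  - ignore the ACPI HMAT table
 */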

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}
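
/*
 * The lookup above relies on the firmware parsers invoked from
 * numa_init() (e.g. the ACPI SRAT parser) having filled
 * __apicid_to_node[] via set_apicid_to_node(); numa_cpu_node() then
 * resolves cpu -> local APIC ID -> node.
 */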

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}
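
/*
 * Two storage phases: until the per-cpu areas exist, numa_set_node()
 * writes the early_per_cpu table and returns; afterwards it updates the
 * real per_cpu() slot and mirrors the value into the generic map via
 * set_cpu_numa_node().
 */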

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

static int __init numa_register_nodes(void)
{
	int nid;

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Note, get_pfn_range_for_nid() depends on
		 * memblock_set_node() having already happened
		 */
		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (start_pfn >= end_pfn)
			continue;

		alloc_node_data(nid);
		node_set_online(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}
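
/*
 * Example of the round robin above: with nodes 0 and 1 online and CPUs
 * 4-7 still at NUMA_NO_NODE, they are assigned cpu4->node0, cpu5->node1,
 * cpu6->node0, cpu7->node1.
 */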

static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
	if (ret < 0)
		return ret;

	ret = numa_register_nodes();
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}
195
196
/**
197
* dummy_numa_init - Fallback dummy NUMA init
198
*
199
* Used if there's no underlying NUMA architecture, NUMA initialization
200
* fails, or NUMA is disabled on the command line.
201
*
202
* Must online at least one node and add memory blocks that cover all
203
* allowed memory. This function must not fail.
204
*/
205
static int __init dummy_numa_init(void)
206
{
207
printk(KERN_INFO "%s\n",
208
numa_off ? "NUMA turned off" : "No NUMA configuration found");
209
printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
210
0LLU, PFN_PHYS(max_pfn) - 1);
211
212
node_set(0, numa_nodes_parsed);
213
numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
214
215
return 0;
216
}
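
/*
 * Sample dmesg (illustrative, assuming PFN_PHYS(max_pfn) is exactly
 * 4 GiB, i.e. 0x100000000):
 *
 *   No NUMA configuration found
 *   Faking a node at [mem 0x0000000000000000-0x00000000ffffffff]
 */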

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is dummy single node config encompassing whole memory and
 * never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}
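
/*
 * Effective probe order: ACPI SRAT (x86_acpi_numa_init), AMD northbridge
 * topology (amd_numa_init), devicetree (of_numa_init, only when ACPI is
 * disabled), and finally the never-failing single node fallback.
 */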

/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing either memory and/or CPUs
 * will already be online and there is no need to do anything extra, even if
 * they also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 * bringup_nonboot_cpus
	 *  cpu_up
	 *   __try_online_node
	 *    register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and the fake node case (when running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner at numa_init_array(),
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are set up.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 * bringup_nonboot_cpus
		 *  cpu_up
		 *   __try_online_node
		 *    register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(unsigned int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(unsigned int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif /* !CONFIG_NUMA_EMU */

#else /* !CONFIG_DEBUG_PER_CPU_MAPS */

int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are set up.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
		       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
	       enable ? "numa_add_cpu" : "numa_remove_cpu",
	       cpu, node, cpumask_pr_args(mask));
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif /* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
		       node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
		       node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);
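
/*
 * Typical usage (illustrative sketch; do_per_cpu_work() is a stand-in,
 * not a real kernel helper):
 *
 *	int cpu;
 *
 *	for_each_cpu(cpu, cpumask_of_node(nid))
 *		do_per_cpu_work(cpu);
 */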

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_EMU
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
					unsigned int nr_emu_nids)
{
	int i, j;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid. The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < nr_emu_nids; j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < nr_emu_nids ? j : 0;
	}
}
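
/*
 * Worked example: assume two physical nodes split into four emulated
 * ones with emu_nid_to_phys = {0, 0, 1, 1}. An APIC ID that mapped to
 * physical node 1 is rewritten to emulated node 2, the first j for which
 * emu_nid_to_phys[j] == 1.
 */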

u64 __init numa_emu_dma_end(void)
{
	return PFN_PHYS(MAX_DMA32_PFN);
}
#endif /* CONFIG_NUMA_EMU */