// SPDX-License-Identifier: GPL-2.0-only
/* Common code for 32 and 64-bit NUMA */
#include <linux/acpi.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/of.h>
#include <linux/string.h>
#include <linux/init.h>
#include <linux/memblock.h>
#include <linux/mmzone.h>
#include <linux/ctype.h>
#include <linux/nodemask.h>
#include <linux/sched.h>
#include <linux/topology.h>
#include <linux/sort.h>
#include <linux/numa_memblks.h>

#include <asm/e820/api.h>
#include <asm/proto.h>
#include <asm/dma.h>
#include <asm/numa.h>
#include <asm/amd/nb.h>

#include "mm_internal.h"

int numa_off;

static __init int numa_setup(char *opt)
{
	if (!opt)
		return -EINVAL;
	if (!strncmp(opt, "off", 3))
		numa_off = 1;
	if (!strncmp(opt, "fake=", 5))
		return numa_emu_cmdline(opt + 5);
	if (!strncmp(opt, "noacpi", 6))
		disable_srat();
	if (!strncmp(opt, "nohmat", 6))
		disable_hmat();
	return 0;
}
early_param("numa", numa_setup);
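
/*
 * Boot options handled above (as parsed by the strncmp() checks):
 *
 *   numa=off     - disable NUMA and fall through to dummy_numa_init()
 *   numa=fake=8  - emulate 8 NUMA nodes (needs CONFIG_NUMA_EMU)
 *   numa=noacpi  - ignore the ACPI SRAT table
 *   numa=nohmat  - ignore the ACPI HMAT table
 */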

/*
 * apicid, cpu, node mappings
 */
s16 __apicid_to_node[MAX_LOCAL_APIC] = {
	[0 ... MAX_LOCAL_APIC-1] = NUMA_NO_NODE
};

int numa_cpu_node(int cpu)
{
	u32 apicid = early_per_cpu(x86_cpu_to_apicid, cpu);

	if (apicid != BAD_APICID)
		return __apicid_to_node[apicid];
	return NUMA_NO_NODE;
}
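
/*
 * The lookup above relies on the firmware parsers invoked from
 * numa_init() (e.g. the ACPI SRAT parser) having filled
 * __apicid_to_node[] via set_apicid_to_node(); numa_cpu_node() then
 * resolves cpu -> local APIC ID -> node.
 */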

cpumask_var_t node_to_cpumask_map[MAX_NUMNODES];
EXPORT_SYMBOL(node_to_cpumask_map);

/*
 * Map cpu index to node index
 */
DEFINE_EARLY_PER_CPU(int, x86_cpu_to_node_map, NUMA_NO_NODE);
EXPORT_EARLY_PER_CPU_SYMBOL(x86_cpu_to_node_map);

void numa_set_node(int cpu, int node)
{
	int *cpu_to_node_map = early_per_cpu_ptr(x86_cpu_to_node_map);

	/* early setting, no percpu area yet */
	if (cpu_to_node_map) {
		cpu_to_node_map[cpu] = node;
		return;
	}

#ifdef CONFIG_DEBUG_PER_CPU_MAPS
	if (cpu >= nr_cpu_ids || !cpu_possible(cpu)) {
		printk(KERN_ERR "numa_set_node: invalid cpu# (%d)\n", cpu);
		dump_stack();
		return;
	}
#endif
	per_cpu(x86_cpu_to_node_map, cpu) = node;

	set_cpu_numa_node(cpu, node);
}

void numa_clear_node(int cpu)
{
	numa_set_node(cpu, NUMA_NO_NODE);
}
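
/*
 * Two storage phases: until the per-cpu areas exist, numa_set_node()
 * writes the early_per_cpu table and returns; afterwards it updates the
 * real per_cpu() slot and mirrors the value into the generic map via
 * set_cpu_numa_node().
 */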

/*
 * Allocate node_to_cpumask_map based on number of available nodes
 * Requires node_possible_map to be valid.
 *
 * Note: cpumask_of_node() is not valid until after this is done.
 * (Use CONFIG_DEBUG_PER_CPU_MAPS to check this.)
 */
void __init setup_node_to_cpumask_map(void)
{
	unsigned int node;

	/* setup nr_node_ids if not done yet */
	if (nr_node_ids == MAX_NUMNODES)
		setup_nr_node_ids();

	/* allocate the map */
	for (node = 0; node < nr_node_ids; node++)
		alloc_bootmem_cpumask_var(&node_to_cpumask_map[node]);

	/* cpumask_of_node() will now work */
	pr_debug("Node to cpumask map for %u nodes\n", nr_node_ids);
}

static int __init numa_register_nodes(void)
{
	int nid;

	if (!memblock_validate_numa_coverage(SZ_1M))
		return -EINVAL;

	/* Finally register nodes. */
	for_each_node_mask(nid, node_possible_map) {
		unsigned long start_pfn, end_pfn;

		/*
		 * Note, get_pfn_range_for_nid() depends on
		 * memblock_set_node() having already happened
		 */
		get_pfn_range_for_nid(nid, &start_pfn, &end_pfn);
		if (start_pfn >= end_pfn)
			continue;

		alloc_node_data(nid);
		node_set_online(nid);
	}

	/* Dump memblock with node info and return. */
	memblock_dump_all();
	return 0;
}

/*
 * There are unfortunately some poorly designed mainboards around that
 * only connect memory to a single CPU. This breaks the 1:1 cpu->node
 * mapping. To avoid this fill in the mapping for all possible CPUs,
 * as the number of CPUs is not known yet. We round robin the existing
 * nodes.
 */
static void __init numa_init_array(void)
{
	int rr, i;

	rr = first_node(node_online_map);
	for (i = 0; i < nr_cpu_ids; i++) {
		if (early_cpu_to_node(i) != NUMA_NO_NODE)
			continue;
		numa_set_node(i, rr);
		rr = next_node_in(rr, node_online_map);
	}
}
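
/*
 * Example of the round robin above: with nodes 0 and 1 online and CPUs
 * 4-7 still at NUMA_NO_NODE, they are assigned cpu4->node0, cpu5->node1,
 * cpu6->node0, cpu7->node1.
 */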

static int __init numa_init(int (*init_func)(void))
{
	int i;
	int ret;

	for (i = 0; i < MAX_LOCAL_APIC; i++)
		set_apicid_to_node(i, NUMA_NO_NODE);

	ret = numa_memblks_init(init_func, /* memblock_force_top_down */ true);
	if (ret < 0)
		return ret;

	ret = numa_register_nodes();
	if (ret < 0)
		return ret;

	for (i = 0; i < nr_cpu_ids; i++) {
		int nid = early_cpu_to_node(i);

		if (nid == NUMA_NO_NODE)
			continue;
		if (!node_online(nid))
			numa_clear_node(i);
	}
	numa_init_array();

	return 0;
}
195
196
/**
197
* dummy_numa_init - Fallback dummy NUMA init
198
*
199
* Used if there's no underlying NUMA architecture, NUMA initialization
200
* fails, or NUMA is disabled on the command line.
201
*
202
* Must online at least one node and add memory blocks that cover all
203
* allowed memory. This function must not fail.
204
*/
205
static int __init dummy_numa_init(void)
206
{
207
printk(KERN_INFO "%s\n",
208
numa_off ? "NUMA turned off" : "No NUMA configuration found");
209
printk(KERN_INFO "Faking a node at [mem %#018Lx-%#018Lx]\n",
210
0LLU, PFN_PHYS(max_pfn) - 1);
211
212
node_set(0, numa_nodes_parsed);
213
numa_add_memblk(0, 0, PFN_PHYS(max_pfn));
214
215
return 0;
216
}
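
/*
 * Sample dmesg (illustrative, assuming PFN_PHYS(max_pfn) is exactly
 * 4 GiB, i.e. 0x100000000):
 *
 *   No NUMA configuration found
 *   Faking a node at [mem 0x0000000000000000-0x00000000ffffffff]
 */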

/**
 * x86_numa_init - Initialize NUMA
 *
 * Try each configured NUMA initialization method until one succeeds. The
 * last fallback is dummy single node config encompassing whole memory and
 * never fails.
 */
void __init x86_numa_init(void)
{
	if (!numa_off) {
#ifdef CONFIG_ACPI_NUMA
		if (!numa_init(x86_acpi_numa_init))
			return;
#endif
#ifdef CONFIG_AMD_NUMA
		if (!numa_init(amd_numa_init))
			return;
#endif
		if (acpi_disabled && !numa_init(of_numa_init))
			return;
	}

	numa_init(dummy_numa_init);
}
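
/*
 * Effective probe order: ACPI SRAT (x86_acpi_numa_init), AMD northbridge
 * topology (amd_numa_init), devicetree (of_numa_init, only when ACPI is
 * disabled), and finally the never-failing single node fallback.
 */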

/*
 * A node may exist which has one or more Generic Initiators but no CPUs and no
 * memory.
 *
 * This function must be called after init_cpu_to_node(), to ensure that any
 * memoryless CPU nodes have already been brought online, and before the
 * node_data[nid] is needed for zone list setup in build_all_zonelists().
 *
 * When this function is called, any nodes containing either memory and/or CPUs
 * will already be online and there is no need to do anything extra, even if
 * they also contain one or more Generic Initiators.
 */
void __init init_gi_nodes(void)
{
	int nid;

	/*
	 * Exclude this node from
	 * bringup_nonboot_cpus
	 *  cpu_up
	 *   __try_online_node
	 *    register_one_node
	 * because node_subsys is not initialized yet.
	 * TODO remove dependency on node_online
	 */
	for_each_node_state(nid, N_GENERIC_INITIATOR)
		if (!node_online(nid))
			node_set_online(nid);
}

/*
 * Setup early cpu_to_node.
 *
 * Populate cpu_to_node[] only if the x86_cpu_to_apicid[] and
 * apicid_to_node[] tables have valid entries for a CPU.
 * This means we skip cpu_to_node[] initialisation for NUMA
 * emulation and the fake node case (when running a kernel compiled
 * for NUMA on a non-NUMA box), which is OK as cpu_to_node[]
 * is already initialized in a round robin manner at numa_init_array(),
 * prior to this call, and this initialization is good enough
 * for the fake NUMA cases.
 *
 * Called before the per_cpu areas are set up.
 */
void __init init_cpu_to_node(void)
{
	int cpu;
	u32 *cpu_to_apicid = early_per_cpu_ptr(x86_cpu_to_apicid);

	BUG_ON(cpu_to_apicid == NULL);

	for_each_possible_cpu(cpu) {
		int node = numa_cpu_node(cpu);

		if (node == NUMA_NO_NODE)
			continue;

		/*
		 * Exclude this node from
		 * bringup_nonboot_cpus
		 *  cpu_up
		 *   __try_online_node
		 *    register_one_node
		 * because node_subsys is not initialized yet.
		 * TODO remove dependency on node_online
		 */
		if (!node_online(node))
			node_set_online(node);

		numa_set_node(cpu, node);
	}
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS

# ifndef CONFIG_NUMA_EMU
void numa_add_cpu(unsigned int cpu)
{
	cpumask_set_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}

void numa_remove_cpu(unsigned int cpu)
{
	cpumask_clear_cpu(cpu, node_to_cpumask_map[early_cpu_to_node(cpu)]);
}
# endif /* !CONFIG_NUMA_EMU */

#else /* !CONFIG_DEBUG_PER_CPU_MAPS */

int __cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map)) {
		printk(KERN_WARNING
		       "cpu_to_node(%d): usage too early!\n", cpu);
		dump_stack();
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}
EXPORT_SYMBOL(__cpu_to_node);

/*
 * Same function as cpu_to_node() but used if called before the
 * per_cpu areas are set up.
 */
int early_cpu_to_node(int cpu)
{
	if (early_per_cpu_ptr(x86_cpu_to_node_map))
		return early_per_cpu_ptr(x86_cpu_to_node_map)[cpu];

	if (!cpu_possible(cpu)) {
		printk(KERN_WARNING
		       "early_cpu_to_node(%d): no per_cpu area!\n", cpu);
		dump_stack();
		return NUMA_NO_NODE;
	}
	return per_cpu(x86_cpu_to_node_map, cpu);
}

void debug_cpumask_set_cpu(unsigned int cpu, int node, bool enable)
{
	struct cpumask *mask;

	if (node == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}
	mask = node_to_cpumask_map[node];
	if (!cpumask_available(mask)) {
		pr_err("node_to_cpumask_map[%i] NULL\n", node);
		dump_stack();
		return;
	}

	if (enable)
		cpumask_set_cpu(cpu, mask);
	else
		cpumask_clear_cpu(cpu, mask);

	printk(KERN_DEBUG "%s cpu %d node %d: mask now %*pbl\n",
	       enable ? "numa_add_cpu" : "numa_remove_cpu",
	       cpu, node, cpumask_pr_args(mask));
}

# ifndef CONFIG_NUMA_EMU
static void numa_set_cpumask(int cpu, bool enable)
{
	debug_cpumask_set_cpu(cpu, early_cpu_to_node(cpu), enable);
}

void numa_add_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, true);
}

void numa_remove_cpu(unsigned int cpu)
{
	numa_set_cpumask(cpu, false);
}
# endif /* !CONFIG_NUMA_EMU */

/*
 * Returns a pointer to the bitmask of CPUs on Node 'node'.
 */
const struct cpumask *cpumask_of_node(int node)
{
	if ((unsigned)node >= nr_node_ids) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): (unsigned)node >= nr_node_ids(%u)\n",
		       node, nr_node_ids);
		dump_stack();
		return cpu_none_mask;
	}
	if (!cpumask_available(node_to_cpumask_map[node])) {
		printk(KERN_WARNING
		       "cpumask_of_node(%d): no node_to_cpumask_map!\n",
		       node);
		dump_stack();
		return cpu_online_mask;
	}
	return node_to_cpumask_map[node];
}
EXPORT_SYMBOL(cpumask_of_node);
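
/*
 * Typical usage (illustrative sketch; do_per_cpu_work() is a stand-in,
 * not a real kernel helper):
 *
 *	int cpu;
 *
 *	for_each_cpu(cpu, cpumask_of_node(nid))
 *		do_per_cpu_work(cpu);
 */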

#endif /* !CONFIG_DEBUG_PER_CPU_MAPS */

#ifdef CONFIG_NUMA_EMU
void __init numa_emu_update_cpu_to_node(int *emu_nid_to_phys,
					unsigned int nr_emu_nids)
{
	int i, j;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid. The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < nr_emu_nids; j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < nr_emu_nids ? j : 0;
	}
}
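
/*
 * Worked example: assume two physical nodes split into four emulated
 * ones with emu_nid_to_phys = {0, 0, 1, 1}. An APIC ID that mapped to
 * physical node 1 is rewritten to emulated node 2, the first j for which
 * emu_nid_to_phys[j] == 1.
 */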

u64 __init numa_emu_dma_end(void)
{
	return PFN_PHYS(MAX_DMA32_PFN);
}
#endif /* CONFIG_NUMA_EMU */