GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/mm/numa_emulation.c

/*
 * NUMA emulation
 */
#include <linux/kernel.h>
#include <linux/errno.h>
#include <linux/topology.h>
#include <linux/memblock.h>
#include <linux/bootmem.h>
#include <asm/dma.h>

#include "numa_internal.h"

static int emu_nid_to_phys[MAX_NUMNODES] __cpuinitdata;
static char *emu_cmdline __initdata;

void __init numa_emu_cmdline(char *str)
{
	emu_cmdline = str;
}
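
/*
 * Illustrative examples (derived from the command-line parsing in
 * numa_emulation() below): the string is either a plain node count or a
 * per-node size carrying an 'M' or 'G' suffix, e.g.
 *
 *	numa=fake=8	split system RAM into 8 interleaved fake nodes
 *	numa=fake=512M	carve fake nodes of 512M each until RAM runs out
 */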

static int __init emu_find_memblk_by_nid(int nid, const struct numa_meminfo *mi)
{
	int i;

	for (i = 0; i < mi->nr_blks; i++)
		if (mi->blk[i].nid == nid)
			return i;
	return -ENOENT;
}

/*
 * Carves a fake node of @size bytes for @nid out of physical block
 * @phys_blk of @pi and appends it to @ei.  The return value is -errno if
 * something went wrong, 0 otherwise.
 */
static int __init emu_setup_memblk(struct numa_meminfo *ei,
				   struct numa_meminfo *pi,
				   int nid, int phys_blk, u64 size)
{
	struct numa_memblk *eb = &ei->blk[ei->nr_blks];
	struct numa_memblk *pb = &pi->blk[phys_blk];

	if (ei->nr_blks >= NR_NODE_MEMBLKS) {
		pr_err("NUMA: Too many emulated memblks, failing emulation\n");
		return -EINVAL;
	}

	ei->nr_blks++;
	eb->start = pb->start;
	eb->end = pb->start + size;
	eb->nid = nid;

	if (emu_nid_to_phys[nid] == NUMA_NO_NODE)
		emu_nid_to_phys[nid] = pb->nid;

	pb->start += size;
	if (pb->start >= pb->end) {
		WARN_ON_ONCE(pb->start > pb->end);
		numa_remove_memblk_from(phys_blk, pi);
	}

	printk(KERN_INFO "Faking node %d at %016Lx-%016Lx (%LuMB)\n", nid,
	       eb->start, eb->end, (eb->end - eb->start) >> 20);
	return 0;
}
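
/*
 * Illustrative sketch of the carving above (hypothetical numbers): a
 * physical block [0, 4G) asked for a 1G fake node shrinks to [1G, 4G),
 * with the new emulated block [0, 1G) recorded in @ei; once a physical
 * block is fully consumed it is dropped from @pi.
 */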

/*
 * Sets up nr_nodes fake nodes interleaved over physical nodes ranging from
 * addr to max_addr.  Returns 0 on success, a negative value on failure.
 */
static int __init split_nodes_interleave(struct numa_meminfo *ei,
					 struct numa_meminfo *pi,
					 u64 addr, u64 max_addr, int nr_nodes)
{
	nodemask_t physnode_mask = NODE_MASK_NONE;
	u64 size;
	int big;
	int nid = 0;
	int i, ret;

	if (nr_nodes <= 0)
		return -1;
	if (nr_nodes > MAX_NUMNODES) {
		pr_info("numa=fake=%d too large, reducing to %d\n",
			nr_nodes, MAX_NUMNODES);
		nr_nodes = MAX_NUMNODES;
	}

	/*
	 * Calculate target node size.  x86_32 has no __udivdi3() for the
	 * 64-bit division, so do the division in ulong number of pages and
	 * convert back.
	 */
	size = max_addr - addr - memblock_x86_hole_size(addr, max_addr);
	size = PFN_PHYS((unsigned long)(size >> PAGE_SHIFT) / nr_nodes);

	/*
	 * Calculate the number of big nodes that can be allocated as a result
	 * of consolidating the remainder.
	 */
	big = ((size & ~FAKE_NODE_MIN_HASH_MASK) * nr_nodes) /
		FAKE_NODE_MIN_SIZE;

	size &= FAKE_NODE_MIN_HASH_MASK;
	if (!size) {
		pr_err("Not enough memory for each node.  "
		       "NUMA emulation disabled.\n");
		return -1;
	}
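
	/*
	 * Worked example (hypothetical figures, assuming a 32MB
	 * FAKE_NODE_MIN_SIZE): 1000MB of usable memory split 3 ways gives
	 * size = 333MB, which the mask rounds down to 320MB with a 13MB
	 * remainder per node; big = (13MB * 3) / 32MB = 1, so the first
	 * fake node grows by one 32MB chunk to 352MB and the other two
	 * stay at 320MB.
	 */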

	for (i = 0; i < pi->nr_blks; i++)
		node_set(pi->blk[i].nid, physnode_mask);

	/*
	 * Continue to fill physical nodes with fake nodes until there is no
	 * memory left on any of them.
	 */
	while (nodes_weight(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;
			end = start + size;

			if (nid < big)
				end += FAKE_NODE_MIN_SIZE;

			/*
			 * Continue to add memory to this fake node if its
			 * non-reserved memory is less than the per-node size.
			 */
			while (end - start -
			       memblock_x86_hole_size(start, end) < size) {
				end += FAKE_NODE_MIN_SIZE;
				if (end > limit) {
					end = limit;
					break;
				}
			}

			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end -
			    memblock_x86_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % nr_nodes,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/*
 * Returns the end address of a node so that there is at least `size' amount of
 * non-reserved memory or `max_addr' is reached.
 */
static u64 __init find_end_of_node(u64 start, u64 max_addr, u64 size)
{
	u64 end = start + size;

	while (end - start - memblock_x86_hole_size(start, end) < size) {
		end += FAKE_NODE_MIN_SIZE;
		if (end > max_addr) {
			end = max_addr;
			break;
		}
	}
	return end;
}
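
/*
 * Example (hypothetical layout): with a reserved hole overlapping
 * [start, start + size), the loop above keeps stepping `end' forward in
 * FAKE_NODE_MIN_SIZE increments until the node again holds `size' bytes
 * of non-reserved memory, capping at max_addr.
 */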

/*
 * Sets up fake nodes of `size' interleaved over physical nodes ranging from
 * `addr' to `max_addr'.  Returns 0 on success, a negative value on failure.
 */
static int __init split_nodes_size_interleave(struct numa_meminfo *ei,
					      struct numa_meminfo *pi,
					      u64 addr, u64 max_addr, u64 size)
{
	nodemask_t physnode_mask = NODE_MASK_NONE;
	u64 min_size;
	int nid = 0;
	int i, ret;

	if (!size)
		return -1;
	/*
	 * The limit on emulated nodes is MAX_NUMNODES, so the size per node
	 * is increased accordingly if the requested size is too small.  This
	 * creates a uniform distribution of node sizes across the entire
	 * machine (but not necessarily over physical nodes).
	 */
	min_size = (max_addr - addr - memblock_x86_hole_size(addr, max_addr)) /
		MAX_NUMNODES;
	min_size = max(min_size, FAKE_NODE_MIN_SIZE);
	if ((min_size & FAKE_NODE_MIN_HASH_MASK) < min_size)
		min_size = (min_size + FAKE_NODE_MIN_SIZE) &
			FAKE_NODE_MIN_HASH_MASK;
	if (size < min_size) {
		pr_err("Fake node size %LuMB too small, increasing to %LuMB\n",
		       size >> 20, min_size >> 20);
		size = min_size;
	}
	size &= FAKE_NODE_MIN_HASH_MASK;
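
	/*
	 * Example of the alignment above (assuming FAKE_NODE_MIN_SIZE is
	 * 32MB): a surviving request of 100MB is masked down to 96MB, so
	 * the per-node size is always a FAKE_NODE_MIN_SIZE multiple.
	 */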

	for (i = 0; i < pi->nr_blks; i++)
		node_set(pi->blk[i].nid, physnode_mask);

	/*
	 * Fill physical nodes with fake nodes of size until there is no memory
	 * left on any of them.
	 */
	while (nodes_weight(physnode_mask)) {
		for_each_node_mask(i, physnode_mask) {
			u64 dma32_end = PFN_PHYS(MAX_DMA32_PFN);
			u64 start, limit, end;
			int phys_blk;

			phys_blk = emu_find_memblk_by_nid(i, pi);
			if (phys_blk < 0) {
				node_clear(i, physnode_mask);
				continue;
			}
			start = pi->blk[phys_blk].start;
			limit = pi->blk[phys_blk].end;

			end = find_end_of_node(start, limit, size);
			/*
			 * If there won't be at least FAKE_NODE_MIN_SIZE of
			 * non-reserved memory in ZONE_DMA32 for the next node,
			 * this one must extend to the boundary.
			 */
			if (end < dma32_end && dma32_end - end -
			    memblock_x86_hole_size(end, dma32_end) < FAKE_NODE_MIN_SIZE)
				end = dma32_end;

			/*
			 * If there won't be enough non-reserved memory for the
			 * next node, this one must extend to the end of the
			 * physical node.
			 */
			if (limit - end -
			    memblock_x86_hole_size(end, limit) < size)
				end = limit;

			ret = emu_setup_memblk(ei, pi, nid++ % MAX_NUMNODES,
					       phys_blk,
					       min(end, limit) - start);
			if (ret < 0)
				return ret;
		}
	}
	return 0;
}

/**
 * numa_emulation - Emulate NUMA nodes
 * @numa_meminfo: NUMA configuration to massage
 * @numa_dist_cnt: The size of the physical NUMA distance table
 *
 * Emulate NUMA nodes according to the numa=fake kernel parameter.
 * @numa_meminfo contains the physical memory configuration and is modified
 * to reflect the emulated configuration on success.  @numa_dist_cnt is
 * used to determine the size of the physical distance table.
 *
 * On success, the following modifications are made.
 *
 * - @numa_meminfo is updated to reflect the emulated nodes.
 *
 * - __apicid_to_node[] is updated such that APIC IDs are mapped to the
 *   emulated nodes.
 *
 * - NUMA distance table is rebuilt to represent distances between emulated
 *   nodes.  The distances are determined considering how emulated nodes
 *   are mapped to physical nodes and match the actual distances.
 *
 * - emu_nid_to_phys[] reflects how emulated nodes are mapped to physical
 *   nodes.  This is used by numa_add_cpu() and numa_remove_cpu().
 *
 * If emulation is not enabled or fails, emu_nid_to_phys[] is filled with
 * identity mapping and no other modification is made.
 */
void __init numa_emulation(struct numa_meminfo *numa_meminfo, int numa_dist_cnt)
{
	static struct numa_meminfo ei __initdata;
	static struct numa_meminfo pi __initdata;
	const u64 max_addr = PFN_PHYS(max_pfn);
	u8 *phys_dist = NULL;
	size_t phys_size = numa_dist_cnt * numa_dist_cnt * sizeof(phys_dist[0]);
	int max_emu_nid, dfl_phys_nid;
	int i, j, ret;

	if (!emu_cmdline)
		goto no_emu;

	memset(&ei, 0, sizeof(ei));
	pi = *numa_meminfo;

	for (i = 0; i < MAX_NUMNODES; i++)
		emu_nid_to_phys[i] = NUMA_NO_NODE;

	/*
	 * If the numa=fake command-line contains an 'M' or 'G', it represents
	 * the fixed node size.  Otherwise, if it is just a single number N,
	 * split the system RAM into N fake nodes.
	 */
	if (strchr(emu_cmdline, 'M') || strchr(emu_cmdline, 'G')) {
		u64 size;

		size = memparse(emu_cmdline, &emu_cmdline);
		ret = split_nodes_size_interleave(&ei, &pi, 0, max_addr, size);
	} else {
		unsigned long n;

		n = simple_strtoul(emu_cmdline, NULL, 0);
		ret = split_nodes_interleave(&ei, &pi, 0, max_addr, n);
	}

	if (ret < 0)
		goto no_emu;

	if (numa_cleanup_meminfo(&ei) < 0) {
		pr_warning("NUMA: Warning: constructed meminfo invalid, disabling emulation\n");
		goto no_emu;
	}

	/* copy the physical distance table */
	if (numa_dist_cnt) {
		u64 phys;

		phys = memblock_find_in_range(0, PFN_PHYS(max_pfn_mapped),
					      phys_size, PAGE_SIZE);
		if (phys == MEMBLOCK_ERROR) {
			pr_warning("NUMA: Warning: can't allocate copy of distance table, disabling emulation\n");
			goto no_emu;
		}
		memblock_x86_reserve_range(phys, phys + phys_size, "TMP NUMA DIST");
		phys_dist = __va(phys);

		for (i = 0; i < numa_dist_cnt; i++)
			for (j = 0; j < numa_dist_cnt; j++)
				phys_dist[i * numa_dist_cnt + j] =
					node_distance(i, j);
	}
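
	/*
	 * The copy above flattens the table row-major: the distance from
	 * physical node i to j lives at phys_dist[i * numa_dist_cnt + j],
	 * mirroring how it is read back in the transform loop below.
	 */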

	/*
	 * Determine the max emulated nid and the default phys nid to use
	 * for unmapped nodes.
	 */
	max_emu_nid = 0;
	dfl_phys_nid = NUMA_NO_NODE;
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++) {
		if (emu_nid_to_phys[i] != NUMA_NO_NODE) {
			max_emu_nid = i;
			if (dfl_phys_nid == NUMA_NO_NODE)
				dfl_phys_nid = emu_nid_to_phys[i];
		}
	}
	if (dfl_phys_nid == NUMA_NO_NODE) {
		pr_warning("NUMA: Warning: can't determine default physical node, disabling emulation\n");
		goto no_emu;
	}

	/* commit */
	*numa_meminfo = ei;

	/*
	 * Transform __apicid_to_node table to use emulated nids by
	 * reverse-mapping phys_nid.  The maps should always exist but fall
	 * back to zero just in case.
	 */
	for (i = 0; i < ARRAY_SIZE(__apicid_to_node); i++) {
		if (__apicid_to_node[i] == NUMA_NO_NODE)
			continue;
		for (j = 0; j < ARRAY_SIZE(emu_nid_to_phys); j++)
			if (__apicid_to_node[i] == emu_nid_to_phys[j])
				break;
		__apicid_to_node[i] = j < ARRAY_SIZE(emu_nid_to_phys) ? j : 0;
	}

	/* make sure all emulated nodes are mapped to a physical node */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		if (emu_nid_to_phys[i] == NUMA_NO_NODE)
			emu_nid_to_phys[i] = dfl_phys_nid;

	/* transform distance table */
	numa_reset_distance();
	for (i = 0; i < max_emu_nid + 1; i++) {
		for (j = 0; j < max_emu_nid + 1; j++) {
			int physi = emu_nid_to_phys[i];
			int physj = emu_nid_to_phys[j];
			int dist;

			if (physi >= numa_dist_cnt || physj >= numa_dist_cnt)
				dist = physi == physj ?
					LOCAL_DISTANCE : REMOTE_DISTANCE;
			else
				dist = phys_dist[physi * numa_dist_cnt + physj];

			numa_set_distance(i, j, dist);
		}
	}
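
	/*
	 * Sketch of the transform (hypothetical two-socket layout): two
	 * emulated nodes carved from the same physical node inherit that
	 * node's self distance (typically LOCAL_DISTANCE), while emulated
	 * nodes on different physical nodes inherit the physical pair's
	 * distance from the copied table.
	 */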

	/* free the copied physical distance table */
	if (phys_dist)
		memblock_x86_free_range(__pa(phys_dist), __pa(phys_dist) + phys_size);
	return;

no_emu:
	/* No emulation.  Build identity emu_nid_to_phys[] for numa_add_cpu() */
	for (i = 0; i < ARRAY_SIZE(emu_nid_to_phys); i++)
		emu_nid_to_phys[i] = i;
}

#ifndef CONFIG_DEBUG_PER_CPU_MAPS
void __cpuinit numa_add_cpu(int cpu)
{
	int physnid, nid;

	nid = early_cpu_to_node(cpu);
	BUG_ON(nid == NUMA_NO_NODE || !node_online(nid));

	physnid = emu_nid_to_phys[nid];

	/*
	 * Map the cpu to each emulated node that is allocated on the physical
	 * node of the cpu's apic id.
	 */
	for_each_online_node(nid)
		if (emu_nid_to_phys[nid] == physnid)
			cpumask_set_cpu(cpu, node_to_cpumask_map[nid]);
}
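
/*
 * Example (hypothetical split): if physical node 0 was split into
 * emulated nodes 0-3, a CPU whose APIC id lives on physical node 0 is
 * set in the cpumasks of all four emulated nodes, since the mapping of
 * emulated to physical nodes is many-to-one.
 */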

void __cpuinit numa_remove_cpu(int cpu)
{
	int i;

	for_each_online_node(i)
		cpumask_clear_cpu(cpu, node_to_cpumask_map[i]);
}
#else	/* !CONFIG_DEBUG_PER_CPU_MAPS */
static void __cpuinit numa_set_cpumask(int cpu, bool enable)
{
	int nid, physnid;

	nid = early_cpu_to_node(cpu);
	if (nid == NUMA_NO_NODE) {
		/* early_cpu_to_node() already emits a warning and trace */
		return;
	}

	physnid = emu_nid_to_phys[nid];

	for_each_online_node(nid) {
		if (emu_nid_to_phys[nid] != physnid)
			continue;

		debug_cpumask_set_cpu(cpu, nid, enable);
	}
}

void __cpuinit numa_add_cpu(int cpu)
{
	numa_set_cpumask(cpu, true);
}

void __cpuinit numa_remove_cpu(int cpu)
{
	numa_set_cpumask(cpu, false);
}
#endif	/* !CONFIG_DEBUG_PER_CPU_MAPS */