GitHub Repository: awilliam/linux-vfio
Path: blob/master/arch/x86/platform/uv/tlb_uv.c
1
/*
2
* SGI UltraViolet TLB flush routines.
3
*
4
* (c) 2008-2011 Cliff Wickman <[email protected]>, SGI.
5
*
6
* This code is released under the GNU General Public License version 2 or
7
* later.
8
*/
9
#include <linux/seq_file.h>
10
#include <linux/proc_fs.h>
11
#include <linux/debugfs.h>
12
#include <linux/kernel.h>
13
#include <linux/slab.h>
14
#include <linux/delay.h>
15
16
#include <asm/mmu_context.h>
17
#include <asm/uv/uv.h>
18
#include <asm/uv/uv_mmrs.h>
19
#include <asm/uv/uv_hub.h>
20
#include <asm/uv/uv_bau.h>
21
#include <asm/apic.h>
22
#include <asm/idle.h>
23
#include <asm/tsc.h>
24
#include <asm/irq_vectors.h>
25
#include <asm/timer.h>
26
27
/* timeouts in nanoseconds (indexed by UVH_AGING_PRESCALE_SEL urgency7 30:28) */
28
static int timeout_base_ns[] = {
29
20,
30
160,
31
1280,
32
10240,
33
81920,
34
655360,
35
5242880,
36
167772160
37
};
38
39
static int timeout_us;
40
static int nobau;
41
static int baudisabled;
42
static spinlock_t disable_lock;
43
static cycles_t congested_cycles;
44
45
/* tunables: */
46
static int max_concurr = MAX_BAU_CONCURRENT;
47
static int max_concurr_const = MAX_BAU_CONCURRENT;
48
static int plugged_delay = PLUGGED_DELAY;
49
static int plugsb4reset = PLUGSB4RESET;
50
static int timeoutsb4reset = TIMEOUTSB4RESET;
51
static int ipi_reset_limit = IPI_RESET_LIMIT;
52
static int complete_threshold = COMPLETE_THRESHOLD;
53
static int congested_respns_us = CONGESTED_RESPONSE_US;
54
static int congested_reps = CONGESTED_REPS;
55
static int congested_period = CONGESTED_PERIOD;
56
57
static struct tunables tunables[] = {
58
{&max_concurr, MAX_BAU_CONCURRENT}, /* must be [0] */
59
{&plugged_delay, PLUGGED_DELAY},
60
{&plugsb4reset, PLUGSB4RESET},
61
{&timeoutsb4reset, TIMEOUTSB4RESET},
62
{&ipi_reset_limit, IPI_RESET_LIMIT},
63
{&complete_threshold, COMPLETE_THRESHOLD},
64
{&congested_respns_us, CONGESTED_RESPONSE_US},
65
{&congested_reps, CONGESTED_REPS},
66
{&congested_period, CONGESTED_PERIOD}
67
};
68
69
static struct dentry *tunables_dir;
70
static struct dentry *tunables_file;
71
72
/* these correspond to the statistics printed by ptc_seq_show() */
73
static char *stat_description[] = {
74
"sent: number of shootdown messages sent",
75
"stime: time spent sending messages",
76
"numuvhubs: number of hubs targeted with shootdown",
77
"numuvhubs16: number times 16 or more hubs targeted",
78
"numuvhubs8: number times 8 or more hubs targeted",
79
"numuvhubs4: number times 4 or more hubs targeted",
80
"numuvhubs2: number times 2 or more hubs targeted",
81
"numuvhubs1: number times 1 hub targeted",
82
"numcpus: number of cpus targeted with shootdown",
83
"dto: number of destination timeouts",
84
"retries: destination timeout retries sent",
85
"rok: : destination timeouts successfully retried",
86
"resetp: ipi-style resource resets for plugs",
87
"resett: ipi-style resource resets for timeouts",
88
"giveup: fall-backs to ipi-style shootdowns",
89
"sto: number of source timeouts",
90
"bz: number of stay-busy's",
91
"throt: number times spun in throttle",
92
"swack: image of UVH_LB_BAU_INTD_SOFTWARE_ACKNOWLEDGE",
93
"recv: shootdown messages received",
94
"rtime: time spent processing messages",
95
"all: shootdown all-tlb messages",
96
"one: shootdown one-tlb messages",
97
"mult: interrupts that found multiple messages",
98
"none: interrupts that found no messages",
99
"retry: number of retry messages processed",
100
"canc: number messages canceled by retries",
101
"nocan: number retries that found nothing to cancel",
102
"reset: number of ipi-style reset requests processed",
103
"rcan: number messages canceled by reset requests",
104
"disable: number times use of the BAU was disabled",
105
"enable: number times use of the BAU was re-enabled"
106
};
107
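/*
 * The "nobau" kernel command line parameter disables use of the BAU;
 * uv_flush_tlb_others() then simply hands the cpumask back to the caller.
 */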
108
static int __init
109
setup_nobau(char *arg)
110
{
111
nobau = 1;
112
return 0;
113
}
114
early_param("nobau", setup_nobau);
115
116
/* base pnode in this partition */
117
static int uv_base_pnode __read_mostly;
118
/* position of pnode (which is nasid>>1): */
119
static int uv_nshift __read_mostly;
120
static unsigned long uv_mmask __read_mostly;
121
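/* per-cpu statistics, control state, and scratch flush masks */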
122
static DEFINE_PER_CPU(struct ptc_stats, ptcstats);
123
static DEFINE_PER_CPU(struct bau_control, bau_control);
124
static DEFINE_PER_CPU(cpumask_var_t, uv_flush_tlb_mask);
125
126
/*
127
* Determine the first node on a uvhub. 'Nodes' are used for kernel
128
* memory allocation.
129
*/
130
static int __init uvhub_to_first_node(int uvhub)
131
{
132
int node, b;
133
134
for_each_online_node(node) {
135
b = uv_node_to_blade_id(node);
136
if (uvhub == b)
137
return node;
138
}
139
return -1;
140
}
141
142
/*
143
* Determine the apicid of the first cpu on a uvhub.
144
*/
145
static int __init uvhub_to_first_apicid(int uvhub)
146
{
147
int cpu;
148
149
for_each_present_cpu(cpu)
150
if (uvhub == uv_cpu_to_blade_id(cpu))
151
return per_cpu(x86_cpu_to_apicid, cpu);
152
return -1;
153
}
154
155
/*
156
* Free a software acknowledge hardware resource by clearing its Pending
157
* bit. This will return a reply to the sender.
158
* If the message has timed out, a reply has already been sent by the
159
* hardware but the resource has not been released. In that case our
160
* clear of the Timeout bit (as well) will free the resource. No reply will
161
* be sent (the hardware will only do one reply per message).
162
*/
163
static void reply_to_message(struct msg_desc *mdp, struct bau_control *bcp)
164
{
165
unsigned long dw;
166
struct bau_pq_entry *msg;
167
168
msg = mdp->msg;
169
if (!msg->canceled) {
170
dw = (msg->swack_vec << UV_SW_ACK_NPENDING) | msg->swack_vec;
171
write_mmr_sw_ack(dw);
172
}
173
msg->replied_to = 1;
174
msg->swack_vec = 0;
175
}
176
177
/*
178
* Process the receipt of a RETRY message
179
*/
180
static void bau_process_retry_msg(struct msg_desc *mdp,
181
struct bau_control *bcp)
182
{
183
int i;
184
int cancel_count = 0;
185
unsigned long msg_res;
186
unsigned long mmr = 0;
187
struct bau_pq_entry *msg = mdp->msg;
188
struct bau_pq_entry *msg2;
189
struct ptc_stats *stat = bcp->statp;
190
191
stat->d_retries++;
192
/*
193
* cancel any message from msg+1 to the retry itself
194
*/
195
for (msg2 = msg+1, i = 0; i < DEST_Q_SIZE; msg2++, i++) {
196
if (msg2 > mdp->queue_last)
197
msg2 = mdp->queue_first;
198
if (msg2 == msg)
199
break;
200
201
/* same conditions for cancellation as do_reset */
202
if ((msg2->replied_to == 0) && (msg2->canceled == 0) &&
203
(msg2->swack_vec) && ((msg2->swack_vec &
204
msg->swack_vec) == 0) &&
205
(msg2->sending_cpu == msg->sending_cpu) &&
206
(msg2->msg_type != MSG_NOOP)) {
207
mmr = read_mmr_sw_ack();
208
msg_res = msg2->swack_vec;
209
/*
210
* This is a message retry; clear the resources held
211
* by the previous message only if they timed out.
212
* If it has not timed out we have an unexpected
213
* situation to report.
214
*/
215
if (mmr & (msg_res << UV_SW_ACK_NPENDING)) {
216
unsigned long mr;
217
/*
218
* is the resource timed out?
219
* make everyone ignore the cancelled message.
220
*/
221
msg2->canceled = 1;
222
stat->d_canceled++;
223
cancel_count++;
224
mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
225
write_mmr_sw_ack(mr);
226
}
227
}
228
}
229
if (!cancel_count)
230
stat->d_nocanceled++;
231
}
232
233
/*
234
* Do all the things a cpu should do for a TLB shootdown message.
235
* Other cpu's may come here at the same time for this message.
236
*/
237
static void bau_process_message(struct msg_desc *mdp,
238
struct bau_control *bcp)
239
{
240
short socket_ack_count = 0;
241
short *sp;
242
struct atomic_short *asp;
243
struct ptc_stats *stat = bcp->statp;
244
struct bau_pq_entry *msg = mdp->msg;
245
struct bau_control *smaster = bcp->socket_master;
246
247
/*
248
* This must be a normal message, or retry of a normal message
249
*/
250
if (msg->address == TLB_FLUSH_ALL) {
251
local_flush_tlb();
252
stat->d_alltlb++;
253
} else {
254
__flush_tlb_one(msg->address);
255
stat->d_onetlb++;
256
}
257
stat->d_requestee++;
258
259
/*
260
* One cpu on each uvhub has the additional job on a RETRY
261
* of releasing the resource held by the message that is
262
* being retried. That message is identified by sending
263
* cpu number.
264
*/
265
if (msg->msg_type == MSG_RETRY && bcp == bcp->uvhub_master)
266
bau_process_retry_msg(mdp, bcp);
267
268
/*
269
* This is a swack message, so we have to reply to it.
270
* Count each responding cpu on the socket. This avoids
271
* pinging the count's cache line back and forth between
272
* the sockets.
273
*/
274
sp = &smaster->socket_acknowledge_count[mdp->msg_slot];
275
asp = (struct atomic_short *)sp;
276
socket_ack_count = atom_asr(1, asp);
277
if (socket_ack_count == bcp->cpus_in_socket) {
278
int msg_ack_count;
279
/*
280
* Both sockets dump their completed count total into
281
* the message's count.
282
*/
283
smaster->socket_acknowledge_count[mdp->msg_slot] = 0;
284
asp = (struct atomic_short *)&msg->acknowledge_count;
285
msg_ack_count = atom_asr(socket_ack_count, asp);
286
287
if (msg_ack_count == bcp->cpus_in_uvhub) {
288
/*
289
* All cpus in uvhub saw it; reply
290
*/
291
reply_to_message(mdp, bcp);
292
}
293
}
294
295
return;
296
}
297
298
/*
299
* Determine the first cpu on a uvhub.
300
*/
301
static int uvhub_to_first_cpu(int uvhub)
302
{
303
int cpu;
304
for_each_present_cpu(cpu)
305
if (uvhub == uv_cpu_to_blade_id(cpu))
306
return cpu;
307
return -1;
308
}
309
310
/*
311
* Last resort when we get a large number of destination timeouts is
312
* to clear resources held by a given cpu.
313
* Do this with IPI so that all messages in the BAU message queue
314
* can be identified by their nonzero swack_vec field.
315
*
316
* This is entered for a single cpu on the uvhub.
317
* The sender wants this uvhub to free a specific message's
318
* swack resources.
319
*/
320
static void do_reset(void *ptr)
321
{
322
int i;
323
struct bau_control *bcp = &per_cpu(bau_control, smp_processor_id());
324
struct reset_args *rap = (struct reset_args *)ptr;
325
struct bau_pq_entry *msg;
326
struct ptc_stats *stat = bcp->statp;
327
328
stat->d_resets++;
329
/*
330
* We're looking for the given sender, and
331
* will free its swack resource.
332
* If all cpu's finally responded after the timeout, its
333
* message 'replied_to' was set.
334
*/
335
for (msg = bcp->queue_first, i = 0; i < DEST_Q_SIZE; msg++, i++) {
336
unsigned long msg_res;
337
/* do_reset: same conditions for cancellation as
338
bau_process_retry_msg() */
339
if ((msg->replied_to == 0) &&
340
(msg->canceled == 0) &&
341
(msg->sending_cpu == rap->sender) &&
342
(msg->swack_vec) &&
343
(msg->msg_type != MSG_NOOP)) {
344
unsigned long mmr;
345
unsigned long mr;
346
/*
347
* make everyone else ignore this message
348
*/
349
msg->canceled = 1;
350
/*
351
* only reset the resource if it is still pending
352
*/
353
mmr = read_mmr_sw_ack();
354
msg_res = msg->swack_vec;
355
mr = (msg_res << UV_SW_ACK_NPENDING) | msg_res;
356
if (mmr & msg_res) {
357
stat->d_rcanceled++;
358
write_mmr_sw_ack(mr);
359
}
360
}
361
}
362
return;
363
}
364
365
/*
366
* Use IPI to get all target uvhubs to release resources held by
367
* a given sending cpu number.
368
*/
369
static void reset_with_ipi(struct bau_targ_hubmask *distribution, int sender)
370
{
371
int uvhub;
372
int maskbits;
373
cpumask_t mask;
374
struct reset_args reset_args;
375
376
reset_args.sender = sender;
377
cpus_clear(mask);
378
/* find a single cpu for each uvhub in this distribution mask */
379
maskbits = sizeof(struct bau_targ_hubmask) * BITSPERBYTE;
380
for (uvhub = 0; uvhub < maskbits; uvhub++) {
381
int cpu;
382
if (!bau_uvhub_isset(uvhub, distribution))
383
continue;
384
/* find a cpu for this uvhub */
385
cpu = uvhub_to_first_cpu(uvhub);
386
cpu_set(cpu, mask);
387
}
388
389
/* IPI all cpus; preemption is already disabled */
390
smp_call_function_many(&mask, do_reset, (void *)&reset_args, 1);
391
return;
392
}
393
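/* convert tsc cycles to microseconds using this cpu's cyc2ns factor */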
394
static inline unsigned long cycles_2_us(unsigned long long cyc)
395
{
396
unsigned long long ns;
397
unsigned long us;
398
int cpu = smp_processor_id();
399
400
ns = (cyc * per_cpu(cyc2ns, cpu)) >> CYC2NS_SCALE_FACTOR;
401
us = ns / 1000;
402
return us;
403
}
404
405
/*
406
* wait for all cpus on this hub to finish their sends and go quiet
407
* leaves uvhub_quiesce set so that no new broadcasts are started by
408
* bau_flush_send_and_wait()
409
*/
410
static inline void quiesce_local_uvhub(struct bau_control *hmaster)
411
{
412
atom_asr(1, (struct atomic_short *)&hmaster->uvhub_quiesce);
413
}
414
415
/*
416
* mark this quiet-requestor as done
417
*/
418
static inline void end_uvhub_quiesce(struct bau_control *hmaster)
419
{
420
atom_asr(-1, (struct atomic_short *)&hmaster->uvhub_quiesce);
421
}
422
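/* read one 2-bit descriptor status field from a UV1 activation status MMR */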
423
static unsigned long uv1_read_status(unsigned long mmr_offset, int right_shift)
424
{
425
unsigned long descriptor_status;
426
427
descriptor_status = uv_read_local_mmr(mmr_offset);
428
descriptor_status >>= right_shift;
429
descriptor_status &= UV_ACT_STATUS_MASK;
430
return descriptor_status;
431
}
432
433
/*
434
* Wait for completion of a broadcast software ack message
435
* return COMPLETE, RETRY(PLUGGED or TIMEOUT) or GIVEUP
436
*/
437
static int uv1_wait_completion(struct bau_desc *bau_desc,
438
unsigned long mmr_offset, int right_shift,
439
struct bau_control *bcp, long try)
440
{
441
unsigned long descriptor_status;
442
cycles_t ttm;
443
struct ptc_stats *stat = bcp->statp;
444
445
descriptor_status = uv1_read_status(mmr_offset, right_shift);
446
/* spin on the status MMR, waiting for it to go idle */
447
while ((descriptor_status != DS_IDLE)) {
448
/*
449
* Our software ack messages may be blocked because
450
* there are no swack resources available. As long
451
* as none of them has timed out hardware will NACK
452
* our message and its state will stay IDLE.
453
*/
454
if (descriptor_status == DS_SOURCE_TIMEOUT) {
455
stat->s_stimeout++;
456
return FLUSH_GIVEUP;
457
} else if (descriptor_status == DS_DESTINATION_TIMEOUT) {
458
stat->s_dtimeout++;
459
ttm = get_cycles();
460
461
/*
462
* Our retries may be blocked by all destination
463
* swack resources being consumed, and a timeout
464
* pending. In that case hardware returns the
465
* ERROR that looks like a destination timeout.
466
*/
467
if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
468
bcp->conseccompletes = 0;
469
return FLUSH_RETRY_PLUGGED;
470
}
471
472
bcp->conseccompletes = 0;
473
return FLUSH_RETRY_TIMEOUT;
474
} else {
475
/*
476
* descriptor_status is still BUSY
477
*/
478
cpu_relax();
479
}
480
descriptor_status = uv1_read_status(mmr_offset, right_shift);
481
}
482
bcp->conseccompletes++;
483
return FLUSH_COMPLETE;
484
}
485
486
/*
487
* UV2 has an extra bit of status in the ACTIVATION_STATUS_2 register.
488
*/
489
static unsigned long uv2_read_status(unsigned long offset, int rshft, int cpu)
490
{
491
unsigned long descriptor_status;
492
unsigned long descriptor_status2;
493
494
descriptor_status = ((read_lmmr(offset) >> rshft) & UV_ACT_STATUS_MASK);
495
descriptor_status2 = (read_mmr_uv2_status() >> cpu) & 0x1UL;
496
descriptor_status = (descriptor_status << 1) | descriptor_status2;
497
return descriptor_status;
498
}
499
500
static int uv2_wait_completion(struct bau_desc *bau_desc,
501
unsigned long mmr_offset, int right_shift,
502
struct bau_control *bcp, long try)
503
{
504
unsigned long descriptor_stat;
505
cycles_t ttm;
506
int cpu = bcp->uvhub_cpu;
507
struct ptc_stats *stat = bcp->statp;
508
509
descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
510
511
/* spin on the status MMR, waiting for it to go idle */
512
while (descriptor_stat != UV2H_DESC_IDLE) {
513
/*
514
* Our software ack messages may be blocked because
515
* there are no swack resources available. As long
516
* as none of them has timed out hardware will NACK
517
* our message and its state will stay IDLE.
518
*/
519
if ((descriptor_stat == UV2H_DESC_SOURCE_TIMEOUT) ||
520
(descriptor_stat == UV2H_DESC_DEST_STRONG_NACK) ||
521
(descriptor_stat == UV2H_DESC_DEST_PUT_ERR)) {
522
stat->s_stimeout++;
523
return FLUSH_GIVEUP;
524
} else if (descriptor_stat == UV2H_DESC_DEST_TIMEOUT) {
525
stat->s_dtimeout++;
526
ttm = get_cycles();
527
/*
528
* Our retries may be blocked by all destination
529
* swack resources being consumed, and a timeout
530
* pending. In that case hardware returns the
531
* ERROR that looks like a destination timeout.
532
*/
533
if (cycles_2_us(ttm - bcp->send_message) < timeout_us) {
534
bcp->conseccompletes = 0;
535
return FLUSH_RETRY_PLUGGED;
536
}
537
bcp->conseccompletes = 0;
538
return FLUSH_RETRY_TIMEOUT;
539
} else {
540
/*
541
* descriptor_stat is still BUSY
542
*/
543
cpu_relax();
544
}
545
descriptor_stat = uv2_read_status(mmr_offset, right_shift, cpu);
546
}
547
bcp->conseccompletes++;
548
return FLUSH_COMPLETE;
549
}
550
551
/*
552
* There are 2 status registers; each and array[32] of 2 bits. Set up for
553
* which register to read and position in that register based on cpu in
554
* current hub.
555
*/
556
static int wait_completion(struct bau_desc *bau_desc,
557
struct bau_control *bcp, long try)
558
{
559
int right_shift;
560
unsigned long mmr_offset;
561
int cpu = bcp->uvhub_cpu;
562
563
if (cpu < UV_CPUS_PER_AS) {
564
mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_0;
565
right_shift = cpu * UV_ACT_STATUS_SIZE;
566
} else {
567
mmr_offset = UVH_LB_BAU_SB_ACTIVATION_STATUS_1;
568
right_shift = ((cpu - UV_CPUS_PER_AS) * UV_ACT_STATUS_SIZE);
569
}
570
571
if (is_uv1_hub())
572
return uv1_wait_completion(bau_desc, mmr_offset, right_shift,
573
bcp, try);
574
else
575
return uv2_wait_completion(bau_desc, mmr_offset, right_shift,
576
bcp, try);
577
}
578
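/* convert seconds to tsc cycles using this cpu's cyc2ns factor */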
579
static inline cycles_t sec_2_cycles(unsigned long sec)
580
{
581
unsigned long ns;
582
cycles_t cyc;
583
584
ns = sec * 1000000000;
585
cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
586
return cyc;
587
}
588
589
/*
590
* Our retries are blocked by all destination sw ack resources being
591
* in use, and a timeout is pending. In that case hardware immediately
592
* returns the ERROR that looks like a destination timeout.
593
*/
594
static void destination_plugged(struct bau_desc *bau_desc,
595
struct bau_control *bcp,
596
struct bau_control *hmaster, struct ptc_stats *stat)
597
{
598
udelay(bcp->plugged_delay);
599
bcp->plugged_tries++;
600
601
if (bcp->plugged_tries >= bcp->plugsb4reset) {
602
bcp->plugged_tries = 0;
603
604
quiesce_local_uvhub(hmaster);
605
606
spin_lock(&hmaster->queue_lock);
607
reset_with_ipi(&bau_desc->distribution, bcp->cpu);
608
spin_unlock(&hmaster->queue_lock);
609
610
end_uvhub_quiesce(hmaster);
611
612
bcp->ipi_attempts++;
613
stat->s_resets_plug++;
614
}
615
}
616
617
static void destination_timeout(struct bau_desc *bau_desc,
618
struct bau_control *bcp, struct bau_control *hmaster,
619
struct ptc_stats *stat)
620
{
621
hmaster->max_concurr = 1;
622
bcp->timeout_tries++;
623
if (bcp->timeout_tries >= bcp->timeoutsb4reset) {
624
bcp->timeout_tries = 0;
625
626
quiesce_local_uvhub(hmaster);
627
628
spin_lock(&hmaster->queue_lock);
629
reset_with_ipi(&bau_desc->distribution, bcp->cpu);
630
spin_unlock(&hmaster->queue_lock);
631
632
end_uvhub_quiesce(hmaster);
633
634
bcp->ipi_attempts++;
635
stat->s_resets_timeout++;
636
}
637
}
638
639
/*
640
* Completions are taking a very long time due to a congested numalink
641
* network.
642
*/
643
static void disable_for_congestion(struct bau_control *bcp,
644
struct ptc_stats *stat)
645
{
646
/* let only one cpu do this disabling */
647
spin_lock(&disable_lock);
648
649
if (!baudisabled && bcp->period_requests &&
650
((bcp->period_time / bcp->period_requests) > congested_cycles)) {
651
int tcpu;
652
struct bau_control *tbcp;
653
/* it becomes this cpu's job to turn on the use of the
654
BAU again */
655
baudisabled = 1;
656
bcp->set_bau_off = 1;
657
bcp->set_bau_on_time = get_cycles();
658
bcp->set_bau_on_time += sec_2_cycles(bcp->cong_period);
659
stat->s_bau_disabled++;
660
for_each_present_cpu(tcpu) {
661
tbcp = &per_cpu(bau_control, tcpu);
662
tbcp->baudisabled = 1;
663
}
664
}
665
666
spin_unlock(&disable_lock);
667
}
668
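/*
 * Reset the retry counters after each send; after enough consecutive
 * successful completions, let the hub's concurrency limit climb back
 * toward its configured maximum.
 */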
669
static void count_max_concurr(int stat, struct bau_control *bcp,
670
struct bau_control *hmaster)
671
{
672
bcp->plugged_tries = 0;
673
bcp->timeout_tries = 0;
674
if (stat != FLUSH_COMPLETE)
675
return;
676
if (bcp->conseccompletes <= bcp->complete_threshold)
677
return;
678
if (hmaster->max_concurr >= hmaster->max_concurr_const)
679
return;
680
hmaster->max_concurr++;
681
}
682
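/*
 * Record the elapsed time of one send.  Consistently slow completions
 * can lead to disabling the BAU via disable_for_congestion().
 */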
683
static void record_send_stats(cycles_t time1, cycles_t time2,
684
struct bau_control *bcp, struct ptc_stats *stat,
685
int completion_status, int try)
686
{
687
cycles_t elapsed;
688
689
if (time2 > time1) {
690
elapsed = time2 - time1;
691
stat->s_time += elapsed;
692
693
if ((completion_status == FLUSH_COMPLETE) && (try == 1)) {
694
bcp->period_requests++;
695
bcp->period_time += elapsed;
696
if ((elapsed > congested_cycles) &&
697
(bcp->period_requests > bcp->cong_reps))
698
disable_for_congestion(bcp, stat);
699
}
700
} else
701
stat->s_requestor--;
702
703
if (completion_status == FLUSH_COMPLETE && try > 1)
704
stat->s_retriesok++;
705
else if (completion_status == FLUSH_GIVEUP)
706
stat->s_giveup++;
707
}
708
709
/*
710
* Because of a uv1 hardware bug only a limited number of concurrent
711
* requests can be made.
712
*/
713
static void uv1_throttle(struct bau_control *hmaster, struct ptc_stats *stat)
714
{
715
spinlock_t *lock = &hmaster->uvhub_lock;
716
atomic_t *v;
717
718
v = &hmaster->active_descriptor_count;
719
if (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr)) {
720
stat->s_throttles++;
721
do {
722
cpu_relax();
723
} while (!atomic_inc_unless_ge(lock, v, hmaster->max_concurr));
724
}
725
}
726
727
/*
728
* Handle the completion status of a message send.
729
*/
730
static void handle_cmplt(int completion_status, struct bau_desc *bau_desc,
731
struct bau_control *bcp, struct bau_control *hmaster,
732
struct ptc_stats *stat)
733
{
734
if (completion_status == FLUSH_RETRY_PLUGGED)
735
destination_plugged(bau_desc, bcp, hmaster, stat);
736
else if (completion_status == FLUSH_RETRY_TIMEOUT)
737
destination_timeout(bau_desc, bcp, hmaster, stat);
738
}
739
740
/*
741
* Send a broadcast and wait for it to complete.
742
*
743
* The flush_mask contains the cpus the broadcast is to be sent to, including
744
* cpus that are on the local uvhub.
745
*
746
* Returns 0 if all flushing represented in the mask was done.
747
* Returns 1 if it gives up entirely and the original cpu mask is to be
748
* returned to the kernel.
749
*/
750
int uv_flush_send_and_wait(struct bau_desc *bau_desc,
751
struct cpumask *flush_mask, struct bau_control *bcp)
752
{
753
int seq_number = 0;
754
int completion_stat = 0;
755
long try = 0;
756
unsigned long index;
757
cycles_t time1;
758
cycles_t time2;
759
struct ptc_stats *stat = bcp->statp;
760
struct bau_control *hmaster = bcp->uvhub_master;
761
762
if (is_uv1_hub())
763
uv1_throttle(hmaster, stat);
764
765
while (hmaster->uvhub_quiesce)
766
cpu_relax();
767
768
time1 = get_cycles();
769
do {
770
if (try == 0) {
771
bau_desc->header.msg_type = MSG_REGULAR;
772
seq_number = bcp->message_number++;
773
} else {
774
bau_desc->header.msg_type = MSG_RETRY;
775
stat->s_retry_messages++;
776
}
777
778
bau_desc->header.sequence = seq_number;
779
index = (1UL << AS_PUSH_SHIFT) | bcp->uvhub_cpu;
780
bcp->send_message = get_cycles();
781
782
write_mmr_activation(index);
783
784
try++;
785
completion_stat = wait_completion(bau_desc, bcp, try);
786
787
handle_cmplt(completion_stat, bau_desc, bcp, hmaster, stat);
788
789
if (bcp->ipi_attempts >= bcp->ipi_reset_limit) {
790
bcp->ipi_attempts = 0;
791
completion_stat = FLUSH_GIVEUP;
792
break;
793
}
794
cpu_relax();
795
} while ((completion_stat == FLUSH_RETRY_PLUGGED) ||
796
(completion_stat == FLUSH_RETRY_TIMEOUT));
797
798
time2 = get_cycles();
799
800
count_max_concurr(completion_stat, bcp, hmaster);
801
802
while (hmaster->uvhub_quiesce)
803
cpu_relax();
804
805
atomic_dec(&hmaster->active_descriptor_count);
806
807
record_send_stats(time1, time2, bcp, stat, completion_stat, try);
808
809
if (completion_stat == FLUSH_GIVEUP)
810
return 1;
811
return 0;
812
}
813
814
/*
815
* The BAU is disabled. When the disabled time period has expired, the cpu
816
* that disabled it must re-enable it.
817
* Return 0 if it is re-enabled for all cpus.
818
*/
819
static int check_enable(struct bau_control *bcp, struct ptc_stats *stat)
820
{
821
int tcpu;
822
struct bau_control *tbcp;
823
824
if (bcp->set_bau_off) {
825
if (get_cycles() >= bcp->set_bau_on_time) {
826
stat->s_bau_reenabled++;
827
baudisabled = 0;
828
for_each_present_cpu(tcpu) {
829
tbcp = &per_cpu(bau_control, tcpu);
830
tbcp->baudisabled = 0;
831
tbcp->period_requests = 0;
832
tbcp->period_time = 0;
833
}
834
return 0;
835
}
836
}
837
return -1;
838
}
839
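/*
 * Count the cpus and uvhubs targeted by this shootdown request in the
 * sender's statistics.
 */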
840
static void record_send_statistics(struct ptc_stats *stat, int locals, int hubs,
841
int remotes, struct bau_desc *bau_desc)
842
{
843
stat->s_requestor++;
844
stat->s_ntargcpu += remotes + locals;
845
stat->s_ntargremotes += remotes;
846
stat->s_ntarglocals += locals;
847
848
/* uvhub statistics */
849
hubs = bau_uvhub_weight(&bau_desc->distribution);
850
if (locals) {
851
stat->s_ntarglocaluvhub++;
852
stat->s_ntargremoteuvhub += (hubs - 1);
853
} else
854
stat->s_ntargremoteuvhub += hubs;
855
856
stat->s_ntarguvhub += hubs;
857
858
if (hubs >= 16)
859
stat->s_ntarguvhub16++;
860
else if (hubs >= 8)
861
stat->s_ntarguvhub8++;
862
else if (hubs >= 4)
863
stat->s_ntarguvhub4++;
864
else if (hubs >= 2)
865
stat->s_ntarguvhub2++;
866
else
867
stat->s_ntarguvhub1++;
868
}
869
870
/*
871
* Translate a cpu mask to the uvhub distribution mask in the BAU
872
* activation descriptor.
873
*/
874
static int set_distrib_bits(struct cpumask *flush_mask, struct bau_control *bcp,
875
struct bau_desc *bau_desc, int *localsp, int *remotesp)
876
{
877
int cpu;
878
int pnode;
879
int cnt = 0;
880
struct hub_and_pnode *hpp;
881
882
for_each_cpu(cpu, flush_mask) {
883
/*
884
* The distribution vector is a bit map of pnodes, relative
885
* to the partition base pnode (and the partition base nasid
886
* in the header).
887
* Translate cpu to pnode and hub using a local memory array.
888
*/
889
hpp = &bcp->socket_master->thp[cpu];
890
pnode = hpp->pnode - bcp->partition_base_pnode;
891
bau_uvhub_set(pnode, &bau_desc->distribution);
892
cnt++;
893
if (hpp->uvhub == bcp->uvhub)
894
(*localsp)++;
895
else
896
(*remotesp)++;
897
}
898
if (!cnt)
899
return 1;
900
return 0;
901
}
902
903
/*
904
* globally purge translation cache of a virtual address or all TLB's
905
* @cpumask: mask of all cpu's in which the address is to be removed
906
* @mm: mm_struct containing virtual address range
907
* @va: virtual address to be removed (or TLB_FLUSH_ALL for all TLB's on cpu)
908
* @cpu: the current cpu
909
*
910
* This is the entry point for initiating any UV global TLB shootdown.
911
*
912
* Purges the translation caches of all specified processors of the given
913
* virtual address, or purges all TLB's on specified processors.
914
*
915
* The caller has derived the cpumask from the mm_struct. This function
916
* is called only if there are bits set in the mask. (e.g. flush_tlb_page())
917
*
918
* The cpumask is converted into a uvhubmask of the uvhubs containing
919
* those cpus.
920
*
921
* Note that this function should be called with preemption disabled.
922
*
923
* Returns NULL if all remote flushing was done.
924
* Returns pointer to cpumask if some remote flushing remains to be
925
* done. The returned pointer is valid till preemption is re-enabled.
926
*/
927
const struct cpumask *uv_flush_tlb_others(const struct cpumask *cpumask,
928
struct mm_struct *mm, unsigned long va,
929
unsigned int cpu)
930
{
931
int locals = 0;
932
int remotes = 0;
933
int hubs = 0;
934
struct bau_desc *bau_desc;
935
struct cpumask *flush_mask;
936
struct ptc_stats *stat;
937
struct bau_control *bcp;
938
939
/* kernel was booted 'nobau' */
940
if (nobau)
941
return cpumask;
942
943
bcp = &per_cpu(bau_control, cpu);
944
stat = bcp->statp;
945
946
/* bau was disabled due to slow response */
947
if (bcp->baudisabled) {
948
if (check_enable(bcp, stat))
949
return cpumask;
950
}
951
952
/*
953
* Each sending cpu has a per-cpu mask which it fills from the caller's
954
* cpu mask. All cpus are converted to uvhubs and copied to the
955
* activation descriptor.
956
*/
957
flush_mask = (struct cpumask *)per_cpu(uv_flush_tlb_mask, cpu);
958
/* don't actually do a shootdown of the local cpu */
959
cpumask_andnot(flush_mask, cpumask, cpumask_of(cpu));
960
961
if (cpu_isset(cpu, *cpumask))
962
stat->s_ntargself++;
963
964
bau_desc = bcp->descriptor_base;
965
bau_desc += ITEMS_PER_DESC * bcp->uvhub_cpu;
966
bau_uvhubs_clear(&bau_desc->distribution, UV_DISTRIBUTION_SIZE);
967
if (set_distrib_bits(flush_mask, bcp, bau_desc, &locals, &remotes))
968
return NULL;
969
970
record_send_statistics(stat, locals, hubs, remotes, bau_desc);
971
972
bau_desc->payload.address = va;
973
bau_desc->payload.sending_cpu = cpu;
974
/*
975
* uv_flush_send_and_wait returns 0 if all cpu's were messaged,
976
* or 1 if it gave up and the original cpumask should be returned.
977
*/
978
if (!uv_flush_send_and_wait(bau_desc, flush_mask, bcp))
979
return NULL;
980
else
981
return cpumask;
982
}
983
984
/*
985
* The BAU message interrupt comes here. (registered by set_intr_gate)
986
* See entry_64.S
987
*
988
* We received a broadcast assist message.
989
*
990
* Interrupts are disabled; this interrupt could represent
991
* the receipt of several messages.
992
*
993
* All cores/threads on this hub get this interrupt.
994
* The last one to see it does the software ack.
995
* (the resource will not be freed until noninterruptible cpus see this
996
* interrupt; hardware may timeout the s/w ack and reply ERROR)
997
*/
998
void uv_bau_message_interrupt(struct pt_regs *regs)
999
{
1000
int count = 0;
1001
cycles_t time_start;
1002
struct bau_pq_entry *msg;
1003
struct bau_control *bcp;
1004
struct ptc_stats *stat;
1005
struct msg_desc msgdesc;
1006
1007
time_start = get_cycles();
1008
1009
bcp = &per_cpu(bau_control, smp_processor_id());
1010
stat = bcp->statp;
1011
1012
msgdesc.queue_first = bcp->queue_first;
1013
msgdesc.queue_last = bcp->queue_last;
1014
1015
msg = bcp->bau_msg_head;
1016
while (msg->swack_vec) {
1017
count++;
1018
1019
msgdesc.msg_slot = msg - msgdesc.queue_first;
1020
msgdesc.swack_slot = ffs(msg->swack_vec) - 1;
1021
msgdesc.msg = msg;
1022
bau_process_message(&msgdesc, bcp);
1023
1024
msg++;
1025
if (msg > msgdesc.queue_last)
1026
msg = msgdesc.queue_first;
1027
bcp->bau_msg_head = msg;
1028
}
1029
stat->d_time += (get_cycles() - time_start);
1030
if (!count)
1031
stat->d_nomsg++;
1032
else if (count > 1)
1033
stat->d_multmsg++;
1034
1035
ack_APIC_irq();
1036
}
1037
1038
/*
1039
* Each target uvhub (i.e. a uvhub that has cpu's) needs to have
1040
* shootdown message timeouts enabled. The timeout does not cause
1041
* an interrupt, but causes an error message to be returned to
1042
* the sender.
1043
*/
1044
static void __init enable_timeouts(void)
1045
{
1046
int uvhub;
1047
int nuvhubs;
1048
int pnode;
1049
unsigned long mmr_image;
1050
1051
nuvhubs = uv_num_possible_blades();
1052
1053
for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1054
if (!uv_blade_nr_possible_cpus(uvhub))
1055
continue;
1056
1057
pnode = uv_blade_to_pnode(uvhub);
1058
mmr_image = read_mmr_misc_control(pnode);
1059
/*
1060
* Set the timeout period and then lock it in, in three
1061
* steps; captures and locks in the period.
1062
*
1063
* To program the period, the SOFT_ACK_MODE must be off.
1064
*/
1065
mmr_image &= ~(1L << SOFTACK_MSHIFT);
1066
write_mmr_misc_control(pnode, mmr_image);
1067
/*
1068
* Set the 4-bit period.
1069
*/
1070
mmr_image &= ~((unsigned long)0xf << SOFTACK_PSHIFT);
1071
mmr_image |= (SOFTACK_TIMEOUT_PERIOD << SOFTACK_PSHIFT);
1072
write_mmr_misc_control(pnode, mmr_image);
1073
/*
1074
* UV1:
1075
* Subsequent reversals of the timebase bit (3) cause an
1076
* immediate timeout of one or all INTD resources as
1077
* indicated in bits 2:0 (7 causes all of them to timeout).
1078
*/
1079
mmr_image |= (1L << SOFTACK_MSHIFT);
1080
if (is_uv2_hub()) {
1081
mmr_image |= (1L << UV2_LEG_SHFT);
1082
mmr_image |= (1L << UV2_EXT_SHFT);
1083
}
1084
write_mmr_misc_control(pnode, mmr_image);
1085
}
1086
}
1087
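/*
 * seq_file iterators for /proc/sgi_uv/ptc_statistics; the iterator
 * position is a cpu number (see ptc_seq_show())
 */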
1088
static void *ptc_seq_start(struct seq_file *file, loff_t *offset)
1089
{
1090
if (*offset < num_possible_cpus())
1091
return offset;
1092
return NULL;
1093
}
1094
1095
static void *ptc_seq_next(struct seq_file *file, void *data, loff_t *offset)
1096
{
1097
(*offset)++;
1098
if (*offset < num_possible_cpus())
1099
return offset;
1100
return NULL;
1101
}
1102
1103
static void ptc_seq_stop(struct seq_file *file, void *data)
1104
{
1105
}
1106
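/* convert microseconds to tsc cycles, as sec_2_cycles() does for seconds */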
1107
static inline unsigned long long usec_2_cycles(unsigned long microsec)
1108
{
1109
unsigned long ns;
1110
unsigned long long cyc;
1111
1112
ns = microsec * 1000;
1113
cyc = (ns << CYC2NS_SCALE_FACTOR)/(per_cpu(cyc2ns, smp_processor_id()));
1114
return cyc;
1115
}
1116
1117
/*
1118
* Display the statistics thru /proc/sgi_uv/ptc_statistics
1119
* 'data' points to the cpu number
1120
* Note: see the descriptions in stat_description[].
1121
*/
1122
static int ptc_seq_show(struct seq_file *file, void *data)
1123
{
1124
struct ptc_stats *stat;
1125
int cpu;
1126
1127
cpu = *(loff_t *)data;
1128
if (!cpu) {
1129
seq_printf(file,
1130
"# cpu sent stime self locals remotes ncpus localhub ");
1131
seq_printf(file,
1132
"remotehub numuvhubs numuvhubs16 numuvhubs8 ");
1133
seq_printf(file,
1134
"numuvhubs4 numuvhubs2 numuvhubs1 dto retries rok ");
1135
seq_printf(file,
1136
"resetp resett giveup sto bz throt swack recv rtime ");
1137
seq_printf(file,
1138
"all one mult none retry canc nocan reset rcan ");
1139
seq_printf(file,
1140
"disable enable\n");
1141
}
1142
if (cpu < num_possible_cpus() && cpu_online(cpu)) {
1143
stat = &per_cpu(ptcstats, cpu);
1144
/* source side statistics */
1145
seq_printf(file,
1146
"cpu %d %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
1147
cpu, stat->s_requestor, cycles_2_us(stat->s_time),
1148
stat->s_ntargself, stat->s_ntarglocals,
1149
stat->s_ntargremotes, stat->s_ntargcpu,
1150
stat->s_ntarglocaluvhub, stat->s_ntargremoteuvhub,
1151
stat->s_ntarguvhub, stat->s_ntarguvhub16);
1152
seq_printf(file, "%ld %ld %ld %ld %ld ",
1153
stat->s_ntarguvhub8, stat->s_ntarguvhub4,
1154
stat->s_ntarguvhub2, stat->s_ntarguvhub1,
1155
stat->s_dtimeout);
1156
seq_printf(file, "%ld %ld %ld %ld %ld %ld %ld %ld ",
1157
stat->s_retry_messages, stat->s_retriesok,
1158
stat->s_resets_plug, stat->s_resets_timeout,
1159
stat->s_giveup, stat->s_stimeout,
1160
stat->s_busy, stat->s_throttles);
1161
1162
/* destination side statistics */
1163
seq_printf(file,
1164
"%lx %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld %ld ",
1165
read_gmmr_sw_ack(uv_cpu_to_pnode(cpu)),
1166
stat->d_requestee, cycles_2_us(stat->d_time),
1167
stat->d_alltlb, stat->d_onetlb, stat->d_multmsg,
1168
stat->d_nomsg, stat->d_retries, stat->d_canceled,
1169
stat->d_nocanceled, stat->d_resets,
1170
stat->d_rcanceled);
1171
seq_printf(file, "%ld %ld\n",
1172
stat->s_bau_disabled, stat->s_bau_reenabled);
1173
}
1174
return 0;
1175
}
1176
1177
/*
1178
* Display the tunables thru debugfs
1179
*/
1180
static ssize_t tunables_read(struct file *file, char __user *userbuf,
1181
size_t count, loff_t *ppos)
1182
{
1183
char *buf;
1184
int ret;
1185
1186
buf = kasprintf(GFP_KERNEL, "%s %s %s\n%d %d %d %d %d %d %d %d %d\n",
1187
"max_concur plugged_delay plugsb4reset",
1188
"timeoutsb4reset ipi_reset_limit complete_threshold",
1189
"congested_response_us congested_reps congested_period",
1190
max_concurr, plugged_delay, plugsb4reset,
1191
timeoutsb4reset, ipi_reset_limit, complete_threshold,
1192
congested_respns_us, congested_reps, congested_period);
1193
1194
if (!buf)
1195
return -ENOMEM;
1196
1197
ret = simple_read_from_buffer(userbuf, count, ppos, buf, strlen(buf));
1198
kfree(buf);
1199
return ret;
1200
}
1201
1202
/*
1203
* handle a write to /proc/sgi_uv/ptc_statistics
1204
* -1: reset the statistics
1205
* 0: display meaning of the statistics
1206
*/
1207
static ssize_t ptc_proc_write(struct file *file, const char __user *user,
1208
size_t count, loff_t *data)
1209
{
1210
int cpu;
1211
int i;
1212
int elements;
1213
long input_arg;
1214
char optstr[64];
1215
struct ptc_stats *stat;
1216
1217
if (count == 0 || count > sizeof(optstr))
1218
return -EINVAL;
1219
if (copy_from_user(optstr, user, count))
1220
return -EFAULT;
1221
optstr[count - 1] = '\0';
1222
1223
if (strict_strtol(optstr, 10, &input_arg) < 0) {
1224
printk(KERN_DEBUG "%s is invalid\n", optstr);
1225
return -EINVAL;
1226
}
1227
1228
if (input_arg == 0) {
1229
elements = sizeof(stat_description)/sizeof(*stat_description);
1230
printk(KERN_DEBUG "# cpu: cpu number\n");
1231
printk(KERN_DEBUG "Sender statistics:\n");
1232
for (i = 0; i < elements; i++)
1233
printk(KERN_DEBUG "%s\n", stat_description[i]);
1234
} else if (input_arg == -1) {
1235
for_each_present_cpu(cpu) {
1236
stat = &per_cpu(ptcstats, cpu);
1237
memset(stat, 0, sizeof(struct ptc_stats));
1238
}
1239
}
1240
1241
return count;
1242
}
1243
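/* minimal decimal string-to-integer conversion; stops at the first non-digit */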
1244
static int local_atoi(const char *name)
1245
{
1246
int val = 0;
1247
1248
for (;; name++) {
1249
switch (*name) {
1250
case '0' ... '9':
1251
val = 10*val+(*name-'0');
1252
break;
1253
default:
1254
return val;
1255
}
1256
}
1257
}
1258
1259
/*
1260
* Parse the values written to /sys/kernel/debug/sgi_uv/bau_tunables.
1261
* Zero values reset them to defaults.
1262
*/
1263
static int parse_tunables_write(struct bau_control *bcp, char *instr,
1264
int count)
1265
{
1266
char *p;
1267
char *q;
1268
int cnt = 0;
1269
int val;
1270
int e = sizeof(tunables) / sizeof(*tunables);
1271
1272
p = instr + strspn(instr, WHITESPACE);
1273
q = p;
1274
for (; *p; p = q + strspn(q, WHITESPACE)) {
1275
q = p + strcspn(p, WHITESPACE);
1276
cnt++;
1277
if (q == p)
1278
break;
1279
}
1280
if (cnt != e) {
1281
printk(KERN_INFO "bau tunable error: should be %d values\n", e);
1282
return -EINVAL;
1283
}
1284
1285
p = instr + strspn(instr, WHITESPACE);
1286
q = p;
1287
for (cnt = 0; *p; p = q + strspn(q, WHITESPACE), cnt++) {
1288
q = p + strcspn(p, WHITESPACE);
1289
val = local_atoi(p);
1290
switch (cnt) {
1291
case 0:
1292
if (val == 0) {
1293
max_concurr = MAX_BAU_CONCURRENT;
1294
max_concurr_const = MAX_BAU_CONCURRENT;
1295
continue;
1296
}
1297
if (val < 1 || val > bcp->cpus_in_uvhub) {
1298
printk(KERN_DEBUG
1299
"Error: BAU max concurrent %d is invalid\n",
1300
val);
1301
return -EINVAL;
1302
}
1303
max_concurr = val;
1304
max_concurr_const = val;
1305
continue;
1306
default:
1307
if (val == 0)
1308
*tunables[cnt].tunp = tunables[cnt].deflt;
1309
else
1310
*tunables[cnt].tunp = val;
1311
continue;
1312
}
1313
if (q == p)
1314
break;
1315
}
1316
return 0;
1317
}
1318
1319
/*
1320
* Handle a write to debugfs. (/sys/kernel/debug/sgi_uv/bau_tunables)
1321
*/
1322
static ssize_t tunables_write(struct file *file, const char __user *user,
1323
size_t count, loff_t *data)
1324
{
1325
int cpu;
1326
int ret;
1327
char instr[100];
1328
struct bau_control *bcp;
1329
1330
if (count == 0 || count > sizeof(instr)-1)
1331
return -EINVAL;
1332
if (copy_from_user(instr, user, count))
1333
return -EFAULT;
1334
1335
instr[count] = '\0';
1336
1337
bcp = &per_cpu(bau_control, smp_processor_id());
1338
1339
ret = parse_tunables_write(bcp, instr, count);
1340
if (ret)
1341
return ret;
1342
1343
for_each_present_cpu(cpu) {
1344
bcp = &per_cpu(bau_control, cpu);
1345
bcp->max_concurr = max_concurr;
1346
bcp->max_concurr_const = max_concurr;
1347
bcp->plugged_delay = plugged_delay;
1348
bcp->plugsb4reset = plugsb4reset;
1349
bcp->timeoutsb4reset = timeoutsb4reset;
1350
bcp->ipi_reset_limit = ipi_reset_limit;
1351
bcp->complete_threshold = complete_threshold;
1352
bcp->cong_response_us = congested_respns_us;
1353
bcp->cong_reps = congested_reps;
1354
bcp->cong_period = congested_period;
1355
}
1356
return count;
1357
}
1358
1359
static const struct seq_operations uv_ptc_seq_ops = {
1360
.start = ptc_seq_start,
1361
.next = ptc_seq_next,
1362
.stop = ptc_seq_stop,
1363
.show = ptc_seq_show
1364
};
1365
1366
static int ptc_proc_open(struct inode *inode, struct file *file)
1367
{
1368
return seq_open(file, &uv_ptc_seq_ops);
1369
}
1370
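/* no per-open state is needed for the tunables file */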
1371
static int tunables_open(struct inode *inode, struct file *file)
1372
{
1373
return 0;
1374
}
1375
1376
static const struct file_operations proc_uv_ptc_operations = {
1377
.open = ptc_proc_open,
1378
.read = seq_read,
1379
.write = ptc_proc_write,
1380
.llseek = seq_lseek,
1381
.release = seq_release,
1382
};
1383
1384
static const struct file_operations tunables_fops = {
1385
.open = tunables_open,
1386
.read = tunables_read,
1387
.write = tunables_write,
1388
.llseek = default_llseek,
1389
};
1390
1391
static int __init uv_ptc_init(void)
1392
{
1393
struct proc_dir_entry *proc_uv_ptc;
1394
1395
if (!is_uv_system())
1396
return 0;
1397
1398
proc_uv_ptc = proc_create(UV_PTC_BASENAME, 0444, NULL,
1399
&proc_uv_ptc_operations);
1400
if (!proc_uv_ptc) {
1401
printk(KERN_ERR "unable to create %s proc entry\n",
1402
UV_PTC_BASENAME);
1403
return -EINVAL;
1404
}
1405
1406
tunables_dir = debugfs_create_dir(UV_BAU_TUNABLES_DIR, NULL);
1407
if (!tunables_dir) {
1408
printk(KERN_ERR "unable to create debugfs directory %s\n",
1409
UV_BAU_TUNABLES_DIR);
1410
return -EINVAL;
1411
}
1412
tunables_file = debugfs_create_file(UV_BAU_TUNABLES_FILE, 0600,
1413
tunables_dir, NULL, &tunables_fops);
1414
if (!tunables_file) {
1415
printk(KERN_ERR "unable to create debugfs file %s\n",
1416
UV_BAU_TUNABLES_FILE);
1417
return -EINVAL;
1418
}
1419
return 0;
1420
}
1421
1422
/*
1423
* Initialize the sending side's sending buffers.
1424
*/
1425
static void activation_descriptor_init(int node, int pnode, int base_pnode)
1426
{
1427
int i;
1428
int cpu;
1429
unsigned long pa;
1430
unsigned long m;
1431
unsigned long n;
1432
size_t dsize;
1433
struct bau_desc *bau_desc;
1434
struct bau_desc *bd2;
1435
struct bau_control *bcp;
1436
1437
/*
1438
* each bau_desc is 64 bytes; there are 8 (ITEMS_PER_DESC)
1439
* per cpu; and one per cpu on the uvhub (ADP_SZ)
1440
*/
1441
dsize = sizeof(struct bau_desc) * ADP_SZ * ITEMS_PER_DESC;
1442
bau_desc = kmalloc_node(dsize, GFP_KERNEL, node);
1443
BUG_ON(!bau_desc);
1444
1445
pa = uv_gpa(bau_desc); /* need the real nasid*/
1446
n = pa >> uv_nshift;
1447
m = pa & uv_mmask;
1448
1449
/* the 14-bit pnode */
1450
write_mmr_descriptor_base(pnode, (n << UV_DESC_PSHIFT | m));
1451
/*
1452
* Initializing all 8 (ITEMS_PER_DESC) descriptors for each
1453
* cpu even though we only use the first one; one descriptor can
1454
* describe a broadcast to 256 uv hubs.
1455
*/
1456
for (i = 0, bd2 = bau_desc; i < (ADP_SZ * ITEMS_PER_DESC); i++, bd2++) {
1457
memset(bd2, 0, sizeof(struct bau_desc));
1458
bd2->header.swack_flag = 1;
1459
/*
1460
* The base_dest_nasid set in the message header is the nasid
1461
* of the first uvhub in the partition. The bit map will
1462
* indicate destination pnode numbers relative to that base.
1463
* They may not be consecutive if nasid striding is being used.
1464
*/
1465
bd2->header.base_dest_nasid = UV_PNODE_TO_NASID(base_pnode);
1466
bd2->header.dest_subnodeid = UV_LB_SUBNODEID;
1467
bd2->header.command = UV_NET_ENDPOINT_INTD;
1468
bd2->header.int_both = 1;
1469
/*
1470
* all others need to be set to zero:
1471
* fairness chaining multilevel count replied_to
1472
*/
1473
}
1474
for_each_present_cpu(cpu) {
1475
if (pnode != uv_blade_to_pnode(uv_cpu_to_blade_id(cpu)))
1476
continue;
1477
bcp = &per_cpu(bau_control, cpu);
1478
bcp->descriptor_base = bau_desc;
1479
}
1480
}
1481
1482
/*
1483
* initialize the destination side's receiving buffers
1484
* entered for each uvhub in the partition
1485
* - node is first node (kernel memory notion) on the uvhub
1486
* - pnode is the uvhub's physical identifier
1487
*/
1488
static void pq_init(int node, int pnode)
1489
{
1490
int cpu;
1491
size_t plsize;
1492
char *cp;
1493
void *vp;
1494
unsigned long pn;
1495
unsigned long first;
1496
unsigned long pn_first;
1497
unsigned long last;
1498
struct bau_pq_entry *pqp;
1499
struct bau_control *bcp;
1500
1501
plsize = (DEST_Q_SIZE + 1) * sizeof(struct bau_pq_entry);
1502
vp = kmalloc_node(plsize, GFP_KERNEL, node);
1503
pqp = (struct bau_pq_entry *)vp;
1504
BUG_ON(!pqp);
1505
1506
cp = (char *)pqp + 31;
1507
pqp = (struct bau_pq_entry *)(((unsigned long)cp >> 5) << 5);
1508
1509
for_each_present_cpu(cpu) {
1510
if (pnode != uv_cpu_to_pnode(cpu))
1511
continue;
1512
/* for every cpu on this pnode: */
1513
bcp = &per_cpu(bau_control, cpu);
1514
bcp->queue_first = pqp;
1515
bcp->bau_msg_head = pqp;
1516
bcp->queue_last = pqp + (DEST_Q_SIZE - 1);
1517
}
1518
/*
1519
* need the pnode of where the memory was really allocated
1520
*/
1521
pn = uv_gpa(pqp) >> uv_nshift;
1522
first = uv_physnodeaddr(pqp);
1523
pn_first = ((unsigned long)pn << UV_PAYLOADQ_PNODE_SHIFT) | first;
1524
last = uv_physnodeaddr(pqp + (DEST_Q_SIZE - 1));
1525
write_mmr_payload_first(pnode, pn_first);
1526
write_mmr_payload_tail(pnode, first);
1527
write_mmr_payload_last(pnode, last);
1528
1529
/* in effect, all msg_type's are set to MSG_NOOP */
1530
memset(pqp, 0, sizeof(struct bau_pq_entry) * DEST_Q_SIZE);
1531
}
1532
1533
/*
1534
* Initialization of each UV hub's structures
1535
*/
1536
static void __init init_uvhub(int uvhub, int vector, int base_pnode)
1537
{
1538
int node;
1539
int pnode;
1540
unsigned long apicid;
1541
1542
node = uvhub_to_first_node(uvhub);
1543
pnode = uv_blade_to_pnode(uvhub);
1544
1545
activation_descriptor_init(node, pnode, base_pnode);
1546
1547
pq_init(node, pnode);
1548
/*
1549
* The below initialization can't be in firmware because the
1550
* messaging IRQ will be determined by the OS.
1551
*/
1552
apicid = uvhub_to_first_apicid(uvhub) | uv_apicid_hibits;
1553
write_mmr_data_config(pnode, ((apicid << 32) | vector));
1554
}
1555
1556
/*
1557
* We will set BAU_MISC_CONTROL with a timeout period.
1558
* But the BIOS has set UVH_AGING_PRESCALE_SEL and UVH_TRANSACTION_TIMEOUT.
1559
* So the destination timeout period has to be calculated from them.
1560
*/
1561
static int calculate_destination_timeout(void)
1562
{
1563
unsigned long mmr_image;
1564
int mult1;
1565
int mult2;
1566
int index;
1567
int base;
1568
int ret;
1569
unsigned long ts_ns;
1570
1571
if (is_uv1_hub()) {
1572
mult1 = SOFTACK_TIMEOUT_PERIOD & BAU_MISC_CONTROL_MULT_MASK;
1573
mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1574
index = (mmr_image >> BAU_URGENCY_7_SHIFT) & BAU_URGENCY_7_MASK;
1575
mmr_image = uv_read_local_mmr(UVH_TRANSACTION_TIMEOUT);
1576
mult2 = (mmr_image >> BAU_TRANS_SHIFT) & BAU_TRANS_MASK;
1577
base = timeout_base_ns[index];
1578
ts_ns = base * mult1 * mult2;
1579
ret = ts_ns / 1000;
1580
} else {
1581
/* 4 bits 0/1 for 10/80us, 3 bits of multiplier */
1582
mmr_image = uv_read_local_mmr(UVH_AGING_PRESCALE_SEL);
1583
mmr_image = (mmr_image & UV_SA_MASK) >> UV_SA_SHFT;
1584
if (mmr_image & (1L << UV2_ACK_UNITS_SHFT))
1585
mult1 = 80;
1586
else
1587
mult1 = 10;
1588
base = mmr_image & UV2_ACK_MASK;
1589
ret = mult1 * base;
1590
}
1591
return ret;
1592
}
1593
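/* copy the current global tunables into every cpu's bau_control */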
1594
static void __init init_per_cpu_tunables(void)
1595
{
1596
int cpu;
1597
struct bau_control *bcp;
1598
1599
for_each_present_cpu(cpu) {
1600
bcp = &per_cpu(bau_control, cpu);
1601
bcp->baudisabled = 0;
1602
bcp->statp = &per_cpu(ptcstats, cpu);
1603
/* time interval to catch a hardware stay-busy bug */
1604
bcp->timeout_interval = usec_2_cycles(2*timeout_us);
1605
bcp->max_concurr = max_concurr;
1606
bcp->max_concurr_const = max_concurr;
1607
bcp->plugged_delay = plugged_delay;
1608
bcp->plugsb4reset = plugsb4reset;
1609
bcp->timeoutsb4reset = timeoutsb4reset;
1610
bcp->ipi_reset_limit = ipi_reset_limit;
1611
bcp->complete_threshold = complete_threshold;
1612
bcp->cong_response_us = congested_respns_us;
1613
bcp->cong_reps = congested_reps;
1614
bcp->cong_period = congested_period;
1615
}
1616
}
1617
1618
/*
1619
* Scan all cpus to collect blade and socket summaries.
1620
*/
1621
static int __init get_cpu_topology(int base_pnode,
1622
struct uvhub_desc *uvhub_descs,
1623
unsigned char *uvhub_mask)
1624
{
1625
int cpu;
1626
int pnode;
1627
int uvhub;
1628
int socket;
1629
struct bau_control *bcp;
1630
struct uvhub_desc *bdp;
1631
struct socket_desc *sdp;
1632
1633
for_each_present_cpu(cpu) {
1634
bcp = &per_cpu(bau_control, cpu);
1635
1636
memset(bcp, 0, sizeof(struct bau_control));
1637
1638
pnode = uv_cpu_hub_info(cpu)->pnode;
1639
if ((pnode - base_pnode) >= UV_DISTRIBUTION_SIZE) {
1640
printk(KERN_EMERG
1641
"cpu %d pnode %d-%d beyond %d; BAU disabled\n",
1642
cpu, pnode, base_pnode, UV_DISTRIBUTION_SIZE);
1643
return 1;
1644
}
1645
1646
bcp->osnode = cpu_to_node(cpu);
1647
bcp->partition_base_pnode = base_pnode;
1648
1649
uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1650
*(uvhub_mask + (uvhub/8)) |= (1 << (uvhub%8));
1651
bdp = &uvhub_descs[uvhub];
1652
1653
bdp->num_cpus++;
1654
bdp->uvhub = uvhub;
1655
bdp->pnode = pnode;
1656
1657
/* kludge: 'assuming' one node per socket, and assuming that
1658
disabling a socket just leaves a gap in node numbers */
1659
socket = bcp->osnode & 1;
1660
bdp->socket_mask |= (1 << socket);
1661
sdp = &bdp->socket[socket];
1662
sdp->cpu_number[sdp->num_cpus] = cpu;
1663
sdp->num_cpus++;
1664
if (sdp->num_cpus > MAX_CPUS_PER_SOCKET) {
1665
printk(KERN_EMERG "%d cpus per socket invalid\n",
1666
sdp->num_cpus);
1667
return 1;
1668
}
1669
}
1670
return 0;
1671
}
1672
1673
/*
1674
* Each socket is to get a local array of pnodes/hubs.
1675
*/
1676
static void make_per_cpu_thp(struct bau_control *smaster)
1677
{
1678
int cpu;
1679
size_t hpsz = sizeof(struct hub_and_pnode) * num_possible_cpus();
1680
1681
smaster->thp = kmalloc_node(hpsz, GFP_KERNEL, smaster->osnode);
1682
memset(smaster->thp, 0, hpsz);
1683
for_each_present_cpu(cpu) {
1684
smaster->thp[cpu].pnode = uv_cpu_hub_info(cpu)->pnode;
1685
smaster->thp[cpu].uvhub = uv_cpu_hub_info(cpu)->numa_blade_id;
1686
}
1687
}
1688
1689
/*
1690
* Initialize all the per_cpu information for the cpu's on a given socket,
1691
* given what has been gathered into the socket_desc struct.
1692
* It reports the chosen hub and socket masters back to the caller.
1693
*/
1694
static int scan_sock(struct socket_desc *sdp, struct uvhub_desc *bdp,
1695
struct bau_control **smasterp,
1696
struct bau_control **hmasterp)
1697
{
1698
int i;
1699
int cpu;
1700
struct bau_control *bcp;
1701
1702
for (i = 0; i < sdp->num_cpus; i++) {
1703
cpu = sdp->cpu_number[i];
1704
bcp = &per_cpu(bau_control, cpu);
1705
bcp->cpu = cpu;
1706
if (i == 0) {
1707
*smasterp = bcp;
1708
if (!(*hmasterp))
1709
*hmasterp = bcp;
1710
}
1711
bcp->cpus_in_uvhub = bdp->num_cpus;
1712
bcp->cpus_in_socket = sdp->num_cpus;
1713
bcp->socket_master = *smasterp;
1714
bcp->uvhub = bdp->uvhub;
1715
bcp->uvhub_master = *hmasterp;
1716
bcp->uvhub_cpu = uv_cpu_hub_info(cpu)->blade_processor_id;
1717
if (bcp->uvhub_cpu >= MAX_CPUS_PER_UVHUB) {
1718
printk(KERN_EMERG "%d cpus per uvhub invalid\n",
1719
bcp->uvhub_cpu);
1720
return 1;
1721
}
1722
}
1723
return 0;
1724
}
1725
1726
/*
1727
* Summarize the blade and socket topology into the per_cpu structures.
1728
*/
1729
static int __init summarize_uvhub_sockets(int nuvhubs,
1730
struct uvhub_desc *uvhub_descs,
1731
unsigned char *uvhub_mask)
1732
{
1733
int socket;
1734
int uvhub;
1735
unsigned short socket_mask;
1736
1737
for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1738
struct uvhub_desc *bdp;
1739
struct bau_control *smaster = NULL;
1740
struct bau_control *hmaster = NULL;
1741
1742
if (!(*(uvhub_mask + (uvhub/8)) & (1 << (uvhub%8))))
1743
continue;
1744
1745
bdp = &uvhub_descs[uvhub];
1746
socket_mask = bdp->socket_mask;
1747
socket = 0;
1748
while (socket_mask) {
1749
struct socket_desc *sdp;
1750
if ((socket_mask & 1)) {
1751
sdp = &bdp->socket[socket];
1752
if (scan_sock(sdp, bdp, &smaster, &hmaster))
1753
return 1;
1754
}
1755
socket++;
1756
socket_mask = (socket_mask >> 1);
1757
make_per_cpu_thp(smaster);
1758
}
1759
}
1760
return 0;
1761
}
1762
1763
/*
1764
* initialize the bau_control structure for each cpu
1765
*/
1766
static int __init init_per_cpu(int nuvhubs, int base_part_pnode)
1767
{
1768
unsigned char *uvhub_mask;
1769
void *vp;
1770
struct uvhub_desc *uvhub_descs;
1771
1772
timeout_us = calculate_destination_timeout();
1773
1774
vp = kmalloc(nuvhubs * sizeof(struct uvhub_desc), GFP_KERNEL);
1775
uvhub_descs = (struct uvhub_desc *)vp;
1776
memset(uvhub_descs, 0, nuvhubs * sizeof(struct uvhub_desc));
1777
uvhub_mask = kzalloc((nuvhubs+7)/8, GFP_KERNEL);
1778
1779
if (get_cpu_topology(base_part_pnode, uvhub_descs, uvhub_mask))
1780
return 1;
1781
1782
if (summarize_uvhub_sockets(nuvhubs, uvhub_descs, uvhub_mask))
1783
return 1;
1784
1785
kfree(uvhub_descs);
1786
kfree(uvhub_mask);
1787
init_per_cpu_tunables();
1788
return 0;
1789
}
1790
1791
/*
1792
* Initialization of BAU-related structures
1793
*/
1794
static int __init uv_bau_init(void)
1795
{
1796
int uvhub;
1797
int pnode;
1798
int nuvhubs;
1799
int cur_cpu;
1800
int cpus;
1801
int vector;
1802
cpumask_var_t *mask;
1803
1804
if (!is_uv_system())
1805
return 0;
1806
1807
if (nobau)
1808
return 0;
1809
1810
for_each_possible_cpu(cur_cpu) {
1811
mask = &per_cpu(uv_flush_tlb_mask, cur_cpu);
1812
zalloc_cpumask_var_node(mask, GFP_KERNEL, cpu_to_node(cur_cpu));
1813
}
1814
1815
uv_nshift = uv_hub_info->m_val;
1816
uv_mmask = (1UL << uv_hub_info->m_val) - 1;
1817
nuvhubs = uv_num_possible_blades();
1818
spin_lock_init(&disable_lock);
1819
congested_cycles = usec_2_cycles(congested_respns_us);
1820
1821
uv_base_pnode = 0x7fffffff;
1822
for (uvhub = 0; uvhub < nuvhubs; uvhub++) {
1823
cpus = uv_blade_nr_possible_cpus(uvhub);
1824
if (cpus && (uv_blade_to_pnode(uvhub) < uv_base_pnode))
1825
uv_base_pnode = uv_blade_to_pnode(uvhub);
1826
}
1827
1828
if (init_per_cpu(nuvhubs, uv_base_pnode)) {
1829
nobau = 1;
1830
return 0;
1831
}
1832
1833
vector = UV_BAU_MESSAGE;
1834
for_each_possible_blade(uvhub)
1835
if (uv_blade_nr_possible_cpus(uvhub))
1836
init_uvhub(uvhub, vector, uv_base_pnode);
1837
1838
enable_timeouts();
1839
alloc_intr_gate(vector, uv_bau_message_intr1);
1840
1841
for_each_possible_blade(uvhub) {
1842
if (uv_blade_nr_possible_cpus(uvhub)) {
1843
unsigned long val;
1844
unsigned long mmr;
1845
pnode = uv_blade_to_pnode(uvhub);
1846
/* INIT the bau */
1847
val = 1L << 63;
1848
write_gmmr_activation(pnode, val);
1849
mmr = 1; /* should be 1 to broadcast to both sockets */
1850
write_mmr_data_broadcast(pnode, mmr);
1851
}
1852
}
1853
1854
return 0;
1855
}
1856
core_initcall(uv_bau_init);
1857
fs_initcall(uv_ptc_init);
1858
1859