Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
ElmerCSC
GitHub Repository: ElmerCSC/elmerfem
Path: blob/devel/mathlibs/src/parpack/psnaitr.f
5198 views
1
c-----------------------------------------------------------------------
2
c\BeginDoc
3
c
4
c\Name: psnaitr
5
c
6
c Message Passing Layer: MPI
7
c
8
c\Description:
9
c Reverse communication interface for applying NP additional steps to
10
c a K step nonsymmetric Arnoldi factorization.
11
c
12
c Input: OP*V_{k} - V_{k}*H = r_{k}*e_{k}^T
13
c
14
c with (V_{k}^T)*B*V_{k} = I, (V_{k}^T)*B*r_{k} = 0.
15
c
16
c Output: OP*V_{k+p} - V_{k+p}*H = r_{k+p}*e_{k+p}^T
17
c
18
c with (V_{k+p}^T)*B*V_{k+p} = I, (V_{k+p}^T)*B*r_{k+p} = 0.
19
c
20
c where OP and B are as in psnaupd. The B-norm of r_{k+p} is also
21
c computed and returned.
22
c
23
c\Usage:
24
c call psnaitr
25
c ( COMM, IDO, BMAT, N, K, NP, NB, RESID, RNORM, V, LDV, H, LDH,
26
c IPNTR, WORKD, WORKL, INFO )
27
c
28
c\Arguments
29
c COMM MPI Communicator for the processor grid. (INPUT)
30
c
31
c IDO Integer. (INPUT/OUTPUT)
32
c Reverse communication flag.
33
c -------------------------------------------------------------
34
c IDO = 0: first call to the reverse communication interface
35
c IDO = -1: compute Y = OP * X where
36
c IPNTR(1) is the pointer into WORK for X,
37
c IPNTR(2) is the pointer into WORK for Y.
38
c This is for the restart phase to force the new
39
c starting vector into the range of OP.
40
c IDO = 1: compute Y = OP * X where
41
c IPNTR(1) is the pointer into WORK for X,
42
c IPNTR(2) is the pointer into WORK for Y,
43
c IPNTR(3) is the pointer into WORK for B * X.
44
c IDO = 2: compute Y = B * X where
45
c IPNTR(1) is the pointer into WORK for X,
46
c IPNTR(2) is the pointer into WORK for Y.
47
c IDO = 99: done
48
c -------------------------------------------------------------
49
c When the routine is used in the "shift-and-invert" mode, the
50
c vector B * Q is already available and do not need to be
51
c recompute in forming OP * Q.
52
c
53
c BMAT Character*1. (INPUT)
54
c BMAT specifies the type of the matrix B that defines the
55
c semi-inner product for the operator OP. See psnaupd.
56
c B = 'I' -> standard eigenvalue problem A*x = lambda*x
57
c B = 'G' -> generalized eigenvalue problem A*x = lambda*M**x
58
c
59
c N Integer. (INPUT)
60
c Dimension of the eigenproblem.
61
c
62
c K Integer. (INPUT)
63
c Current size of V and H.
64
c
65
c NP Integer. (INPUT)
66
c Number of additional Arnoldi steps to take.
67
c
68
c NB Integer. (INPUT)
69
c Blocksize to be used in the recurrence.
70
c Only work for NB = 1 right now. The goal is to have a
71
c program that implement both the block and non-block method.
72
c
73
c RESID Real array of length N. (INPUT/OUTPUT)
74
c On INPUT: RESID contains the residual vector r_{k}.
75
c On OUTPUT: RESID contains the residual vector r_{k+p}.
76
c
77
c RNORM Real scalar. (INPUT/OUTPUT)
78
c B-norm of the starting residual on input.
79
c B-norm of the updated residual r_{k+p} on output.
80
c
81
c V Real N by K+NP array. (INPUT/OUTPUT)
82
c On INPUT: V contains the Arnoldi vectors in the first K
83
c columns.
84
c On OUTPUT: V contains the new NP Arnoldi vectors in the next
85
c NP columns. The first K columns are unchanged.
86
c
87
c LDV Integer. (INPUT)
88
c Leading dimension of V exactly as declared in the calling
89
c program.
90
c
91
c H Real (K+NP) by (K+NP) array. (INPUT/OUTPUT)
92
c H is used to store the generated upper Hessenberg matrix.
93
c
94
c LDH Integer. (INPUT)
95
c Leading dimension of H exactly as declared in the calling
96
c program.
97
c
98
c IPNTR Integer array of length 3. (OUTPUT)
99
c Pointer to mark the starting locations in the WORK for
100
c vectors used by the Arnoldi iteration.
101
c -------------------------------------------------------------
102
c IPNTR(1): pointer to the current operand vector X.
103
c IPNTR(2): pointer to the current result vector Y.
104
c IPNTR(3): pointer to the vector B * X when used in the
105
c shift-and-invert mode. X is the current operand.
106
c -------------------------------------------------------------
107
c
108
c WORKD Real work array of length 3*N. (REVERSE COMMUNICATION)
109
c Distributed array to be used in the basic Arnoldi iteration
110
c for reverse communication. The calling program should not
111
c use WORKD as temporary workspace during the iteration !!!!!!
112
c On input, WORKD(1:N) = B*RESID and is used to save some
113
c computation at the first step.
114
c
115
c WORKL Real work space used for Gram Schmidt orthogonalization
116
c
117
c INFO Integer. (OUTPUT)
118
c = 0: Normal exit.
119
c > 0: Size of the spanning invariant subspace of OP found.
120
c
121
c\EndDoc
122
c
123
c-----------------------------------------------------------------------
124
c
125
c\BeginLib
126
c
127
c\Local variables:
128
c xxxxxx real
129
c
130
c\References:
131
c 1. D.C. Sorensen, "Implicit Application of Polynomial Filters in
132
c a k-Step Arnoldi Method", SIAM J. Matr. Anal. Apps., 13 (1992),
133
c pp 357-385.
134
c 2. R.B. Lehoucq, "Analysis and Implementation of an Implicitly
135
c Restarted Arnoldi Iteration", Rice University Technical Report
136
c TR95-13, Department of Computational and Applied Mathematics.
137
c
138
c\Routines called:
139
c psgetv0 Parallel ARPACK routine to generate the initial vector.
140
c pivout Parallel ARPACK utility routine that prints integers.
141
c second ARPACK utility routine for timing.
142
c psmout Parallel ARPACK utility routine that prints matrices
143
c psvout Parallel ARPACK utility routine that prints vectors.
144
c slabad LAPACK routine that computes machine constants.
145
c pslamch ScaLAPACK routine that determines machine constants.
146
c slascl LAPACK routine for careful scaling of a matrix.
147
c slanhs LAPACK routine that computes various norms of a matrix.
148
c sgemv Level 2 BLAS routine for matrix vector multiplication.
149
c saxpy Level 1 BLAS that computes a vector triad.
150
c sscal Level 1 BLAS that scales a vector.
151
c scopy Level 1 BLAS that copies one vector to another .
152
c sdot Level 1 BLAS that computes the scalar product of two vectors.
153
c psnorm2 Parallel version of Level 1 BLAS that computes the norm of a vector.
154
c
155
c\Author
156
c Danny Sorensen Phuong Vu
157
c Richard Lehoucq CRPC / Rice University
158
c Dept. of Computational & Houston, Texas
159
c Applied Mathematics
160
c Rice University
161
c Houston, Texas
162
c
163
c\Parallel Modifications
164
c Kristi Maschhoff
165
c
166
c\Revision history:
167
c Starting Point: Serial Code FILE: naitr.F SID: 2.2
168
c
169
c\SCCS Information:
170
c FILE: naitr.F SID: 1.3 DATE OF SID: 3/19/97
171
c
172
c\Remarks
173
c The algorithm implemented is:
174
c
175
c restart = .false.
176
c Given V_{k} = [v_{1}, ..., v_{k}], r_{k};
177
c r_{k} contains the initial residual vector even for k = 0;
178
c Also assume that rnorm = || B*r_{k} || and B*r_{k} are already
179
c computed by the calling program.
180
c
181
c betaj = rnorm ; p_{k+1} = B*r_{k} ;
182
c For j = k+1, ..., k+np Do
183
c 1) if ( betaj < tol ) stop or restart depending on j.
184
c ( At present tol is zero )
185
c if ( restart ) generate a new starting vector.
186
c 2) v_{j} = r(j-1)/betaj; V_{j} = [V_{j-1}, v_{j}];
187
c p_{j} = p_{j}/betaj
188
c 3) r_{j} = OP*v_{j} where OP is defined as in psnaupd
189
c For shift-invert mode p_{j} = B*v_{j} is already available.
190
c wnorm = || OP*v_{j} ||
191
c 4) Compute the j-th step residual vector.
192
c w_{j} = V_{j}^T * B * OP * v_{j}
193
c r_{j} = OP*v_{j} - V_{j} * w_{j}
194
c H(:,j) = w_{j};
195
c H(j,j-1) = rnorm
196
c rnorm = || r_(j) ||
197
c If (rnorm > 0.717*wnorm) accept step and go back to 1)
198
c 5) Re-orthogonalization step:
199
c s = V_{j}'*B*r_{j}
200
c r_{j} = r_{j} - V_{j}*s; rnorm1 = || r_{j} ||
201
c alphaj = alphaj + s_{j};
202
c 6) Iterative refinement step:
203
c If (rnorm1 > 0.717*rnorm) then
204
c rnorm = rnorm1
205
c accept step and go back to 1)
206
c Else
207
c rnorm = rnorm1
208
c If this is the first time in step 6), go to 5)
209
c Else r_{j} lies in the span of V_{j} numerically.
210
c Set r_{j} = 0 and rnorm = 0; go to 1)
211
c EndIf
212
c End Do
213
c
214
c\EndLib
215
c
216
c-----------------------------------------------------------------------
217
c
218
subroutine psnaitr
219
& (comm, ido, bmat, n, k, np, nb, resid, rnorm, v, ldv, h, ldh,
220
& ipntr, workd, workl, info)
221
c
222
include 'mpif.h'
223
c
224
c %---------------%
225
c | MPI Variables |
226
c %---------------%
227
c
228
integer comm
229
c
230
c %----------------------------------------------------%
231
c | Include files for debugging and timing information |
232
c %----------------------------------------------------%
233
c
234
include 'debug.h'
235
include 'stat.h'
236
c
237
c %------------------%
238
c | Scalar Arguments |
239
c %------------------%
240
c
241
character bmat*1
242
integer ido, info, k, ldh, ldv, n, nb, np
243
Real
244
& rnorm
245
c
246
c %-----------------%
247
c | Array Arguments |
248
c %-----------------%
249
c
250
integer ipntr(3)
251
Real
252
& h(ldh,k+np), resid(n), v(ldv,k+np), workd(3*n),
253
& workl(2*ldh)
254
c
255
c %------------%
256
c | Parameters |
257
c %------------%
258
c
259
Real
260
& one, zero
261
parameter (one = 1.0, zero = 0.0)
262
c
263
c %---------------%
264
c | Local Scalars |
265
c %---------------%
266
c
267
logical first, orth1, orth2, rstart, step3, step4
268
integer ierr, i, infol, ipj, irj, ivj, iter, itry, j, msglvl,
269
& jj
270
Real
271
& betaj, ovfl, temp1, rnorm1, smlnum, tst1, ulp, unfl,
272
& wnorm
273
save first, orth1, orth2, rstart, step3, step4,
274
& ierr, ipj, irj, ivj, iter, itry, j, msglvl, ovfl,
275
& betaj, rnorm1, smlnum, ulp, unfl, wnorm
276
c
277
Real
278
& rnorm_buf
279
c
280
c
281
c %-----------------------%
282
c | Local Array Arguments |
283
c %-----------------------%
284
c
285
Real
286
& xtemp(2)
287
c
288
c %----------------------%
289
c | External Subroutines |
290
c %----------------------%
291
c
292
external saxpy, scopy, sscal, sgemv, psgetv0, slabad,
293
& psvout, psmout, pivout, second
294
c
295
c %--------------------%
296
c | External Functions |
297
c %--------------------%
298
c
299
Real
300
& sdot, psnorm2, slanhs, pslamch10
301
external sdot, psnorm2, slanhs, pslamch10
302
c
303
c %---------------------%
304
c | Intrinsic Functions |
305
c %---------------------%
306
c
307
intrinsic abs, sqrt
308
c
309
c %-----------------%
310
c | Data statements |
311
c %-----------------%
312
c
313
data first / .true. /
314
c
315
c %-----------------------%
316
c | Executable Statements |
317
c %-----------------------%
318
c
319
if (first) then
320
c
321
c %-----------------------------------------%
322
c | Set machine-dependent constants for the |
323
c | the splitting and deflation criterion. |
324
c | If norm(H) <= sqrt(OVFL), |
325
c | overflow should not occur. |
326
c | REFERENCE: LAPACK subroutine slahqr |
327
c %-----------------------------------------%
328
c
329
unfl = pslamch10(comm, 'safe minimum' )
330
ovfl = one / unfl
331
call slabad( unfl, ovfl )
332
ulp = pslamch10( comm, 'precision' )
333
smlnum = unfl*( n / ulp )
334
first = .false.
335
end if
336
c
337
if (ido .eq. 0) then
338
c
339
c %-------------------------------%
340
c | Initialize timing statistics |
341
c | & message level for debugging |
342
c %-------------------------------%
343
c
344
call second (t0)
345
msglvl = mnaitr
346
c
347
c %------------------------------%
348
c | Initial call to this routine |
349
c %------------------------------%
350
c
351
info = 0
352
step3 = .false.
353
step4 = .false.
354
rstart = .false.
355
orth1 = .false.
356
orth2 = .false.
357
j = k + 1
358
ipj = 1
359
irj = ipj + n
360
ivj = irj + n
361
end if
362
c
363
c %-------------------------------------------------%
364
c | When in reverse communication mode one of: |
365
c | STEP3, STEP4, ORTH1, ORTH2, RSTART |
366
c | will be .true. when .... |
367
c | STEP3: return from computing OP*v_{j}. |
368
c | STEP4: return from computing B-norm of OP*v_{j} |
369
c | ORTH1: return from computing B-norm of r_{j+1} |
370
c | ORTH2: return from computing B-norm of |
371
c | correction to the residual vector. |
372
c | RSTART: return from OP computations needed by |
373
c | psgetv0. |
374
c %-------------------------------------------------%
375
c
376
if (step3) go to 50
377
if (step4) go to 60
378
if (orth1) go to 70
379
if (orth2) go to 90
380
if (rstart) go to 30
381
c
382
c %-----------------------------%
383
c | Else this is the first step |
384
c %-----------------------------%
385
c
386
c %--------------------------------------------------------------%
387
c | |
388
c | A R N O L D I I T E R A T I O N L O O P |
389
c | |
390
c | Note: B*r_{j-1} is already in WORKD(1:N)=WORKD(IPJ:IPJ+N-1) |
391
c %--------------------------------------------------------------%
392
393
1000 continue
394
c
395
if (msglvl .gt. 1) then
396
call pivout (comm, logfil, 1, [j], ndigit,
397
& '_naitr: generating Arnoldi vector number')
398
call psvout (comm, logfil, 1, [rnorm], ndigit,
399
& '_naitr: B-norm of the current residual is')
400
end if
401
c
402
c %---------------------------------------------------%
403
c | STEP 1: Check if the B norm of j-th residual |
404
c | vector is zero. Equivalent to determing whether |
405
c | an exact j-step Arnoldi factorization is present. |
406
c %---------------------------------------------------%
407
c
408
betaj = rnorm
409
if (rnorm .gt. zero) go to 40
410
c
411
c %---------------------------------------------------%
412
c | Invariant subspace found, generate a new starting |
413
c | vector which is orthogonal to the current Arnoldi |
414
c | basis and continue the iteration. |
415
c %---------------------------------------------------%
416
c
417
if (msglvl .gt. 0) then
418
call pivout (comm, logfil, 1, [j], ndigit,
419
& '_naitr: ****** RESTART AT STEP ******')
420
end if
421
c
422
c %---------------------------------------------%
423
c | ITRY is the loop variable that controls the |
424
c | maximum amount of times that a restart is |
425
c | attempted. NRSTRT is used by stat.h |
426
c %---------------------------------------------%
427
c
428
betaj = zero
429
nrstrt = nrstrt + 1
430
itry = 1
431
20 continue
432
rstart = .true.
433
ido = 0
434
30 continue
435
c
436
c %--------------------------------------%
437
c | If in reverse communication mode and |
438
c | RSTART = .true. flow returns here. |
439
c %--------------------------------------%
440
c
441
call psgetv0 ( comm, ido, bmat, itry, .false., n, j, v, ldv,
442
& resid, rnorm, ipntr, workd, workl, ierr)
443
if (ido .ne. 99) go to 9000
444
if (ierr .lt. 0) then
445
itry = itry + 1
446
if (itry .le. 3) go to 20
447
c
448
c %------------------------------------------------%
449
c | Give up after several restart attempts. |
450
c | Set INFO to the size of the invariant subspace |
451
c | which spans OP and exit. |
452
c %------------------------------------------------%
453
c
454
info = j - 1
455
call second (t1)
456
tnaitr = tnaitr + (t1 - t0)
457
ido = 99
458
go to 9000
459
end if
460
c
461
40 continue
462
c
463
c %---------------------------------------------------------%
464
c | STEP 2: v_{j} = r_{j-1}/rnorm and p_{j} = p_{j}/rnorm |
465
c | Note that p_{j} = B*r_{j-1}. In order to avoid overflow |
466
c | when reciprocating a small RNORM, test against lower |
467
c | machine bound. |
468
c %---------------------------------------------------------%
469
c
470
call scopy (n, resid, 1, v(1,j), 1)
471
if (rnorm .ge. unfl) then
472
temp1 = one / rnorm
473
call sscal (n, temp1, v(1,j), 1)
474
call sscal (n, temp1, workd(ipj), 1)
475
else
476
c
477
c %-----------------------------------------%
478
c | To scale both v_{j} and p_{j} carefully |
479
c | use LAPACK routine SLASCL |
480
c %-----------------------------------------%
481
c
482
call slascl ('General', i, i, rnorm, one, n, 1,
483
& v(1,j), n, infol)
484
call slascl ('General', i, i, rnorm, one, n, 1,
485
& workd(ipj), n, infol)
486
end if
487
c
488
c %------------------------------------------------------%
489
c | STEP 3: r_{j} = OP*v_{j}; Note that p_{j} = B*v_{j} |
490
c | Note that this is not quite yet r_{j}. See STEP 4 |
491
c %------------------------------------------------------%
492
c
493
step3 = .true.
494
nopx = nopx + 1
495
call second (t2)
496
call scopy (n, v(1,j), 1, workd(ivj), 1)
497
ipntr(1) = ivj
498
ipntr(2) = irj
499
ipntr(3) = ipj
500
ido = 1
501
c
502
c %-----------------------------------%
503
c | Exit in order to compute OP*v_{j} |
504
c %-----------------------------------%
505
c
506
go to 9000
507
50 continue
508
c
509
c %----------------------------------%
510
c | Back from reverse communication; |
511
c | WORKD(IRJ:IRJ+N-1) := OP*v_{j} |
512
c | if step3 = .true. |
513
c %----------------------------------%
514
c
515
call second (t3)
516
tmvopx = tmvopx + (t3 - t2)
517
518
step3 = .false.
519
c
520
c %------------------------------------------%
521
c | Put another copy of OP*v_{j} into RESID. |
522
c %------------------------------------------%
523
c
524
call scopy (n, workd(irj), 1, resid, 1)
525
c
526
c %---------------------------------------%
527
c | STEP 4: Finish extending the Arnoldi |
528
c | factorization to length j. |
529
c %---------------------------------------%
530
c
531
call second (t2)
532
if (bmat .eq. 'G') then
533
nbx = nbx + 1
534
step4 = .true.
535
ipntr(1) = irj
536
ipntr(2) = ipj
537
ido = 2
538
c
539
c %-------------------------------------%
540
c | Exit in order to compute B*OP*v_{j} |
541
c %-------------------------------------%
542
c
543
go to 9000
544
else if (bmat .eq. 'I') then
545
call scopy (n, resid, 1, workd(ipj), 1)
546
end if
547
60 continue
548
c
549
c %----------------------------------%
550
c | Back from reverse communication; |
551
c | WORKD(IPJ:IPJ+N-1) := B*OP*v_{j} |
552
c | if step4 = .true. |
553
c %----------------------------------%
554
c
555
if (bmat .eq. 'G') then
556
call second (t3)
557
tmvbx = tmvbx + (t3 - t2)
558
end if
559
c
560
step4 = .false.
561
c
562
c %-------------------------------------%
563
c | The following is needed for STEP 5. |
564
c | Compute the B-norm of OP*v_{j}. |
565
c %-------------------------------------%
566
c
567
if (bmat .eq. 'G') then
568
rnorm_buf = sdot (n, resid, 1, workd(ipj), 1)
569
call MPI_ALLREDUCE( rnorm_buf, wnorm, 1,
570
& MPI_REAL, MPI_SUM, comm, ierr )
571
wnorm = sqrt(abs(wnorm))
572
else if (bmat .eq. 'I') then
573
wnorm = psnorm2( comm, n, resid, 1 )
574
end if
575
c
576
c %-----------------------------------------%
577
c | Compute the j-th residual corresponding |
578
c | to the j step factorization. |
579
c | Use Classical Gram Schmidt and compute: |
580
c | w_{j} <- V_{j}^T * B * OP * v_{j} |
581
c | r_{j} <- OP*v_{j} - V_{j} * w_{j} |
582
c %-----------------------------------------%
583
c
584
c
585
c %------------------------------------------%
586
c | Compute the j Fourier coefficients w_{j} |
587
c | WORKD(IPJ:IPJ+N-1) contains B*OP*v_{j}. |
588
c %------------------------------------------%
589
c
590
call sgemv ('T', n, j, one, v, ldv, workd(ipj), 1,
591
& zero, workl, 1)
592
call MPI_ALLREDUCE( workl, h(1,j), j,
593
& MPI_REAL, MPI_SUM, comm, ierr)
594
c
595
c %--------------------------------------%
596
c | Orthogonalize r_{j} against V_{j}. |
597
c | RESID contains OP*v_{j}. See STEP 3. |
598
c %--------------------------------------%
599
c
600
call sgemv ('N', n, j, -one, v, ldv, h(1,j), 1,
601
& one, resid, 1)
602
c
603
if (j .gt. 1) h(j,j-1) = betaj
604
c
605
call second (t4)
606
c
607
orth1 = .true.
608
c
609
call second (t2)
610
if (bmat .eq. 'G') then
611
nbx = nbx + 1
612
call scopy (n, resid, 1, workd(irj), 1)
613
ipntr(1) = irj
614
ipntr(2) = ipj
615
ido = 2
616
c
617
c %----------------------------------%
618
c | Exit in order to compute B*r_{j} |
619
c %----------------------------------%
620
c
621
go to 9000
622
else if (bmat .eq. 'I') then
623
call scopy (n, resid, 1, workd(ipj), 1)
624
end if
625
70 continue
626
c
627
c %---------------------------------------------------%
628
c | Back from reverse communication if ORTH1 = .true. |
629
c | WORKD(IPJ:IPJ+N-1) := B*r_{j}. |
630
c %---------------------------------------------------%
631
c
632
if (bmat .eq. 'G') then
633
call second (t3)
634
tmvbx = tmvbx + (t3 - t2)
635
end if
636
c
637
orth1 = .false.
638
c
639
c %------------------------------%
640
c | Compute the B-norm of r_{j}. |
641
c %------------------------------%
642
c
643
if (bmat .eq. 'G') then
644
rnorm_buf = sdot (n, resid, 1, workd(ipj), 1)
645
call MPI_ALLREDUCE( rnorm_buf, rnorm, 1,
646
& MPI_REAL, MPI_SUM, comm, ierr )
647
rnorm = sqrt(abs(rnorm))
648
else if (bmat .eq. 'I') then
649
rnorm = psnorm2( comm, n, resid, 1 )
650
end if
651
c
652
c %-----------------------------------------------------------%
653
c | STEP 5: Re-orthogonalization / Iterative refinement phase |
654
c | Maximum NITER_ITREF tries. |
655
c | |
656
c | s = V_{j}^T * B * r_{j} |
657
c | r_{j} = r_{j} - V_{j}*s |
658
c | alphaj = alphaj + s_{j} |
659
c | |
660
c | The stopping criteria used for iterative refinement is |
661
c | discussed in Parlett's book SEP, page 107 and in Gragg & |
662
c | Reichel ACM TOMS paper; Algorithm 686, Dec. 1990. |
663
c | Determine if we need to correct the residual. The goal is |
664
c | to enforce ||v(:,1:j)^T * r_{j}|| .le. eps * || r_{j} || |
665
c | The following test determines whether the sine of the |
666
c | angle between OP*x and the computed residual is less |
667
c | than or equal to 0.717. |
668
c %-----------------------------------------------------------%
669
c
670
if (rnorm .gt. 0.717*wnorm) go to 100
671
iter = 0
672
nrorth = nrorth + 1
673
c
674
c %---------------------------------------------------%
675
c | Enter the Iterative refinement phase. If further |
676
c | refinement is necessary, loop back here. The loop |
677
c | variable is ITER. Perform a step of Classical |
678
c | Gram-Schmidt using all the Arnoldi vectors V_{j} |
679
c %---------------------------------------------------%
680
c
681
80 continue
682
c
683
if (msglvl .gt. 2) then
684
xtemp(1) = wnorm
685
xtemp(2) = rnorm
686
call psvout (comm, logfil, 2, xtemp, ndigit,
687
& '_naitr: re-orthonalization; wnorm and rnorm are')
688
call psvout (comm, logfil, j, h(1,j), ndigit,
689
& '_naitr: j-th column of H')
690
end if
691
c
692
c %----------------------------------------------------%
693
c | Compute V_{j}^T * B * r_{j}. |
694
c | WORKD(IRJ:IRJ+J-1) = v(:,1:J)'*WORKD(IPJ:IPJ+N-1). |
695
c %----------------------------------------------------%
696
c
697
call sgemv ('T', n, j, one, v, ldv, workd(ipj), 1,
698
& zero, workl(j+1), 1)
699
call MPI_ALLREDUCE( workl(j+1), workl(1), j,
700
& MPI_REAL, MPI_SUM, comm, ierr)
701
c
702
c %---------------------------------------------%
703
c | Compute the correction to the residual: |
704
c | r_{j} = r_{j} - V_{j} * WORKD(IRJ:IRJ+J-1). |
705
c | The correction to H is v(:,1:J)*H(1:J,1:J) |
706
c | + v(:,1:J)*WORKD(IRJ:IRJ+J-1)*e'_j. |
707
c %---------------------------------------------%
708
c
709
call sgemv ('N', n, j, -one, v, ldv, workl(1), 1,
710
& one, resid, 1)
711
call saxpy (j, one, workl(1), 1, h(1,j), 1)
712
c
713
orth2 = .true.
714
call second (t2)
715
if (bmat .eq. 'G') then
716
nbx = nbx + 1
717
call scopy (n, resid, 1, workd(irj), 1)
718
ipntr(1) = irj
719
ipntr(2) = ipj
720
ido = 2
721
c
722
c %-----------------------------------%
723
c | Exit in order to compute B*r_{j}. |
724
c | r_{j} is the corrected residual. |
725
c %-----------------------------------%
726
c
727
go to 9000
728
else if (bmat .eq. 'I') then
729
call scopy (n, resid, 1, workd(ipj), 1)
730
end if
731
90 continue
732
c
733
c %---------------------------------------------------%
734
c | Back from reverse communication if ORTH2 = .true. |
735
c %---------------------------------------------------%
736
c
737
if (bmat .eq. 'G') then
738
call second (t3)
739
tmvbx = tmvbx + (t3 - t2)
740
end if
741
c
742
c %-----------------------------------------------------%
743
c | Compute the B-norm of the corrected residual r_{j}. |
744
c %-----------------------------------------------------%
745
c
746
if (bmat .eq. 'G') then
747
rnorm_buf = sdot (n, resid, 1, workd(ipj), 1)
748
call MPI_ALLREDUCE( rnorm_buf, rnorm1, 1,
749
& MPI_REAL, MPI_SUM, comm, ierr )
750
rnorm1 = sqrt(abs(rnorm1))
751
else if (bmat .eq. 'I') then
752
rnorm1 = psnorm2( comm, n, resid, 1 )
753
end if
754
c
755
if (msglvl .gt. 0 .and. iter .gt. 0) then
756
call pivout (comm, logfil, 1, [j], ndigit,
757
& '_naitr: Iterative refinement for Arnoldi residual')
758
if (msglvl .gt. 2) then
759
xtemp(1) = rnorm
760
xtemp(2) = rnorm1
761
call psvout (comm, logfil, 2, xtemp, ndigit,
762
& '_naitr: iterative refinement ; rnorm and rnorm1 are')
763
end if
764
end if
765
c
766
c %-----------------------------------------%
767
c | Determine if we need to perform another |
768
c | step of re-orthogonalization. |
769
c %-----------------------------------------%
770
c
771
if (rnorm1 .gt. 0.717*rnorm) then
772
c
773
c %---------------------------------------%
774
c | No need for further refinement. |
775
c | The cosine of the angle between the |
776
c | corrected residual vector and the old |
777
c | residual vector is greater than 0.717 |
778
c | In other words the corrected residual |
779
c | and the old residual vector share an |
780
c | angle of less than arcCOS(0.717) |
781
c %---------------------------------------%
782
c
783
rnorm = rnorm1
784
c
785
else
786
c
787
c %-------------------------------------------%
788
c | Another step of iterative refinement step |
789
c | is required. NITREF is used by stat.h |
790
c %-------------------------------------------%
791
c
792
nitref = nitref + 1
793
rnorm = rnorm1
794
iter = iter + 1
795
if (iter .le. 1) go to 80
796
c
797
c %-------------------------------------------------%
798
c | Otherwise RESID is numerically in the span of V |
799
c %-------------------------------------------------%
800
c
801
do 95 jj = 1, n
802
resid(jj) = zero
803
95 continue
804
rnorm = zero
805
end if
806
c
807
c %----------------------------------------------%
808
c | Branch here directly if iterative refinement |
809
c | wasn't necessary or after at most NITER_REF |
810
c | steps of iterative refinement. |
811
c %----------------------------------------------%
812
c
813
100 continue
814
c
815
rstart = .false.
816
orth2 = .false.
817
c
818
call second (t5)
819
titref = titref + (t5 - t4)
820
c
821
c %------------------------------------%
822
c | STEP 6: Update j = j+1; Continue |
823
c %------------------------------------%
824
c
825
j = j + 1
826
if (j .gt. k+np) then
827
call second (t1)
828
tnaitr = tnaitr + (t1 - t0)
829
ido = 99
830
do 110 i = max(1,k), k+np-1
831
c
832
c %--------------------------------------------%
833
c | Check for splitting and deflation. |
834
c | Use a standard test as in the QR algorithm |
835
c | REFERENCE: LAPACK subroutine slahqr |
836
c %--------------------------------------------%
837
c
838
tst1 = abs( h( i, i ) ) + abs( h( i+1, i+1 ) )
839
if( tst1.eq.zero )
840
& tst1 = slanhs( '1', k+np, h, ldh, workd(n+1) )
841
if( abs( h( i+1,i ) ).le.max( ulp*tst1, smlnum ) )
842
& h(i+1,i) = zero
843
110 continue
844
c
845
if (msglvl .gt. 2) then
846
call psmout (comm, logfil, k+np, k+np, h, ldh, ndigit,
847
& '_naitr: Final upper Hessenberg matrix H of order K+NP')
848
end if
849
c
850
go to 9000
851
end if
852
c
853
c %--------------------------------------------------------%
854
c | Loop back to extend the factorization by another step. |
855
c %--------------------------------------------------------%
856
c
857
go to 1000
858
c
859
c %---------------------------------------------------------------%
860
c | |
861
c | E N D O F M A I N I T E R A T I O N L O O P |
862
c | |
863
c %---------------------------------------------------------------%
864
c
865
9000 continue
866
return
867
c
868
c %----------------%
869
c | End of psnaitr |
870
c %----------------%
871
c
872
end
873
874