GitHub Repository: trixi-framework/Trixi.jl
Path: blob/main/src/solvers/dgsem_tree/dg_3d_compressible_euler.jl
# From here on, this file contains specializations of DG methods on the
# TreeMesh3D to the compressible Euler equations.
#
# The specialized methods contain relatively verbose and ugly code in the sense
# that we do not use the common "pointwise physics" interface of Trixi.jl.
# However, this is currently (November 2021) necessary to get a significant
# speed-up by using SIMD instructions via LoopVectorization.jl.
#
# TODO: SIMD. We could get even more speed-up if we did not need to permute
# array dimensions, i.e., if we changed the basic memory layout.
#
# We do not wrap this code in a `@muladd begin ... end` block. Optimizations like
# this are handled automatically by LoopVectorization.jl.

# We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array`
# if LoopVectorization.jl can handle the array types. This ensures that `@turbo`
# works efficiently here.
@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
                                           element, MeshT::Type{<:TreeMesh{3}},
                                           have_nonconservative_terms::False,
                                           equations::CompressibleEulerEquations3D,
                                           volume_flux::typeof(flux_shima_etal_turbo),
                                           dg::DGSEM, cache, alpha)
    @unpack derivative_split = dg.basis

    # Create a temporary array that will be used to store the RHS with permuted
    # indices `[i, j, k, v]` to allow using SIMD instructions.
    # `StrideArray`s with purely static dimensions do not allocate on the heap.
    du = StrideArray{eltype(u_cons)}(undef,
                                     (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,
                                      StaticInt(nvariables(equations))))
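    # Illustrative sizes (assuming `polydeg = 3`, i.e. `nnodes(dg) == 4`, for
    # concreteness): `du` then has the fully static size (4, 4, 4, 5), so
    # LoopVectorization.jl sees compile-time trip counts and no heap allocation
    # is required.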

    # Convert conserved to primitive variables on the given `element`.
    u_prim = StrideArray{eltype(u_cons)}(undef,
                                         (ntuple(_ -> StaticInt(nnodes(dg)),
                                                 ndims(MeshT))...,
                                          StaticInt(nvariables(equations))))

    @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
        rho = u_cons[1, i, j, k, element]
        rho_v1 = u_cons[2, i, j, k, element]
        rho_v2 = u_cons[3, i, j, k, element]
        rho_v3 = u_cons[4, i, j, k, element]
        rho_e_total = u_cons[5, i, j, k, element]

        v1 = rho_v1 / rho
        v2 = rho_v2 / rho
        v3 = rho_v3 / rho
        p = (equations.gamma - 1) *
            (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))

        u_prim[i, j, k, 1] = rho
        u_prim[i, j, k, 2] = v1
        u_prim[i, j, k, 3] = v2
        u_prim[i, j, k, 4] = v3
        u_prim[i, j, k, 5] = p
    end
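    # The pressure above is the ideal gas law p = (γ - 1) * (ρE - ρ‖v‖²/2),
    # arranged so that the kinetic energy term reuses the already-loaded
    # momenta, e.g. `rho_v1 * v1` instead of `rho * v1^2`.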

    # x direction
    # At first, we create new temporary arrays with permuted memory layout to
    # allow using SIMD instructions along the first dimension (which is contiguous
    # in memory).
    du_permuted = StrideArray{eltype(u_cons)}(undef,
                                              (StaticInt(nnodes(dg)^2),
                                               StaticInt(nnodes(dg)),
                                               StaticInt(nvariables(equations))))

    u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
                                                  (StaticInt(nnodes(dg)^2),
                                                   StaticInt(nnodes(dg)),
                                                   StaticInt(nvariables(equations))))

    @turbo for v in eachvariable(equations),
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        jk = j + nnodes(dg) * (k - 1)
        u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
    end
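    # The linear index `jk` folds the (j, k) pair into the contiguous first
    # dimension; e.g., assuming `nnodes(dg) == 4` for illustration,
    # j = 2, k = 3 yields jk = 2 + 4 * (3 - 1) = 10. Consecutive `jk` values
    # are thus adjacent in memory and can be loaded into one SIMD register.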
    fill!(du_permuted, zero(eltype(du_permuted)))

    # Next, we basically inline the volume flux. To allow SIMD vectorization and
    # still use the symmetry of the volume flux and the derivative matrix, we
    # loop over the triangular part in an outer loop and use a plain inner loop.
    for i in eachnode(dg), ii in (i + 1):nnodes(dg)
        @turbo for jk in Base.OneTo(nnodes(dg)^2)
            rho_ll = u_prim_permuted[jk, i, 1]
            v1_ll = u_prim_permuted[jk, i, 2]
            v2_ll = u_prim_permuted[jk, i, 3]
            v3_ll = u_prim_permuted[jk, i, 4]
            p_ll = u_prim_permuted[jk, i, 5]

            rho_rr = u_prim_permuted[jk, ii, 1]
            v1_rr = u_prim_permuted[jk, ii, 2]
            v2_rr = u_prim_permuted[jk, ii, 3]
            v3_rr = u_prim_permuted[jk, ii, 4]
            p_rr = u_prim_permuted[jk, ii, 5]

            # Compute required mean values
            rho_avg = 0.5 * (rho_ll + rho_rr)
            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            v3_avg = 0.5 * (v3_ll + v3_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
            pv1_avg = 0.5 * (p_ll * v1_rr + p_rr * v1_ll)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_avg * v1_avg
            f2 = f1 * v1_avg + p_avg
            f3 = f1 * v2_avg
            f4 = f1 * v3_avg
            f5 = p_avg * v1_avg * equations.inv_gamma_minus_one + f1 * kin_avg + pv1_avg

            # Add scaled fluxes to RHS
            factor_i = alpha * derivative_split[i, ii]
            du_permuted[jk, i, 1] += factor_i * f1
            du_permuted[jk, i, 2] += factor_i * f2
            du_permuted[jk, i, 3] += factor_i * f3
            du_permuted[jk, i, 4] += factor_i * f4
            du_permuted[jk, i, 5] += factor_i * f5

            factor_ii = alpha * derivative_split[ii, i]
            du_permuted[jk, ii, 1] += factor_ii * f1
            du_permuted[jk, ii, 2] += factor_ii * f2
            du_permuted[jk, ii, 3] += factor_ii * f3
            du_permuted[jk, ii, 4] += factor_ii * f4
            du_permuted[jk, ii, 5] += factor_ii * f5
        end
    end
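    # In the triangular loop above, each symmetric two-point flux is evaluated
    # once per node pair (i, ii) and scattered twice, weighted by
    # `derivative_split[i, ii]` and `derivative_split[ii, i]`, respectively.
    # The diagonal pairs i == ii can be skipped since the diagonal entries of
    # `derivative_split` are zero.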

    @turbo for v in eachvariable(equations),
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        jk = j + nnodes(dg) * (k - 1)
        du[i, j, k, v] = du_permuted[jk, i, v]
    end

    # y direction
    # A possible permutation of array dimensions with improved opportunities for
    # SIMD vectorization appeared to be slower than the direct version used here
    # in preliminary numerical experiments on an AVX2 system.
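    # In the loop below, the innermost index `i` addresses the contiguous first
    # dimension of `u_prim` and `du` directly, so `@turbo` can vectorize along
    # `i` without a permuted copy.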
    for j in eachnode(dg), jj in (j + 1):nnodes(dg)
        @turbo for k in eachnode(dg), i in eachnode(dg)
            rho_ll = u_prim[i, j, k, 1]
            v1_ll = u_prim[i, j, k, 2]
            v2_ll = u_prim[i, j, k, 3]
            v3_ll = u_prim[i, j, k, 4]
            p_ll = u_prim[i, j, k, 5]

            rho_rr = u_prim[i, jj, k, 1]
            v1_rr = u_prim[i, jj, k, 2]
            v2_rr = u_prim[i, jj, k, 3]
            v3_rr = u_prim[i, jj, k, 4]
            p_rr = u_prim[i, jj, k, 5]

            # Compute required mean values
            rho_avg = 0.5 * (rho_ll + rho_rr)
            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            v3_avg = 0.5 * (v3_ll + v3_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
            pv2_avg = 0.5 * (p_ll * v2_rr + p_rr * v2_ll)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_avg * v2_avg
            f2 = f1 * v1_avg
            f3 = f1 * v2_avg + p_avg
            f4 = f1 * v3_avg
            f5 = p_avg * v2_avg * equations.inv_gamma_minus_one + f1 * kin_avg + pv2_avg

            # Add scaled fluxes to RHS
            factor_j = alpha * derivative_split[j, jj]
            du[i, j, k, 1] += factor_j * f1
            du[i, j, k, 2] += factor_j * f2
            du[i, j, k, 3] += factor_j * f3
            du[i, j, k, 4] += factor_j * f4
            du[i, j, k, 5] += factor_j * f5

            factor_jj = alpha * derivative_split[jj, j]
            du[i, jj, k, 1] += factor_jj * f1
            du[i, jj, k, 2] += factor_jj * f2
            du[i, jj, k, 3] += factor_jj * f3
            du[i, jj, k, 4] += factor_jj * f4
            du[i, jj, k, 5] += factor_jj * f5
        end
    end

    # z direction
    # The memory layout is already optimal for SIMD vectorization in this loop.
    # We just squeeze the first two dimensions to make the code slightly faster.
    GC.@preserve u_prim begin
        u_prim_reshaped = PtrArray(pointer(u_prim),
                                   (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
                                    StaticInt(nvariables(equations))))

        du_reshaped = PtrArray(pointer(du),
                               (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
                                StaticInt(nvariables(equations))))
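        # These `PtrArray`s reinterpret the existing buffers with the first two
        # node dimensions fused into one of length `nnodes(dg)^2`; no data is
        # copied. `GC.@preserve` keeps `u_prim` alive while its raw pointer is
        # in use (`du` is still referenced after this block).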

        for k in eachnode(dg), kk in (k + 1):nnodes(dg)
            @turbo for ij in Base.OneTo(nnodes(dg)^2)
                rho_ll = u_prim_reshaped[ij, k, 1]
                v1_ll = u_prim_reshaped[ij, k, 2]
                v2_ll = u_prim_reshaped[ij, k, 3]
                v3_ll = u_prim_reshaped[ij, k, 4]
                p_ll = u_prim_reshaped[ij, k, 5]

                rho_rr = u_prim_reshaped[ij, kk, 1]
                v1_rr = u_prim_reshaped[ij, kk, 2]
                v2_rr = u_prim_reshaped[ij, kk, 3]
                v3_rr = u_prim_reshaped[ij, kk, 4]
                p_rr = u_prim_reshaped[ij, kk, 5]

                # Compute required mean values
                rho_avg = 0.5 * (rho_ll + rho_rr)
                v1_avg = 0.5 * (v1_ll + v1_rr)
                v2_avg = 0.5 * (v2_ll + v2_rr)
                v3_avg = 0.5 * (v3_ll + v3_rr)
                p_avg = 0.5 * (p_ll + p_rr)
                kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
                pv3_avg = 0.5 * (p_ll * v3_rr + p_rr * v3_ll)

                # Calculate fluxes depending on Cartesian orientation
                f1 = rho_avg * v3_avg
                f2 = f1 * v1_avg
                f3 = f1 * v2_avg
                f4 = f1 * v3_avg + p_avg
                f5 = p_avg * v3_avg * equations.inv_gamma_minus_one + f1 * kin_avg + pv3_avg

                # Add scaled fluxes to RHS
                factor_k = alpha * derivative_split[k, kk]
                du_reshaped[ij, k, 1] += factor_k * f1
                du_reshaped[ij, k, 2] += factor_k * f2
                du_reshaped[ij, k, 3] += factor_k * f3
                du_reshaped[ij, k, 4] += factor_k * f4
                du_reshaped[ij, k, 5] += factor_k * f5

                factor_kk = alpha * derivative_split[kk, k]
                du_reshaped[ij, kk, 1] += factor_kk * f1
                du_reshaped[ij, kk, 2] += factor_kk * f2
                du_reshaped[ij, kk, 3] += factor_kk * f3
                du_reshaped[ij, kk, 4] += factor_kk * f4
                du_reshaped[ij, kk, 5] += factor_kk * f5
            end
        end
    end # GC.@preserve u_prim

    # Finally, we add the temporary RHS computed here to the global RHS in the
    # given `element`.
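    # Note the layout difference: the global `_du` stores the variable index
    # first (`[v, i, j, k, element]`), whereas the element-local `du` stores it
    # last (`[i, j, k, v]`) so that the node loops run over contiguous memory.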
    @turbo for v in eachvariable(equations),
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        _du[v, i, j, k, element] += du[i, j, k, v]
    end
end
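
# This specialization is selected via dispatch when a DGSEM solver is set up
# with flux differencing and the corresponding `_turbo` volume flux, e.g.
# (illustrative sketch only; see the Trixi.jl elixirs for complete setups):
#
#     using Trixi
#     equations = CompressibleEulerEquations3D(1.4)
#     solver = DGSEM(polydeg = 3, surface_flux = flux_lax_friedrichs,
#                    volume_integral = VolumeIntegralFluxDifferencing(flux_shima_etal_turbo))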

@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
                                           element, MeshT::Type{<:TreeMesh{3}},
                                           have_nonconservative_terms::False,
                                           equations::CompressibleEulerEquations3D,
                                           volume_flux::typeof(flux_ranocha_turbo),
                                           dg::DGSEM, cache, alpha)
    @unpack derivative_split = dg.basis

    # Create a temporary array that will be used to store the RHS with permuted
    # indices `[i, j, k, v]` to allow using SIMD instructions.
    # `StrideArray`s with purely static dimensions do not allocate on the heap.
    du = StrideArray{eltype(u_cons)}(undef,
                                     (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,
                                      StaticInt(nvariables(equations))))

    # Convert conserved to primitive variables on the given `element`. In addition
    # to the usual primitive variables, we also compute logarithms of the density
    # and pressure to increase the performance of the required logarithmic mean
    # values.
    u_prim = StrideArray{eltype(u_cons)}(undef,
                                         (ntuple(_ -> StaticInt(nnodes(dg)),
                                                 ndims(MeshT))...,
                                          StaticInt(nvariables(equations) + 2))) # two extra slots for log(rho), log(p)

    @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
        rho = u_cons[1, i, j, k, element]
        rho_v1 = u_cons[2, i, j, k, element]
        rho_v2 = u_cons[3, i, j, k, element]
        rho_v3 = u_cons[4, i, j, k, element]
        rho_e_total = u_cons[5, i, j, k, element]

        v1 = rho_v1 / rho
        v2 = rho_v2 / rho
        v3 = rho_v3 / rho
        p = (equations.gamma - 1) *
            (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))

        u_prim[i, j, k, 1] = rho
        u_prim[i, j, k, 2] = v1
        u_prim[i, j, k, 3] = v2
        u_prim[i, j, k, 4] = v3
        u_prim[i, j, k, 5] = p
        u_prim[i, j, k, 6] = log(rho)
        u_prim[i, j, k, 7] = log(p)
    end
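    # Precomputing `log(rho)` and `log(p)` once per node requires O(n^3) `log`
    # evaluations per element, whereas the pairwise flux loops below are O(n^4);
    # there, the logarithmic means only need cheap sums and differences of the
    # stored logarithms.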

    # x direction
    # At first, we create new temporary arrays with permuted memory layout to
    # allow using SIMD instructions along the first dimension (which is contiguous
    # in memory).
    du_permuted = StrideArray{eltype(u_cons)}(undef,
                                              (StaticInt(nnodes(dg)^2),
                                               StaticInt(nnodes(dg)),
                                               StaticInt(nvariables(equations))))

    u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
                                                  (StaticInt(nnodes(dg)^2),
                                                   StaticInt(nnodes(dg)),
                                                   StaticInt(nvariables(equations) + 2)))

    @turbo for v in indices(u_prim, 4), # `eachvariable(equations)` would miss the two log variables
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        jk = j + nnodes(dg) * (k - 1)
        u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
    end
    fill!(du_permuted, zero(eltype(du_permuted)))

    # Next, we basically inline the volume flux. To allow SIMD vectorization and
    # still use the symmetry of the volume flux and the derivative matrix, we
    # loop over the triangular part in an outer loop and use a plain inner loop.
    for i in eachnode(dg), ii in (i + 1):nnodes(dg)
        @turbo for jk in Base.OneTo(nnodes(dg)^2)
            rho_ll = u_prim_permuted[jk, i, 1]
            v1_ll = u_prim_permuted[jk, i, 2]
            v2_ll = u_prim_permuted[jk, i, 3]
            v3_ll = u_prim_permuted[jk, i, 4]
            p_ll = u_prim_permuted[jk, i, 5]
            log_rho_ll = u_prim_permuted[jk, i, 6]
            log_p_ll = u_prim_permuted[jk, i, 7]

            rho_rr = u_prim_permuted[jk, ii, 1]
            v1_rr = u_prim_permuted[jk, ii, 2]
            v2_rr = u_prim_permuted[jk, ii, 3]
            v3_rr = u_prim_permuted[jk, ii, 4]
            p_rr = u_prim_permuted[jk, ii, 5]
            log_rho_rr = u_prim_permuted[jk, ii, 6]
            log_p_rr = u_prim_permuted[jk, ii, 7]

            # Compute required mean values
            # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
            # it efficiently. This is equivalent to
            #   rho_mean = ln_mean(rho_ll, rho_rr)
            x1 = rho_ll
            log_x1 = log_rho_ll
            y1 = rho_rr
            log_y1 = log_rho_rr
            x1_plus_y1 = x1 + y1
            y1_minus_x1 = y1 - x1
            z1 = y1_minus_x1^2 / x1_plus_y1^2
            special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
            regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
            rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)
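            # Why the series path is valid for z1 -> 0 (cf. `ln_mean`): with
            # w = (y - x) / (x + y) and z = w^2, one has
            #     log(y / x) = 2 * atanh(w) = 2 * w * (1 + z / 3 + z^2 / 5 + z^3 / 7 + ...)
            # and (y - x) / w = x + y, hence
            #     (y - x) / (log(y) - log(x)) = (x + y) / (2 + 2z/3 + 2z^2/5 + 2z^3/7 + ...),
            # which is `special_path1` truncated after the z^3 term. For
            # z1 < 1.0e-4, the truncation error is below machine precision.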

            # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
            # in exact arithmetic since
            #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
            #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
            # inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
            x2 = rho_ll * p_rr
            log_x2 = log_rho_ll + log_p_rr
            y2 = rho_rr * p_ll
            log_y2 = log_rho_rr + log_p_ll
            x2_plus_y2 = x2 + y2
            y2_minus_x2 = y2 - x2
            z2 = y2_minus_x2^2 / x2_plus_y2^2
            special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
            regular_path2 = (log_y2 - log_x2) / y2_minus_x2
            inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)

            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            v3_avg = 0.5 * (v3_ll + v3_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_mean * v1_avg
            f2 = f1 * v1_avg + p_avg
            f3 = f1 * v2_avg
            f4 = f1 * v3_avg
            f5 = f1 *
                 (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +
                 0.5 * (p_ll * v1_rr + p_rr * v1_ll)

            # Add scaled fluxes to RHS
            factor_i = alpha * derivative_split[i, ii]
            du_permuted[jk, i, 1] += factor_i * f1
            du_permuted[jk, i, 2] += factor_i * f2
            du_permuted[jk, i, 3] += factor_i * f3
            du_permuted[jk, i, 4] += factor_i * f4
            du_permuted[jk, i, 5] += factor_i * f5

            factor_ii = alpha * derivative_split[ii, i]
            du_permuted[jk, ii, 1] += factor_ii * f1
            du_permuted[jk, ii, 2] += factor_ii * f2
            du_permuted[jk, ii, 3] += factor_ii * f3
            du_permuted[jk, ii, 4] += factor_ii * f4
            du_permuted[jk, ii, 5] += factor_ii * f5
        end
    end

    @turbo for v in eachvariable(equations),
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        jk = j + nnodes(dg) * (k - 1)
        du[i, j, k, v] = du_permuted[jk, i, v]
    end

    # y direction
    # A possible permutation of array dimensions with improved opportunities for
    # SIMD vectorization appeared to be slower than the direct version used here
    # in preliminary numerical experiments on an AVX2 system.
    for j in eachnode(dg), jj in (j + 1):nnodes(dg)
        @turbo for k in eachnode(dg), i in eachnode(dg)
            rho_ll = u_prim[i, j, k, 1]
            v1_ll = u_prim[i, j, k, 2]
            v2_ll = u_prim[i, j, k, 3]
            v3_ll = u_prim[i, j, k, 4]
            p_ll = u_prim[i, j, k, 5]
            log_rho_ll = u_prim[i, j, k, 6]
            log_p_ll = u_prim[i, j, k, 7]

            rho_rr = u_prim[i, jj, k, 1]
            v1_rr = u_prim[i, jj, k, 2]
            v2_rr = u_prim[i, jj, k, 3]
            v3_rr = u_prim[i, jj, k, 4]
            p_rr = u_prim[i, jj, k, 5]
            log_rho_rr = u_prim[i, jj, k, 6]
            log_p_rr = u_prim[i, jj, k, 7]

            # Compute required mean values
            # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
            # it efficiently. This is equivalent to
            #   rho_mean = ln_mean(rho_ll, rho_rr)
            x1 = rho_ll
            log_x1 = log_rho_ll
            y1 = rho_rr
            log_y1 = log_rho_rr
            x1_plus_y1 = x1 + y1
            y1_minus_x1 = y1 - x1
            z1 = y1_minus_x1^2 / x1_plus_y1^2
            special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
            regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
            rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)

            # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
            # in exact arithmetic since
            #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
            #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
            # inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
            x2 = rho_ll * p_rr
            log_x2 = log_rho_ll + log_p_rr
            y2 = rho_rr * p_ll
            log_y2 = log_rho_rr + log_p_ll
            x2_plus_y2 = x2 + y2
            y2_minus_x2 = y2 - x2
            z2 = y2_minus_x2^2 / x2_plus_y2^2
            special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
            regular_path2 = (log_y2 - log_x2) / y2_minus_x2
            inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)

            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            v3_avg = 0.5 * (v3_ll + v3_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_mean * v2_avg
            f2 = f1 * v1_avg
            f3 = f1 * v2_avg + p_avg
            f4 = f1 * v3_avg
            f5 = f1 *
                 (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +
                 0.5 * (p_ll * v2_rr + p_rr * v2_ll)

            # Add scaled fluxes to RHS
            factor_j = alpha * derivative_split[j, jj]
            du[i, j, k, 1] += factor_j * f1
            du[i, j, k, 2] += factor_j * f2
            du[i, j, k, 3] += factor_j * f3
            du[i, j, k, 4] += factor_j * f4
            du[i, j, k, 5] += factor_j * f5

            factor_jj = alpha * derivative_split[jj, j]
            du[i, jj, k, 1] += factor_jj * f1
            du[i, jj, k, 2] += factor_jj * f2
            du[i, jj, k, 3] += factor_jj * f3
            du[i, jj, k, 4] += factor_jj * f4
            du[i, jj, k, 5] += factor_jj * f5
        end
    end

    # z direction
    # The memory layout is already optimal for SIMD vectorization in this loop.
    # We just squeeze the first two dimensions to make the code slightly faster.
    GC.@preserve u_prim begin
        u_prim_reshaped = PtrArray(pointer(u_prim),
                                   (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
                                    StaticInt(nvariables(equations) + 2)))

        du_reshaped = PtrArray(pointer(du),
                               (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
                                StaticInt(nvariables(equations))))

        for k in eachnode(dg), kk in (k + 1):nnodes(dg)
            @turbo for ij in Base.OneTo(nnodes(dg)^2)
                rho_ll = u_prim_reshaped[ij, k, 1]
                v1_ll = u_prim_reshaped[ij, k, 2]
                v2_ll = u_prim_reshaped[ij, k, 3]
                v3_ll = u_prim_reshaped[ij, k, 4]
                p_ll = u_prim_reshaped[ij, k, 5]
                log_rho_ll = u_prim_reshaped[ij, k, 6]
                log_p_ll = u_prim_reshaped[ij, k, 7]

                rho_rr = u_prim_reshaped[ij, kk, 1]
                v1_rr = u_prim_reshaped[ij, kk, 2]
                v2_rr = u_prim_reshaped[ij, kk, 3]
                v3_rr = u_prim_reshaped[ij, kk, 4]
                p_rr = u_prim_reshaped[ij, kk, 5]
                log_rho_rr = u_prim_reshaped[ij, kk, 6]
                log_p_rr = u_prim_reshaped[ij, kk, 7]

                # Compute required mean values
                # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
                # it efficiently. This is equivalent to
                #   rho_mean = ln_mean(rho_ll, rho_rr)
                x1 = rho_ll
                log_x1 = log_rho_ll
                y1 = rho_rr
                log_y1 = log_rho_rr
                x1_plus_y1 = x1 + y1
                y1_minus_x1 = y1 - x1
                z1 = y1_minus_x1^2 / x1_plus_y1^2
                special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
                regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
                rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)

                # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
                # in exact arithmetic since
                #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
                #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
                # inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
                x2 = rho_ll * p_rr
                log_x2 = log_rho_ll + log_p_rr
                y2 = rho_rr * p_ll
                log_y2 = log_rho_rr + log_p_ll
                x2_plus_y2 = x2 + y2
                y2_minus_x2 = y2 - x2
                z2 = y2_minus_x2^2 / x2_plus_y2^2
                special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
                regular_path2 = (log_y2 - log_x2) / y2_minus_x2
                inv_rho_p_mean = p_ll * p_rr *
                                 ifelse(z2 < 1.0e-4, special_path2, regular_path2)

                v1_avg = 0.5 * (v1_ll + v1_rr)
                v2_avg = 0.5 * (v2_ll + v2_rr)
                v3_avg = 0.5 * (v3_ll + v3_rr)
                p_avg = 0.5 * (p_ll + p_rr)
                velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)

                # Calculate fluxes depending on Cartesian orientation
                f1 = rho_mean * v3_avg
                f2 = f1 * v1_avg
                f3 = f1 * v2_avg
                f4 = f1 * v3_avg + p_avg
                f5 = f1 *
                     (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +
                     0.5 * (p_ll * v3_rr + p_rr * v3_ll)

                # Add scaled fluxes to RHS
                factor_k = alpha * derivative_split[k, kk]
                du_reshaped[ij, k, 1] += factor_k * f1
                du_reshaped[ij, k, 2] += factor_k * f2
                du_reshaped[ij, k, 3] += factor_k * f3
                du_reshaped[ij, k, 4] += factor_k * f4
                du_reshaped[ij, k, 5] += factor_k * f5

                factor_kk = alpha * derivative_split[kk, k]
                du_reshaped[ij, kk, 1] += factor_kk * f1
                du_reshaped[ij, kk, 2] += factor_kk * f2
                du_reshaped[ij, kk, 3] += factor_kk * f3
                du_reshaped[ij, kk, 4] += factor_kk * f4
                du_reshaped[ij, kk, 5] += factor_kk * f5
            end
        end
    end # GC.@preserve u_prim

    # Finally, we add the temporary RHS computed here to the global RHS in the
    # given `element`.
    @turbo for v in eachvariable(equations),
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        _du[v, i, j, k, element] += du[i, j, k, v]
    end
end
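
# A quick consistency check of the inlined logarithmic mean against its direct
# definition (illustrative sketch only, not part of the upstream file):
#
#     x, y = 1.0, 1.001
#     z = (y - x)^2 / (x + y)^2
#     series = (x + y) / (2 + z * (2 / 3 + z * (2 / 5 + 2 / 7 * z)))
#     direct = (y - x) / (log(y) - log(x))
#     @assert isapprox(series, direct; rtol = 1.0e-12)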