Book a Demo!
CoCalc Logo Icon
StoreFeaturesDocsShareSupportNewsAboutPoliciesSign UpSign In
trixi-framework
GitHub Repository: trixi-framework/Trixi.jl
Path: blob/main/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl
5590 views
1
2
# From here on, this file contains specializations of DG methods on the
3
# curved 3D meshes `StructuredMesh{3}, P4estMesh{3}` to the compressible
4
# Euler equations.
5
#
6
# The specialized methods contain relatively verbose and ugly code in the sense
7
# that we do not use the common "pointwise physics" interface of Trixi.jl.
8
# However, this is currently (November 2021) necessary to get a significant
9
# speed-up by using SIMD instructions via LoopVectorization.jl.
10
#
11
# TODO: SIMD. We could get even more speed-up if we did not need to permute
12
# array dimensions, i.e., if we changed the basic memory layout.
13
#
14
# We do not wrap this code in `@muladd begin ... end` block. Optimizations like
15
# this are handled automatically by LoopVectorization.jl.
16
17
# We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array`
18
# if LoopVectorization.jl can handle the array types. This ensures that `@turbo`
19
# works efficiently here.
20
@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
21
element,
22
MeshT::Type{<:Union{StructuredMesh{3},
23
P4estMesh{3}}},
24
have_nonconservative_terms::False,
25
equations::CompressibleEulerEquations3D,
26
volume_flux::typeof(flux_shima_etal_turbo),
27
dg::DGSEM, cache, alpha)
28
@unpack derivative_split = dg.basis
29
@unpack contravariant_vectors = cache.elements
30
31
# Create a temporary array that will be used to store the RHS with permuted
32
# indices `[i, j, k, v]` to allow using SIMD instructions.
33
# `StrideArray`s with purely static dimensions do not allocate on the heap.
34
du = StrideArray{eltype(u_cons)}(undef,
35
(ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,
36
StaticInt(nvariables(equations))))
37
38
# Convert conserved to primitive variables on the given `element`.
39
u_prim = StrideArray{eltype(u_cons)}(undef,
40
(ntuple(_ -> StaticInt(nnodes(dg)),
41
ndims(MeshT))...,
42
StaticInt(nvariables(equations))))
43
44
@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
45
rho = u_cons[1, i, j, k, element]
46
rho_v1 = u_cons[2, i, j, k, element]
47
rho_v2 = u_cons[3, i, j, k, element]
48
rho_v3 = u_cons[4, i, j, k, element]
49
rho_e_total = u_cons[5, i, j, k, element]
50
51
v1 = rho_v1 / rho
52
v2 = rho_v2 / rho
53
v3 = rho_v3 / rho
54
p = (equations.gamma - 1) *
55
(rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))
56
57
u_prim[i, j, k, 1] = rho
58
u_prim[i, j, k, 2] = v1
59
u_prim[i, j, k, 3] = v2
60
u_prim[i, j, k, 4] = v3
61
u_prim[i, j, k, 5] = p
62
end
63
64
# x direction
65
# At first, we create new temporary arrays with permuted memory layout to
66
# allow using SIMD instructions along the first dimension (which is contiguous
67
# in memory).
68
du_permuted = StrideArray{eltype(u_cons)}(undef,
69
(StaticInt(nnodes(dg)^2),
70
StaticInt(nnodes(dg)),
71
StaticInt(nvariables(equations))))
72
73
u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
74
(StaticInt(nnodes(dg)^2),
75
StaticInt(nnodes(dg)),
76
StaticInt(nvariables(equations))))
77
78
@turbo for v in eachvariable(equations),
79
k in eachnode(dg),
80
j in eachnode(dg),
81
i in eachnode(dg)
82
83
jk = j + nnodes(dg) * (k - 1)
84
u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
85
end
86
fill!(du_permuted, zero(eltype(du_permuted)))
87
88
# We must also permute the contravariant vectors.
89
contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,
90
(StaticInt(nnodes(dg)^2),
91
StaticInt(nnodes(dg)),
92
StaticInt(ndims(MeshT))))
93
94
@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
95
jk = j + nnodes(dg) * (k - 1)
96
contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k, element]
97
contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k, element]
98
contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k, element]
99
end
100
101
# Next, we basically inline the volume flux. To allow SIMD vectorization and
102
# still use the symmetry of the volume flux and the derivative matrix, we
103
# loop over the triangular part in an outer loop and use a plain inner loop.
104
for i in eachnode(dg), ii in (i + 1):nnodes(dg)
105
@turbo for jk in Base.OneTo(nnodes(dg)^2)
106
rho_ll = u_prim_permuted[jk, i, 1]
107
v1_ll = u_prim_permuted[jk, i, 2]
108
v2_ll = u_prim_permuted[jk, i, 3]
109
v3_ll = u_prim_permuted[jk, i, 4]
110
p_ll = u_prim_permuted[jk, i, 5]
111
112
rho_rr = u_prim_permuted[jk, ii, 1]
113
v1_rr = u_prim_permuted[jk, ii, 2]
114
v2_rr = u_prim_permuted[jk, ii, 3]
115
v3_rr = u_prim_permuted[jk, ii, 4]
116
p_rr = u_prim_permuted[jk, ii, 5]
117
118
normal_direction_1 = 0.5 * (contravariant_vectors_x[jk, i, 1] +
119
contravariant_vectors_x[jk, ii, 1])
120
normal_direction_2 = 0.5 * (contravariant_vectors_x[jk, i, 2] +
121
contravariant_vectors_x[jk, ii, 2])
122
normal_direction_3 = 0.5 * (contravariant_vectors_x[jk, i, 3] +
123
contravariant_vectors_x[jk, ii, 3])
124
125
v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
126
v3_ll * normal_direction_3
127
v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
128
v3_rr * normal_direction_3
129
130
# Compute required mean values
131
rho_avg = 0.5 * (rho_ll + rho_rr)
132
v1_avg = 0.5 * (v1_ll + v1_rr)
133
v2_avg = 0.5 * (v2_ll + v2_rr)
134
v3_avg = 0.5 * (v3_ll + v3_rr)
135
v_dot_n_avg = 0.5 * (v_dot_n_ll + v_dot_n_rr)
136
p_avg = 0.5 * (p_ll + p_rr)
137
velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
138
139
# Calculate fluxes depending on normal_direction
140
f1 = rho_avg * v_dot_n_avg
141
f2 = f1 * v1_avg + p_avg * normal_direction_1
142
f3 = f1 * v2_avg + p_avg * normal_direction_2
143
f4 = f1 * v3_avg + p_avg * normal_direction_3
144
f5 = (f1 * velocity_square_avg +
145
p_avg * v_dot_n_avg * equations.inv_gamma_minus_one
146
+ 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
147
148
# Add scaled fluxes to RHS
149
factor_i = alpha * derivative_split[i, ii]
150
du_permuted[jk, i, 1] += factor_i * f1
151
du_permuted[jk, i, 2] += factor_i * f2
152
du_permuted[jk, i, 3] += factor_i * f3
153
du_permuted[jk, i, 4] += factor_i * f4
154
du_permuted[jk, i, 5] += factor_i * f5
155
156
factor_ii = alpha * derivative_split[ii, i]
157
du_permuted[jk, ii, 1] += factor_ii * f1
158
du_permuted[jk, ii, 2] += factor_ii * f2
159
du_permuted[jk, ii, 3] += factor_ii * f3
160
du_permuted[jk, ii, 4] += factor_ii * f4
161
du_permuted[jk, ii, 5] += factor_ii * f5
162
end
163
end
164
165
@turbo for v in eachvariable(equations),
166
k in eachnode(dg),
167
j in eachnode(dg),
168
i in eachnode(dg)
169
170
jk = j + nnodes(dg) * (k - 1)
171
du[i, j, k, v] = du_permuted[jk, i, v]
172
end
173
174
# y direction
175
# We must also permute the contravariant vectors.
176
contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,
177
(StaticInt(nnodes(dg)),
178
StaticInt(nnodes(dg)),
179
StaticInt(nnodes(dg)),
180
StaticInt(ndims(MeshT))))
181
182
@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
183
contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, element]
184
contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k, element]
185
contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k, element]
186
end
187
188
# A possible permutation of array dimensions with improved opportunities for
189
# SIMD vectorization appeared to be slower than the direct version used here
190
# in preliminary numerical experiments on an AVX2 system.
191
for j in eachnode(dg), jj in (j + 1):nnodes(dg)
192
@turbo for k in eachnode(dg), i in eachnode(dg)
193
rho_ll = u_prim[i, j, k, 1]
194
v1_ll = u_prim[i, j, k, 2]
195
v2_ll = u_prim[i, j, k, 3]
196
v3_ll = u_prim[i, j, k, 4]
197
p_ll = u_prim[i, j, k, 5]
198
199
rho_rr = u_prim[i, jj, k, 1]
200
v1_rr = u_prim[i, jj, k, 2]
201
v2_rr = u_prim[i, jj, k, 3]
202
v3_rr = u_prim[i, jj, k, 4]
203
p_rr = u_prim[i, jj, k, 5]
204
205
normal_direction_1 = 0.5 * (contravariant_vectors_y[i, j, k, 1] +
206
contravariant_vectors_y[i, jj, k, 1])
207
normal_direction_2 = 0.5 * (contravariant_vectors_y[i, j, k, 2] +
208
contravariant_vectors_y[i, jj, k, 2])
209
normal_direction_3 = 0.5 * (contravariant_vectors_y[i, j, k, 3] +
210
contravariant_vectors_y[i, jj, k, 3])
211
212
v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
213
v3_ll * normal_direction_3
214
v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
215
v3_rr * normal_direction_3
216
217
# Compute required mean values
218
rho_avg = 0.5 * (rho_ll + rho_rr)
219
v1_avg = 0.5 * (v1_ll + v1_rr)
220
v2_avg = 0.5 * (v2_ll + v2_rr)
221
v3_avg = 0.5 * (v3_ll + v3_rr)
222
v_dot_n_avg = 0.5 * (v_dot_n_ll + v_dot_n_rr)
223
p_avg = 0.5 * (p_ll + p_rr)
224
velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
225
226
# Calculate fluxes depending on normal_direction
227
f1 = rho_avg * v_dot_n_avg
228
f2 = f1 * v1_avg + p_avg * normal_direction_1
229
f3 = f1 * v2_avg + p_avg * normal_direction_2
230
f4 = f1 * v3_avg + p_avg * normal_direction_3
231
f5 = (f1 * velocity_square_avg +
232
p_avg * v_dot_n_avg * equations.inv_gamma_minus_one
233
+ 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
234
235
# Add scaled fluxes to RHS
236
factor_j = alpha * derivative_split[j, jj]
237
du[i, j, k, 1] += factor_j * f1
238
du[i, j, k, 2] += factor_j * f2
239
du[i, j, k, 3] += factor_j * f3
240
du[i, j, k, 4] += factor_j * f4
241
du[i, j, k, 5] += factor_j * f5
242
243
factor_jj = alpha * derivative_split[jj, j]
244
du[i, jj, k, 1] += factor_jj * f1
245
du[i, jj, k, 2] += factor_jj * f2
246
du[i, jj, k, 3] += factor_jj * f3
247
du[i, jj, k, 4] += factor_jj * f4
248
du[i, jj, k, 5] += factor_jj * f5
249
end
250
end
251
252
# z direction
253
# The memory layout is already optimal for SIMD vectorization in this loop.
254
# We just squeeze the first two dimensions to make the code slightly faster.
255
GC.@preserve u_prim begin
256
u_prim_reshaped = PtrArray(pointer(u_prim),
257
(StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
258
StaticInt(nvariables(equations))))
259
260
du_reshaped = PtrArray(pointer(du),
261
(StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
262
StaticInt(nvariables(equations))))
263
264
# We must also permute the contravariant vectors.
265
contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef,
266
(StaticInt(nnodes(dg)^2),
267
StaticInt(nnodes(dg)),
268
StaticInt(ndims(MeshT))))
269
270
@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
271
ij = i + nnodes(dg) * (j - 1)
272
contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k,
273
element]
274
contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k,
275
element]
276
contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k,
277
element]
278
end
279
280
for k in eachnode(dg), kk in (k + 1):nnodes(dg)
281
@turbo for ij in Base.OneTo(nnodes(dg)^2)
282
rho_ll = u_prim_reshaped[ij, k, 1]
283
v1_ll = u_prim_reshaped[ij, k, 2]
284
v2_ll = u_prim_reshaped[ij, k, 3]
285
v3_ll = u_prim_reshaped[ij, k, 4]
286
p_ll = u_prim_reshaped[ij, k, 5]
287
288
rho_rr = u_prim_reshaped[ij, kk, 1]
289
v1_rr = u_prim_reshaped[ij, kk, 2]
290
v2_rr = u_prim_reshaped[ij, kk, 3]
291
v3_rr = u_prim_reshaped[ij, kk, 4]
292
p_rr = u_prim_reshaped[ij, kk, 5]
293
294
normal_direction_1 = 0.5 * (contravariant_vectors_z[ij, k, 1] +
295
contravariant_vectors_z[ij, kk, 1])
296
normal_direction_2 = 0.5 * (contravariant_vectors_z[ij, k, 2] +
297
contravariant_vectors_z[ij, kk, 2])
298
normal_direction_3 = 0.5 * (contravariant_vectors_z[ij, k, 3] +
299
contravariant_vectors_z[ij, kk, 3])
300
301
v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
302
v3_ll * normal_direction_3
303
v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
304
v3_rr * normal_direction_3
305
306
# Compute required mean values
307
rho_avg = 0.5 * (rho_ll + rho_rr)
308
v1_avg = 0.5 * (v1_ll + v1_rr)
309
v2_avg = 0.5 * (v2_ll + v2_rr)
310
v3_avg = 0.5 * (v3_ll + v3_rr)
311
v_dot_n_avg = 0.5 * (v_dot_n_ll + v_dot_n_rr)
312
p_avg = 0.5 * (p_ll + p_rr)
313
velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
314
315
# Calculate fluxes depending on normal_direction
316
f1 = rho_avg * v_dot_n_avg
317
f2 = f1 * v1_avg + p_avg * normal_direction_1
318
f3 = f1 * v2_avg + p_avg * normal_direction_2
319
f4 = f1 * v3_avg + p_avg * normal_direction_3
320
f5 = (f1 * velocity_square_avg +
321
p_avg * v_dot_n_avg * equations.inv_gamma_minus_one
322
+ 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
323
324
# Add scaled fluxes to RHS
325
factor_k = alpha * derivative_split[k, kk]
326
du_reshaped[ij, k, 1] += factor_k * f1
327
du_reshaped[ij, k, 2] += factor_k * f2
328
du_reshaped[ij, k, 3] += factor_k * f3
329
du_reshaped[ij, k, 4] += factor_k * f4
330
du_reshaped[ij, k, 5] += factor_k * f5
331
332
factor_kk = alpha * derivative_split[kk, k]
333
du_reshaped[ij, kk, 1] += factor_kk * f1
334
du_reshaped[ij, kk, 2] += factor_kk * f2
335
du_reshaped[ij, kk, 3] += factor_kk * f3
336
du_reshaped[ij, kk, 4] += factor_kk * f4
337
du_reshaped[ij, kk, 5] += factor_kk * f5
338
end
339
end
340
end # GC.@preserve u_prim begin
341
342
# Finally, we add the temporary RHS computed here to the global RHS in the
343
# given `element`.
344
@turbo for v in eachvariable(equations),
345
k in eachnode(dg),
346
j in eachnode(dg),
347
i in eachnode(dg)
348
349
_du[v, i, j, k, element] += du[i, j, k, v]
350
end
351
end
352
353
@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
354
element,
355
MeshT::Type{<:Union{StructuredMesh{3},
356
P4estMesh{3}}},
357
have_nonconservative_terms::False,
358
equations::CompressibleEulerEquations3D,
359
volume_flux::typeof(flux_ranocha_turbo),
360
dg::DGSEM, cache, alpha)
361
@unpack derivative_split = dg.basis
362
@unpack contravariant_vectors = cache.elements
363
364
# Create a temporary array that will be used to store the RHS with permuted
365
# indices `[i, j, k, v]` to allow using SIMD instructions.
366
# `StrideArray`s with purely static dimensions do not allocate on the heap.
367
du = StrideArray{eltype(u_cons)}(undef,
368
(ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,
369
StaticInt(nvariables(equations))))
370
371
# Convert conserved to primitive variables on the given `element`. In addition
372
# to the usual primitive variables, we also compute logarithms of the density
373
# and pressure to increase the performance of the required logarithmic mean
374
# values.
375
u_prim = StrideArray{eltype(u_cons)}(undef,
376
(ntuple(_ -> StaticInt(nnodes(dg)),
377
ndims(MeshT))...,
378
StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs
379
380
@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
381
rho = u_cons[1, i, j, k, element]
382
rho_v1 = u_cons[2, i, j, k, element]
383
rho_v2 = u_cons[3, i, j, k, element]
384
rho_v3 = u_cons[4, i, j, k, element]
385
rho_e_total = u_cons[5, i, j, k, element]
386
387
v1 = rho_v1 / rho
388
v2 = rho_v2 / rho
389
v3 = rho_v3 / rho
390
p = (equations.gamma - 1) *
391
(rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))
392
393
u_prim[i, j, k, 1] = rho
394
u_prim[i, j, k, 2] = v1
395
u_prim[i, j, k, 3] = v2
396
u_prim[i, j, k, 4] = v3
397
u_prim[i, j, k, 5] = p
398
u_prim[i, j, k, 6] = log(rho)
399
u_prim[i, j, k, 7] = log(p)
400
end
401
402
# x direction
403
# At first, we create new temporary arrays with permuted memory layout to
404
# allow using SIMD instructions along the first dimension (which is contiguous
405
# in memory).
406
du_permuted = StrideArray{eltype(u_cons)}(undef,
407
(StaticInt(nnodes(dg)^2),
408
StaticInt(nnodes(dg)),
409
StaticInt(nvariables(equations))))
410
411
u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
412
(StaticInt(nnodes(dg)^2),
413
StaticInt(nnodes(dg)),
414
StaticInt(nvariables(equations) + 2)))
415
416
@turbo for v in indices(u_prim, 4), # v in eachvariable(equations) misses +2 logs
417
k in eachnode(dg),
418
j in eachnode(dg),
419
i in eachnode(dg)
420
421
jk = j + nnodes(dg) * (k - 1)
422
u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
423
end
424
fill!(du_permuted, zero(eltype(du_permuted)))
425
426
# We must also permute the contravariant vectors.
427
contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,
428
(StaticInt(nnodes(dg)^2),
429
StaticInt(nnodes(dg)),
430
StaticInt(ndims(MeshT))))
431
432
@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
433
jk = j + nnodes(dg) * (k - 1)
434
contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k, element]
435
contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k, element]
436
contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k, element]
437
end
438
439
# Next, we basically inline the volume flux. To allow SIMD vectorization and
440
# still use the symmetry of the volume flux and the derivative matrix, we
441
# loop over the triangular part in an outer loop and use a plain inner loop.
442
for i in eachnode(dg), ii in (i + 1):nnodes(dg)
443
@turbo for jk in Base.OneTo(nnodes(dg)^2)
444
rho_ll = u_prim_permuted[jk, i, 1]
445
v1_ll = u_prim_permuted[jk, i, 2]
446
v2_ll = u_prim_permuted[jk, i, 3]
447
v3_ll = u_prim_permuted[jk, i, 4]
448
p_ll = u_prim_permuted[jk, i, 5]
449
log_rho_ll = u_prim_permuted[jk, i, 6]
450
log_p_ll = u_prim_permuted[jk, i, 7]
451
452
rho_rr = u_prim_permuted[jk, ii, 1]
453
v1_rr = u_prim_permuted[jk, ii, 2]
454
v2_rr = u_prim_permuted[jk, ii, 3]
455
v3_rr = u_prim_permuted[jk, ii, 4]
456
p_rr = u_prim_permuted[jk, ii, 5]
457
log_rho_rr = u_prim_permuted[jk, ii, 6]
458
log_p_rr = u_prim_permuted[jk, ii, 7]
459
460
normal_direction_1 = 0.5 * (contravariant_vectors_x[jk, i, 1] +
461
contravariant_vectors_x[jk, ii, 1])
462
normal_direction_2 = 0.5 * (contravariant_vectors_x[jk, i, 2] +
463
contravariant_vectors_x[jk, ii, 2])
464
normal_direction_3 = 0.5 * (contravariant_vectors_x[jk, i, 3] +
465
contravariant_vectors_x[jk, ii, 3])
466
467
v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
468
v3_ll * normal_direction_3
469
v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
470
v3_rr * normal_direction_3
471
472
# Compute required mean values
473
# We inline the logarithmic mean to allow LoopVectorization.jl to optimize
474
# it efficiently. This is equivalent to
475
# rho_mean = ln_mean(rho_ll, rho_rr)
476
x1 = rho_ll
477
log_x1 = log_rho_ll
478
y1 = rho_rr
479
log_y1 = log_rho_rr
480
x1_plus_y1 = x1 + y1
481
y1_minus_x1 = y1 - x1
482
z1 = y1_minus_x1^2 / x1_plus_y1^2
483
special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
484
regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
485
rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)
486
487
# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
488
# in exact arithmetic since
489
# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
490
# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
491
# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
492
x2 = rho_ll * p_rr
493
log_x2 = log_rho_ll + log_p_rr
494
y2 = rho_rr * p_ll
495
log_y2 = log_rho_rr + log_p_ll
496
x2_plus_y2 = x2 + y2
497
y2_minus_x2 = y2 - x2
498
z2 = y2_minus_x2^2 / x2_plus_y2^2
499
special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
500
regular_path2 = (log_y2 - log_x2) / y2_minus_x2
501
inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)
502
503
v1_avg = 0.5 * (v1_ll + v1_rr)
504
v2_avg = 0.5 * (v2_ll + v2_rr)
505
v3_avg = 0.5 * (v3_ll + v3_rr)
506
p_avg = 0.5 * (p_ll + p_rr)
507
velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
508
509
# Calculate fluxes depending on normal_direction
510
f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)
511
f2 = f1 * v1_avg + p_avg * normal_direction_1
512
f3 = f1 * v2_avg + p_avg * normal_direction_2
513
f4 = f1 * v3_avg + p_avg * normal_direction_3
514
f5 = (f1 *
515
(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)
516
+
517
0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
518
519
# Add scaled fluxes to RHS
520
factor_i = alpha * derivative_split[i, ii]
521
du_permuted[jk, i, 1] += factor_i * f1
522
du_permuted[jk, i, 2] += factor_i * f2
523
du_permuted[jk, i, 3] += factor_i * f3
524
du_permuted[jk, i, 4] += factor_i * f4
525
du_permuted[jk, i, 5] += factor_i * f5
526
527
factor_ii = alpha * derivative_split[ii, i]
528
du_permuted[jk, ii, 1] += factor_ii * f1
529
du_permuted[jk, ii, 2] += factor_ii * f2
530
du_permuted[jk, ii, 3] += factor_ii * f3
531
du_permuted[jk, ii, 4] += factor_ii * f4
532
du_permuted[jk, ii, 5] += factor_ii * f5
533
end
534
end
535
536
@turbo for v in eachvariable(equations),
537
k in eachnode(dg),
538
j in eachnode(dg),
539
i in eachnode(dg)
540
541
jk = j + nnodes(dg) * (k - 1)
542
du[i, j, k, v] = du_permuted[jk, i, v]
543
end
544
545
# y direction
546
# We must also permute the contravariant vectors.
547
contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,
548
(StaticInt(nnodes(dg)),
549
StaticInt(nnodes(dg)),
550
StaticInt(nnodes(dg)),
551
StaticInt(ndims(MeshT))))
552
553
@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
554
contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, element]
555
contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k, element]
556
contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k, element]
557
end
558
559
# A possible permutation of array dimensions with improved opportunities for
560
# SIMD vectorization appeared to be slower than the direct version used here
561
# in preliminary numerical experiments on an AVX2 system.
562
for j in eachnode(dg), jj in (j + 1):nnodes(dg)
563
@turbo for k in eachnode(dg), i in eachnode(dg)
564
rho_ll = u_prim[i, j, k, 1]
565
v1_ll = u_prim[i, j, k, 2]
566
v2_ll = u_prim[i, j, k, 3]
567
v3_ll = u_prim[i, j, k, 4]
568
p_ll = u_prim[i, j, k, 5]
569
log_rho_ll = u_prim[i, j, k, 6]
570
log_p_ll = u_prim[i, j, k, 7]
571
572
rho_rr = u_prim[i, jj, k, 1]
573
v1_rr = u_prim[i, jj, k, 2]
574
v2_rr = u_prim[i, jj, k, 3]
575
v3_rr = u_prim[i, jj, k, 4]
576
p_rr = u_prim[i, jj, k, 5]
577
log_rho_rr = u_prim[i, jj, k, 6]
578
log_p_rr = u_prim[i, jj, k, 7]
579
580
normal_direction_1 = 0.5 * (contravariant_vectors_y[i, j, k, 1] +
581
contravariant_vectors_y[i, jj, k, 1])
582
normal_direction_2 = 0.5 * (contravariant_vectors_y[i, j, k, 2] +
583
contravariant_vectors_y[i, jj, k, 2])
584
normal_direction_3 = 0.5 * (contravariant_vectors_y[i, j, k, 3] +
585
contravariant_vectors_y[i, jj, k, 3])
586
587
v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
588
v3_ll * normal_direction_3
589
v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
590
v3_rr * normal_direction_3
591
592
# Compute required mean values
593
# We inline the logarithmic mean to allow LoopVectorization.jl to optimize
594
# it efficiently. This is equivalent to
595
# rho_mean = ln_mean(rho_ll, rho_rr)
596
x1 = rho_ll
597
log_x1 = log_rho_ll
598
y1 = rho_rr
599
log_y1 = log_rho_rr
600
x1_plus_y1 = x1 + y1
601
y1_minus_x1 = y1 - x1
602
z1 = y1_minus_x1^2 / x1_plus_y1^2
603
special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
604
regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
605
rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)
606
607
# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
608
# in exact arithmetic since
609
# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
610
# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
611
# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
612
x2 = rho_ll * p_rr
613
log_x2 = log_rho_ll + log_p_rr
614
y2 = rho_rr * p_ll
615
log_y2 = log_rho_rr + log_p_ll
616
x2_plus_y2 = x2 + y2
617
y2_minus_x2 = y2 - x2
618
z2 = y2_minus_x2^2 / x2_plus_y2^2
619
special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
620
regular_path2 = (log_y2 - log_x2) / y2_minus_x2
621
inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)
622
623
v1_avg = 0.5 * (v1_ll + v1_rr)
624
v2_avg = 0.5 * (v2_ll + v2_rr)
625
v3_avg = 0.5 * (v3_ll + v3_rr)
626
p_avg = 0.5 * (p_ll + p_rr)
627
velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
628
629
# Calculate fluxes depending on normal_direction
630
f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)
631
f2 = f1 * v1_avg + p_avg * normal_direction_1
632
f3 = f1 * v2_avg + p_avg * normal_direction_2
633
f4 = f1 * v3_avg + p_avg * normal_direction_3
634
f5 = (f1 *
635
(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)
636
+
637
0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
638
639
# Add scaled fluxes to RHS
640
factor_j = alpha * derivative_split[j, jj]
641
du[i, j, k, 1] += factor_j * f1
642
du[i, j, k, 2] += factor_j * f2
643
du[i, j, k, 3] += factor_j * f3
644
du[i, j, k, 4] += factor_j * f4
645
du[i, j, k, 5] += factor_j * f5
646
647
factor_jj = alpha * derivative_split[jj, j]
648
du[i, jj, k, 1] += factor_jj * f1
649
du[i, jj, k, 2] += factor_jj * f2
650
du[i, jj, k, 3] += factor_jj * f3
651
du[i, jj, k, 4] += factor_jj * f4
652
du[i, jj, k, 5] += factor_jj * f5
653
end
654
end
655
656
# z direction
657
# The memory layout is already optimal for SIMD vectorization in this loop.
658
# We just squeeze the first two dimensions to make the code slightly faster.
659
GC.@preserve u_prim begin
660
u_prim_reshaped = PtrArray(pointer(u_prim),
661
(StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
662
StaticInt(nvariables(equations) + 2)))
663
664
du_reshaped = PtrArray(pointer(du),
665
(StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
666
StaticInt(nvariables(equations))))
667
668
# We must also permute the contravariant vectors.
669
contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef,
670
(StaticInt(nnodes(dg)^2),
671
StaticInt(nnodes(dg)),
672
StaticInt(ndims(MeshT))))
673
674
@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
675
ij = i + nnodes(dg) * (j - 1)
676
contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k,
677
element]
678
contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k,
679
element]
680
contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k,
681
element]
682
end
683
684
for k in eachnode(dg), kk in (k + 1):nnodes(dg)
685
@turbo for ij in Base.OneTo(nnodes(dg)^2)
686
rho_ll = u_prim_reshaped[ij, k, 1]
687
v1_ll = u_prim_reshaped[ij, k, 2]
688
v2_ll = u_prim_reshaped[ij, k, 3]
689
v3_ll = u_prim_reshaped[ij, k, 4]
690
p_ll = u_prim_reshaped[ij, k, 5]
691
log_rho_ll = u_prim_reshaped[ij, k, 6]
692
log_p_ll = u_prim_reshaped[ij, k, 7]
693
694
rho_rr = u_prim_reshaped[ij, kk, 1]
695
v1_rr = u_prim_reshaped[ij, kk, 2]
696
v2_rr = u_prim_reshaped[ij, kk, 3]
697
v3_rr = u_prim_reshaped[ij, kk, 4]
698
p_rr = u_prim_reshaped[ij, kk, 5]
699
log_rho_rr = u_prim_reshaped[ij, kk, 6]
700
log_p_rr = u_prim_reshaped[ij, kk, 7]
701
702
normal_direction_1 = 0.5 * (contravariant_vectors_z[ij, k, 1] +
703
contravariant_vectors_z[ij, kk, 1])
704
normal_direction_2 = 0.5 * (contravariant_vectors_z[ij, k, 2] +
705
contravariant_vectors_z[ij, kk, 2])
706
normal_direction_3 = 0.5 * (contravariant_vectors_z[ij, k, 3] +
707
contravariant_vectors_z[ij, kk, 3])
708
709
v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
710
v3_ll * normal_direction_3
711
v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
712
v3_rr * normal_direction_3
713
714
# Compute required mean values
715
# We inline the logarithmic mean to allow LoopVectorization.jl to optimize
716
# it efficiently. This is equivalent to
717
# rho_mean = ln_mean(rho_ll, rho_rr)
718
x1 = rho_ll
719
log_x1 = log_rho_ll
720
y1 = rho_rr
721
log_y1 = log_rho_rr
722
x1_plus_y1 = x1 + y1
723
y1_minus_x1 = y1 - x1
724
z1 = y1_minus_x1^2 / x1_plus_y1^2
725
special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
726
regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
727
rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)
728
729
# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
730
# in exact arithmetic since
731
# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
732
# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
733
# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
734
x2 = rho_ll * p_rr
735
log_x2 = log_rho_ll + log_p_rr
736
y2 = rho_rr * p_ll
737
log_y2 = log_rho_rr + log_p_ll
738
x2_plus_y2 = x2 + y2
739
y2_minus_x2 = y2 - x2
740
z2 = y2_minus_x2^2 / x2_plus_y2^2
741
special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
742
regular_path2 = (log_y2 - log_x2) / y2_minus_x2
743
inv_rho_p_mean = p_ll * p_rr *
744
ifelse(z2 < 1.0e-4, special_path2, regular_path2)
745
746
v1_avg = 0.5 * (v1_ll + v1_rr)
747
v2_avg = 0.5 * (v2_ll + v2_rr)
748
v3_avg = 0.5 * (v3_ll + v3_rr)
749
p_avg = 0.5 * (p_ll + p_rr)
750
velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
751
752
# Calculate fluxes depending on normal_direction
753
f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)
754
f2 = f1 * v1_avg + p_avg * normal_direction_1
755
f3 = f1 * v2_avg + p_avg * normal_direction_2
756
f4 = f1 * v3_avg + p_avg * normal_direction_3
757
f5 = (f1 *
758
(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)
759
+
760
0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
761
762
# Add scaled fluxes to RHS
763
factor_k = alpha * derivative_split[k, kk]
764
du_reshaped[ij, k, 1] += factor_k * f1
765
du_reshaped[ij, k, 2] += factor_k * f2
766
du_reshaped[ij, k, 3] += factor_k * f3
767
du_reshaped[ij, k, 4] += factor_k * f4
768
du_reshaped[ij, k, 5] += factor_k * f5
769
770
factor_kk = alpha * derivative_split[kk, k]
771
du_reshaped[ij, kk, 1] += factor_kk * f1
772
du_reshaped[ij, kk, 2] += factor_kk * f2
773
du_reshaped[ij, kk, 3] += factor_kk * f3
774
du_reshaped[ij, kk, 4] += factor_kk * f4
775
du_reshaped[ij, kk, 5] += factor_kk * f5
776
end
777
end
778
end # GC.@preserve u_prim begin
779
780
# Finally, we add the temporary RHS computed here to the global RHS in the
781
# given `element`.
782
@turbo for v in eachvariable(equations),
783
k in eachnode(dg),
784
j in eachnode(dg),
785
i in eachnode(dg)
786
787
_du[v, i, j, k, element] += du[i, j, k, v]
788
end
789
end
790
791