# GitHub Repository: trixi-framework/Trixi.jl
# Path: blob/main/src/solvers/dgsem_tree/dg_2d_compressible_euler.jl
# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
# Since these FMAs can increase the performance of many numerical algorithms,
# we need to opt in explicitly.
# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
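# (As an illustration of the effect: `@muladd` rewrites expressions such as
# `a * b + c` into `muladd(a, b, c)`, which LLVM may compile to a single fused
# multiply-add instruction.)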
@muladd begin
#! format: noindent

# Calculate the vorticity on a single node using the derivative matrix from the
# polynomial basis of a DGSEM solver. `u` is the solution on the whole domain.
# This function is used for calculating acoustic source terms for coupled
# Euler-acoustics simulations.
function calc_vorticity_node(u, mesh::TreeMesh{2},
                             equations::CompressibleEulerEquations2D,
                             dg::DGSEM, cache, i, j, element)
    @unpack derivative_matrix = dg.basis

    v2_x = zero(eltype(u)) # derivative of v2 in x direction
    for ii in eachnode(dg)
        rho, _, rho_v2 = get_node_vars(u, equations, dg, ii, j, element)
        v2 = rho_v2 / rho
        v2_x = v2_x + derivative_matrix[i, ii] * v2
    end

    v1_y = zero(eltype(u)) # derivative of v1 in y direction
    for jj in eachnode(dg)
        rho, rho_v1 = get_node_vars(u, equations, dg, i, jj, element)
        v1 = rho_v1 / rho
        v1_y = v1_y + derivative_matrix[j, jj] * v1
    end

    return (v2_x - v1_y) * cache.elements.inverse_jacobian[element]
end
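
# In other words, `calc_vorticity_node` approximates the vorticity
#     ω = ∂v2/∂x - ∂v1/∂y
# at node (i, j): the loops above differentiate the velocity components along
# the local coordinate lines via `derivative_matrix`, and the result is mapped
# to physical coordinates with the per-element `inverse_jacobian`.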

# Convenience function for calculating the vorticity on the whole domain and
# storing it in a preallocated array
function calc_vorticity!(vorticity, u, mesh::TreeMesh{2},
                         equations::CompressibleEulerEquations2D,
                         dg::DGSEM, cache)
    @threaded for element in eachelement(dg, cache)
        for j in eachnode(dg), i in eachnode(dg)
            vorticity[i, j, element] = calc_vorticity_node(u, mesh, equations, dg,
                                                           cache, i, j, element)
        end
    end

    return nothing
end
end # muladd
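
# A minimal usage sketch for `calc_vorticity!` (illustrative only; `u`, `mesh`,
# `equations`, `dg`, and `cache` are assumed to come from an existing simulation
# setup and are not defined in this file):
#
#     vorticity = zeros(eltype(u), nnodes(dg), nnodes(dg), nelements(dg, cache))
#     calc_vorticity!(vorticity, u, mesh, equations, dg, cache)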

# From here on, this file contains specializations of DG methods on the
# TreeMesh2D to the compressible Euler equations.
#
# The specialized methods contain relatively verbose and ugly code in the sense
# that we do not use the common "pointwise physics" interface of Trixi.jl.
# However, this is currently (November 2021) necessary to get a significant
# speed-up by using SIMD instructions via LoopVectorization.jl.
#
# TODO: SIMD. We could get even more speed-up if we did not need to permute
# array dimensions, i.e., if we changed the basic memory layout.
#
# We do not wrap this code in a `@muladd begin ... end` block. Optimizations like
# this are handled automatically by LoopVectorization.jl.
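
# For reference, the generic (non-SIMD) code path computes the same volume
# fluxes through the pointwise two-point flux interface, roughly as sketched
# below (a simplified illustration of a single x-direction flux evaluation,
# not the actual generic kernel):
#
#     u_ll = get_node_vars(u, equations, dg, i, j, element)
#     u_rr = get_node_vars(u, equations, dg, ii, j, element)
#     flux = volume_flux(u_ll, u_rr, 1, equations) # orientation 1 means x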

# We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array`
# if LoopVectorization.jl can handle the array types. This ensures that `@turbo`
# works efficiently here.
@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
                                           element, MeshT::Type{<:TreeMesh{2}},
                                           have_nonconservative_terms::False,
                                           equations::CompressibleEulerEquations2D,
                                           volume_flux::typeof(flux_shima_etal_turbo),
                                           dg::DGSEM, cache, alpha)
    @unpack derivative_split = dg.basis

    # Create a temporary array that will be used to store the RHS with permuted
    # indices `[i, j, v]` to allow using SIMD instructions.
    # `StrideArray`s with purely static dimensions do not allocate on the heap.
    du = StrideArray{eltype(u_cons)}(undef,
                                     (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,
                                      StaticInt(nvariables(equations))))

    # Convert conserved to primitive variables on the given `element`.
    u_prim = StrideArray{eltype(u_cons)}(undef,
                                         (ntuple(_ -> StaticInt(nnodes(dg)),
                                                 ndims(MeshT))...,
                                          StaticInt(nvariables(equations))))

    @turbo for j in eachnode(dg), i in eachnode(dg)
        rho = u_cons[1, i, j, element]
        rho_v1 = u_cons[2, i, j, element]
        rho_v2 = u_cons[3, i, j, element]
        rho_e_total = u_cons[4, i, j, element]

        v1 = rho_v1 / rho
        v2 = rho_v2 / rho
        p = (equations.gamma - 1) * (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2))

        u_prim[i, j, 1] = rho
        u_prim[i, j, 2] = v1
        u_prim[i, j, 3] = v2
        u_prim[i, j, 4] = p
    end

    # x direction
    # At first, we create new temporary arrays with permuted memory layout to
    # allow using SIMD instructions along the first dimension (which is contiguous
    # in memory).
    du_permuted = StrideArray{eltype(u_cons)}(undef,
                                              (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),
                                               StaticInt(nvariables(equations))))

    u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
                                                  (StaticInt(nnodes(dg)),
                                                   StaticInt(nnodes(dg)),
                                                   StaticInt(nvariables(equations))))

    @turbo for v in eachvariable(equations),
               j in eachnode(dg),
               i in eachnode(dg)

        u_prim_permuted[j, i, v] = u_prim[i, j, v]
    end
    fill!(du_permuted, zero(eltype(du_permuted)))
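
    # Note on the layout swap: in `u_prim_permuted[j, i, v]`, the index `j` of
    # the orthogonal direction comes first and is therefore contiguous in
    # memory. The pair loops below fix `(i, ii)` and vectorize over `j`, so the
    # loads and stores in the `@turbo` inner loop are unit-stride.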

    # Next, we basically inline the volume flux. To allow SIMD vectorization and
    # still use the symmetry of the volume flux and the derivative matrix, we
    # loop over the triangular part in an outer loop and use a plain inner loop.
    for i in eachnode(dg), ii in (i + 1):nnodes(dg)
        @turbo for j in eachnode(dg)
            rho_ll = u_prim_permuted[j, i, 1]
            v1_ll = u_prim_permuted[j, i, 2]
            v2_ll = u_prim_permuted[j, i, 3]
            p_ll = u_prim_permuted[j, i, 4]

            rho_rr = u_prim_permuted[j, ii, 1]
            v1_rr = u_prim_permuted[j, ii, 2]
            v2_rr = u_prim_permuted[j, ii, 3]
            p_rr = u_prim_permuted[j, ii, 4]

            # Compute required mean values
            rho_avg = 0.5 * (rho_ll + rho_rr)
            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)
            pv1_avg = 0.5 * (p_ll * v1_rr + p_rr * v1_ll)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_avg * v1_avg
            f2 = f1 * v1_avg + p_avg
            f3 = f1 * v2_avg
            f4 = p_avg * v1_avg * equations.inv_gamma_minus_one + f1 * kin_avg + pv1_avg

            # Add scaled fluxes to RHS
            factor_i = alpha * derivative_split[i, ii]
            du_permuted[j, i, 1] += factor_i * f1
            du_permuted[j, i, 2] += factor_i * f2
            du_permuted[j, i, 3] += factor_i * f3
            du_permuted[j, i, 4] += factor_i * f4

            factor_ii = alpha * derivative_split[ii, i]
            du_permuted[j, ii, 1] += factor_ii * f1
            du_permuted[j, ii, 2] += factor_ii * f2
            du_permuted[j, ii, 3] += factor_ii * f3
            du_permuted[j, ii, 4] += factor_ii * f4
        end
    end

    @turbo for v in eachvariable(equations),
               j in eachnode(dg),
               i in eachnode(dg)

        du[i, j, v] = du_permuted[j, i, v]
    end

    # y direction
    # The memory layout is already optimal for SIMD vectorization in this loop.
    for j in eachnode(dg), jj in (j + 1):nnodes(dg)
        @turbo for i in eachnode(dg)
            rho_ll = u_prim[i, j, 1]
            v1_ll = u_prim[i, j, 2]
            v2_ll = u_prim[i, j, 3]
            p_ll = u_prim[i, j, 4]

            rho_rr = u_prim[i, jj, 1]
            v1_rr = u_prim[i, jj, 2]
            v2_rr = u_prim[i, jj, 3]
            p_rr = u_prim[i, jj, 4]

            # Compute required mean values
            rho_avg = 0.5 * (rho_ll + rho_rr)
            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)
            pv2_avg = 0.5 * (p_ll * v2_rr + p_rr * v2_ll)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_avg * v2_avg
            f2 = f1 * v1_avg
            f3 = f1 * v2_avg + p_avg
            f4 = p_avg * v2_avg * equations.inv_gamma_minus_one + f1 * kin_avg + pv2_avg

            # Add scaled fluxes to RHS
            factor_j = alpha * derivative_split[j, jj]
            du[i, j, 1] += factor_j * f1
            du[i, j, 2] += factor_j * f2
            du[i, j, 3] += factor_j * f3
            du[i, j, 4] += factor_j * f4

            factor_jj = alpha * derivative_split[jj, j]
            du[i, jj, 1] += factor_jj * f1
            du[i, jj, 2] += factor_jj * f2
            du[i, jj, 3] += factor_jj * f3
            du[i, jj, 4] += factor_jj * f4
        end
    end

    # Finally, we add the temporary RHS computed here to the global RHS in the
    # given `element`.
    @turbo for v in eachvariable(equations),
               j in eachnode(dg),
               i in eachnode(dg)

        _du[v, i, j, element] += du[i, j, v]
    end
end
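
# The triangular loops above exploit the symmetry of the two-point volume flux:
# each unordered node pair (i, ii) requires only one flux evaluation, which is
# then scaled by both `derivative_split[i, ii]` and `derivative_split[ii, i]`.
# A self-contained sketch of the pattern with made-up names (`f_sym`, `w`, `dw`,
# `D`; diagonal contributions are assumed to be handled separately, as in the
# kernels in this file):
#
#     n = 4
#     w = rand(n); dw = zeros(n); D = rand(n, n); alpha = 1.0
#     f_sym(a, b) = 0.5 * (a + b) # any symmetric two-point function
#     for i in 1:n, ii in (i + 1):n
#         f = f_sym(w[i], w[ii]) # evaluated once per unordered pair
#         dw[i] += alpha * D[i, ii] * f
#         dw[ii] += alpha * D[ii, i] * f
#     end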

@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
                                           element, MeshT::Type{<:TreeMesh{2}},
                                           have_nonconservative_terms::False,
                                           equations::CompressibleEulerEquations2D,
                                           volume_flux::typeof(flux_ranocha_turbo),
                                           dg::DGSEM, cache, alpha)
    @unpack derivative_split = dg.basis

    # Create a temporary array that will be used to store the RHS with permuted
    # indices `[i, j, v]` to allow using SIMD instructions.
    # `StrideArray`s with purely static dimensions do not allocate on the heap.
    du = StrideArray{eltype(u_cons)}(undef,
                                     (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,
                                      StaticInt(nvariables(equations))))

    # Convert conserved to primitive variables on the given `element`. In addition
    # to the usual primitive variables, we also compute logarithms of the density
    # and pressure to increase the performance of the required logarithmic mean
    # values.
    u_prim = StrideArray{eltype(u_cons)}(undef,
                                         (ntuple(_ -> StaticInt(nnodes(dg)),
                                                 ndims(MeshT))...,
                                          StaticInt(nvariables(equations) + 2))) # + 2 for log(rho), log(p)

    @turbo for j in eachnode(dg), i in eachnode(dg)
        rho = u_cons[1, i, j, element]
        rho_v1 = u_cons[2, i, j, element]
        rho_v2 = u_cons[3, i, j, element]
        rho_e_total = u_cons[4, i, j, element]

        v1 = rho_v1 / rho
        v2 = rho_v2 / rho
        p = (equations.gamma - 1) * (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2))

        u_prim[i, j, 1] = rho
        u_prim[i, j, 2] = v1
        u_prim[i, j, 3] = v2
        u_prim[i, j, 4] = p
        u_prim[i, j, 5] = log(rho)
        u_prim[i, j, 6] = log(p)
    end
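
    # Precomputing `log(rho)` and `log(p)` once per node is a performance
    # choice: each direction below evaluates O(n^3) logarithmic means per
    # element (all node pairs times the orthogonal line of nodes, with
    # n = nnodes(dg)), and the regular path of each mean only needs differences
    # of logarithms. Caching the 2 * n^2 logarithms here avoids recomputing
    # them inside the pair loops.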

    # x direction
    # At first, we create new temporary arrays with permuted memory layout to
    # allow using SIMD instructions along the first dimension (which is contiguous
    # in memory).
    du_permuted = StrideArray{eltype(u_cons)}(undef,
                                              (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),
                                               StaticInt(nvariables(equations))))

    u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
                                                  (StaticInt(nnodes(dg)),
                                                   StaticInt(nnodes(dg)),
                                                   StaticInt(nvariables(equations) + 2)))

    @turbo for v in indices(u_prim, 3), # `eachvariable(equations)` would miss the 2 log variables
               j in eachnode(dg),
               i in eachnode(dg)

        u_prim_permuted[j, i, v] = u_prim[i, j, v]
    end
    fill!(du_permuted, zero(eltype(du_permuted)))

    # Next, we basically inline the volume flux. To allow SIMD vectorization and
    # still use the symmetry of the volume flux and the derivative matrix, we
    # loop over the triangular part in an outer loop and use a plain inner loop.
    for i in eachnode(dg), ii in (i + 1):nnodes(dg)
        @turbo for j in eachnode(dg)
            rho_ll = u_prim_permuted[j, i, 1]
            v1_ll = u_prim_permuted[j, i, 2]
            v2_ll = u_prim_permuted[j, i, 3]
            p_ll = u_prim_permuted[j, i, 4]
            log_rho_ll = u_prim_permuted[j, i, 5]
            log_p_ll = u_prim_permuted[j, i, 6]

            rho_rr = u_prim_permuted[j, ii, 1]
            v1_rr = u_prim_permuted[j, ii, 2]
            v2_rr = u_prim_permuted[j, ii, 3]
            p_rr = u_prim_permuted[j, ii, 4]
            log_rho_rr = u_prim_permuted[j, ii, 5]
            log_p_rr = u_prim_permuted[j, ii, 6]

            # Compute required mean values
            # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
            # it efficiently. This is equivalent to
            #   rho_mean = ln_mean(rho_ll, rho_rr)
            x1 = rho_ll
            log_x1 = log_rho_ll
            y1 = rho_rr
            log_y1 = log_rho_rr
            x1_plus_y1 = x1 + y1
            y1_minus_x1 = y1 - x1
            z1 = y1_minus_x1^2 / x1_plus_y1^2
            special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
            regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
            rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)

            # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
            # in exact arithmetic since
            #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
            #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
            # Hence,
            #   inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
            x2 = rho_ll * p_rr
            log_x2 = log_rho_ll + log_p_rr
            y2 = rho_rr * p_ll
            log_y2 = log_rho_rr + log_p_ll
            x2_plus_y2 = x2 + y2
            y2_minus_x2 = y2 - x2
            z2 = y2_minus_x2^2 / x2_plus_y2^2
            special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
            regular_path2 = (log_y2 - log_x2) / y2_minus_x2
            inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)

            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_mean * v1_avg
            f2 = f1 * v1_avg + p_avg
            f3 = f1 * v2_avg
            f4 = f1 *
                 (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +
                 0.5 * (p_ll * v1_rr + p_rr * v1_ll)

            # Add scaled fluxes to RHS
            factor_i = alpha * derivative_split[i, ii]
            du_permuted[j, i, 1] += factor_i * f1
            du_permuted[j, i, 2] += factor_i * f2
            du_permuted[j, i, 3] += factor_i * f3
            du_permuted[j, i, 4] += factor_i * f4

            factor_ii = alpha * derivative_split[ii, i]
            du_permuted[j, ii, 1] += factor_ii * f1
            du_permuted[j, ii, 2] += factor_ii * f2
            du_permuted[j, ii, 3] += factor_ii * f3
            du_permuted[j, ii, 4] += factor_ii * f4
        end
    end

    @turbo for v in eachvariable(equations),
               j in eachnode(dg),
               i in eachnode(dg)

        du[i, j, v] = du_permuted[j, i, v]
    end

    # y direction
    # The memory layout is already optimal for SIMD vectorization in this loop.
    for j in eachnode(dg), jj in (j + 1):nnodes(dg)
        @turbo for i in eachnode(dg)
            rho_ll = u_prim[i, j, 1]
            v1_ll = u_prim[i, j, 2]
            v2_ll = u_prim[i, j, 3]
            p_ll = u_prim[i, j, 4]
            log_rho_ll = u_prim[i, j, 5]
            log_p_ll = u_prim[i, j, 6]

            rho_rr = u_prim[i, jj, 1]
            v1_rr = u_prim[i, jj, 2]
            v2_rr = u_prim[i, jj, 3]
            p_rr = u_prim[i, jj, 4]
            log_rho_rr = u_prim[i, jj, 5]
            log_p_rr = u_prim[i, jj, 6]

            # Compute required mean values
            # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
            # it efficiently. This is equivalent to
            #   rho_mean = ln_mean(rho_ll, rho_rr)
            x1 = rho_ll
            log_x1 = log_rho_ll
            y1 = rho_rr
            log_y1 = log_rho_rr
            x1_plus_y1 = x1 + y1
            y1_minus_x1 = y1 - x1
            z1 = y1_minus_x1^2 / x1_plus_y1^2
            special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
            regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
            rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)

            # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
            # in exact arithmetic since
            #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
            #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
            # Hence,
            #   inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
            x2 = rho_ll * p_rr
            log_x2 = log_rho_ll + log_p_rr
            y2 = rho_rr * p_ll
            log_y2 = log_rho_rr + log_p_ll
            x2_plus_y2 = x2 + y2
            y2_minus_x2 = y2 - x2
            z2 = y2_minus_x2^2 / x2_plus_y2^2
            special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
            regular_path2 = (log_y2 - log_x2) / y2_minus_x2
            inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)

            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_mean * v2_avg
            f2 = f1 * v1_avg
            f3 = f1 * v2_avg + p_avg
            f4 = f1 *
                 (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +
                 0.5 * (p_ll * v2_rr + p_rr * v2_ll)

            # Add scaled fluxes to RHS
            factor_j = alpha * derivative_split[j, jj]
            du[i, j, 1] += factor_j * f1
            du[i, j, 2] += factor_j * f2
            du[i, j, 3] += factor_j * f3
            du[i, j, 4] += factor_j * f4

            factor_jj = alpha * derivative_split[jj, j]
            du[i, jj, 1] += factor_jj * f1
            du[i, jj, 2] += factor_jj * f2
            du[i, jj, 3] += factor_jj * f3
            du[i, jj, 4] += factor_jj * f4
        end
    end

    # Finally, we add the temporary RHS computed here to the global RHS in the
    # given `element`.
    @turbo for v in eachvariable(equations),
               j in eachnode(dg),
               i in eachnode(dg)

        _du[v, i, j, element] += du[i, j, v]
    end
end
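
# For reference, the inlined logarithmic mean above corresponds to a standalone
# function like the sketch below (illustrative; Trixi.jl provides `ln_mean`, and
# this is not necessarily its exact implementation). The series expansion avoids
# the cancellation in (y - x) / (log(y) - log(x)) when x ≈ y, where both
# numerator and denominator approach zero:
#
#     function ln_mean_sketch(x, y)
#         z = (y - x)^2 / (x + y)^2
#         if z < 1.0e-4
#             # Truncated expansion of (y - x) / log(y / x) around x == y
#             return (x + y) / (2 + z * (2 / 3 + z * (2 / 5 + 2 / 7 * z)))
#         else
#             return (y - x) / (log(y) - log(x))
#         end
#     end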