# src/solvers/dgsem_tree/dg_2d_compressible_euler.jl
# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
# Since these FMAs can increase the performance of many numerical algorithms,
# we need to opt-in explicitly.
# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
@muladd begin
#! format: noindent

# Calculate the vorticity on a single node using the derivative matrix from the
# polynomial basis of a DGSEM solver. `u` is the solution on the whole domain.
# This function is used for calculating acoustic source terms for coupled
# Euler-acoustics simulations.
function calc_vorticity_node(u, mesh::TreeMesh{2},
                             equations::CompressibleEulerEquations2D,
                             dg::DGSEM, cache, i, j, element)
    @unpack derivative_matrix = dg.basis

    v2_x = zero(eltype(u)) # derivative of v2 in x direction
    for ii in eachnode(dg)
        rho, _, rho_v2 = get_node_vars(u, equations, dg, ii, j, element)
        v2 = rho_v2 / rho
        v2_x = v2_x + derivative_matrix[i, ii] * v2
    end

    v1_y = zero(eltype(u)) # derivative of v1 in y direction
    for jj in eachnode(dg)
        rho, rho_v1 = get_node_vars(u, equations, dg, i, jj, element)
        v1 = rho_v1 / rho
        v1_y = v1_y + derivative_matrix[j, jj] * v1
    end

    # Vorticity in 2D: dv2/dx - dv1/dy, mapped back from reference coordinates
    # via the (constant per element on a TreeMesh) inverse Jacobian.
    return (v2_x - v1_y) * cache.elements.inverse_jacobian[element]
end

# Convenience function for calculating the vorticity on the whole domain and
# storing it in a preallocated array
function calc_vorticity!(vorticity, u, mesh::TreeMesh{2},
                         equations::CompressibleEulerEquations2D,
                         dg::DGSEM, cache)
    @threaded for element in eachelement(dg, cache)
        for j in eachnode(dg), i in eachnode(dg)
            vorticity[i, j, element] = calc_vorticity_node(u, mesh, equations, dg,
                                                           cache, i, j, element)
        end
    end

    return nothing
end
end # muladd

# From here on, this file contains specializations of DG methods on the
# TreeMesh2D to the compressible Euler equations.
#
# The specialized methods contain relatively verbose and ugly code in the sense
# that we do not use the common "pointwise physics" interface of Trixi.jl.
# However,
# this is currently (November 2021) necessary to get a significant
# speed-up by using SIMD instructions via LoopVectorization.jl.
#
# TODO: SIMD. We could get even more speed-up if we did not need to permute
# array dimensions, i.e., if we changed the basic memory layout.
#
# We do not wrap this code in `@muladd begin ... end` block. Optimizations like
# this are handled automatically by LoopVectorization.jl.

# We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array`
# if LoopVectorization.jl can handle the array types. This ensures that `@turbo`
# works efficiently here.
@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
                                           element, MeshT::Type{<:TreeMesh{2}},
                                           have_nonconservative_terms::False,
                                           equations::CompressibleEulerEquations2D,
                                           volume_flux::typeof(flux_shima_etal_turbo),
                                           dg::DGSEM, cache, alpha)
    @unpack derivative_split = dg.basis

    # Create a temporary array that will be used to store the RHS with permuted
    # indices `[i, j, v]` to allow using SIMD instructions.
    # `StrideArray`s with purely static dimensions do not allocate on the heap.
    du = StrideArray{eltype(u_cons)}(undef,
                                     (ntuple(_ -> StaticInt(nnodes(dg)),
                                             ndims(MeshT))...,
                                      StaticInt(nvariables(equations))))

    # Convert conserved to primitive variables on the given `element`.
    u_prim = StrideArray{eltype(u_cons)}(undef,
                                         (ntuple(_ -> StaticInt(nnodes(dg)),
                                                 ndims(MeshT))...,
                                          StaticInt(nvariables(equations))))

    @turbo for j in eachnode(dg), i in eachnode(dg)
        rho = u_cons[1, i, j, element]
        rho_v1 = u_cons[2, i, j, element]
        rho_v2 = u_cons[3, i, j, element]
        rho_e_total = u_cons[4, i, j, element]

        v1 = rho_v1 / rho
        v2 = rho_v2 / rho
        p = (equations.gamma - 1) *
            (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2))

        u_prim[i, j, 1] = rho
        u_prim[i, j, 2] = v1
        u_prim[i, j, 3] = v2
        u_prim[i, j, 4] = p
    end

    # x direction
    # At first, we create new temporary arrays with permuted memory layout to
    # allow using SIMD instructions along the first dimension (which is
    # contiguous in memory).
    du_permuted = StrideArray{eltype(u_cons)}(undef,
                                              (StaticInt(nnodes(dg)),
                                               StaticInt(nnodes(dg)),
                                               StaticInt(nvariables(equations))))

    u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
                                                  (StaticInt(nnodes(dg)),
                                                   StaticInt(nnodes(dg)),
                                                   StaticInt(nvariables(equations))))

    @turbo for v in eachvariable(equations),
               j in eachnode(dg),
               i in eachnode(dg)

        u_prim_permuted[j, i, v] = u_prim[i, j, v]
    end
    fill!(du_permuted, zero(eltype(du_permuted)))

    # Next, we basically inline the volume flux. To allow SIMD vectorization and
    # still use the symmetry of the volume flux and the derivative matrix, we
    # loop over the triangular part in an outer loop and use a plain inner loop.
    for i in eachnode(dg), ii in (i + 1):nnodes(dg)
        @turbo for j in eachnode(dg)
            rho_ll = u_prim_permuted[j, i, 1]
            v1_ll = u_prim_permuted[j, i, 2]
            v2_ll = u_prim_permuted[j, i, 3]
            p_ll = u_prim_permuted[j, i, 4]

            rho_rr = u_prim_permuted[j, ii, 1]
            v1_rr = u_prim_permuted[j, ii, 2]
            v2_rr = u_prim_permuted[j, ii, 3]
            p_rr = u_prim_permuted[j, ii, 4]

            # Compute required mean values
            rho_avg = 0.5 * (rho_ll + rho_rr)
            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)
            pv1_avg = 0.5 * (p_ll * v1_rr + p_rr * v1_ll)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_avg * v1_avg
            f2 = f1 * v1_avg + p_avg
            f3 = f1 * v2_avg
            f4 = p_avg * v1_avg * equations.inv_gamma_minus_one +
                 f1 * kin_avg + pv1_avg

            # Add scaled fluxes to RHS
            factor_i = alpha * derivative_split[i, ii]
            du_permuted[j, i, 1] += factor_i * f1
            du_permuted[j, i, 2] += factor_i * f2
            du_permuted[j, i, 3] += factor_i * f3
            du_permuted[j, i, 4] += factor_i * f4

            factor_ii = alpha * derivative_split[ii, i]
            du_permuted[j, ii, 1] += factor_ii * f1
            du_permuted[j, ii, 2] += factor_ii * f2
            du_permuted[j, ii, 3] += factor_ii * f3
            du_permuted[j, ii, 4] += factor_ii * f4
        end
    end

    @turbo for v in eachvariable(equations),
               j in eachnode(dg),
               i in eachnode(dg)

        du[i, j, v] = du_permuted[j, i, v]
    end

    # y direction
    # The memory layout is already optimal for SIMD vectorization in this loop.
    for j in eachnode(dg), jj in (j + 1):nnodes(dg)
        @turbo for i in eachnode(dg)
            rho_ll = u_prim[i, j, 1]
            v1_ll = u_prim[i, j, 2]
            v2_ll = u_prim[i, j, 3]
            p_ll = u_prim[i, j, 4]

            rho_rr = u_prim[i, jj, 1]
            v1_rr = u_prim[i, jj, 2]
            v2_rr = u_prim[i, jj, 3]
            p_rr = u_prim[i, jj, 4]

            # Compute required mean values
            rho_avg = 0.5 * (rho_ll + rho_rr)
            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)
            pv2_avg = 0.5 * (p_ll * v2_rr + p_rr * v2_ll)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_avg * v2_avg
            f2 = f1 * v1_avg
            f3 = f1 * v2_avg + p_avg
            f4 = p_avg * v2_avg * equations.inv_gamma_minus_one +
                 f1 * kin_avg + pv2_avg

            # Add scaled fluxes to RHS
            factor_j = alpha * derivative_split[j, jj]
            du[i, j, 1] += factor_j * f1
            du[i, j, 2] += factor_j * f2
            du[i, j, 3] += factor_j * f3
            du[i, j, 4] += factor_j * f4

            factor_jj = alpha * derivative_split[jj, j]
            du[i, jj, 1] += factor_jj * f1
            du[i, jj, 2] += factor_jj * f2
            du[i, jj, 3] += factor_jj * f3
            du[i, jj, 4] += factor_jj * f4
        end
    end

    # Finally, we add the temporary RHS computed here to the global RHS in the
    # given `element`.
    @turbo for v in eachvariable(equations),
               j in eachnode(dg),
               i in eachnode(dg)

        _du[v, i, j, element] += du[i, j, v]
    end
end

@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
                                           element, MeshT::Type{<:TreeMesh{2}},
                                           have_nonconservative_terms::False,
                                           equations::CompressibleEulerEquations2D,
                                           volume_flux::typeof(flux_ranocha_turbo),
                                           dg::DGSEM, cache,
alpha)234@unpack derivative_split = dg.basis235236# Create a temporary array that will be used to store the RHS with permuted237# indices `[i, j, v]` to allow using SIMD instructions.238# `StrideArray`s with purely static dimensions do not allocate on the heap.239du = StrideArray{eltype(u_cons)}(undef,240(ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,241StaticInt(nvariables(equations))))242243# Convert conserved to primitive variables on the given `element`. In addition244# to the usual primitive variables, we also compute logarithms of the density245# and pressure to increase the performance of the required logarithmic mean246# values.247u_prim = StrideArray{eltype(u_cons)}(undef,248(ntuple(_ -> StaticInt(nnodes(dg)),249ndims(MeshT))...,250StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs251252@turbo for j in eachnode(dg), i in eachnode(dg)253rho = u_cons[1, i, j, element]254rho_v1 = u_cons[2, i, j, element]255rho_v2 = u_cons[3, i, j, element]256rho_e_total = u_cons[4, i, j, element]257258v1 = rho_v1 / rho259v2 = rho_v2 / rho260p = (equations.gamma - 1) * (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2))261262u_prim[i, j, 1] = rho263u_prim[i, j, 2] = v1264u_prim[i, j, 3] = v2265u_prim[i, j, 4] = p266u_prim[i, j, 5] = log(rho)267u_prim[i, j, 6] = log(p)268end269270# x direction271# At first, we create new temporary arrays with permuted memory layout to272# allow using SIMD instructions along the first dimension (which is contiguous273# in memory).274du_permuted = StrideArray{eltype(u_cons)}(undef,275(StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),276StaticInt(nvariables(equations))))277278u_prim_permuted = StrideArray{eltype(u_cons)}(undef,279(StaticInt(nnodes(dg)),280StaticInt(nnodes(dg)),281StaticInt(nvariables(equations) + 2)))282283@turbo for v in indices(u_prim, 3), # v in eachvariable(equations) misses +2 logs284j in eachnode(dg),285i in eachnode(dg)286287u_prim_permuted[j, i, v] = u_prim[i, j, v]288end289fill!(du_permuted, 
zero(eltype(du_permuted)))290291# Next, we basically inline the volume flux. To allow SIMD vectorization and292# still use the symmetry of the volume flux and the derivative matrix, we293# loop over the triangular part in an outer loop and use a plain inner loop.294for i in eachnode(dg), ii in (i + 1):nnodes(dg)295@turbo for j in eachnode(dg)296rho_ll = u_prim_permuted[j, i, 1]297v1_ll = u_prim_permuted[j, i, 2]298v2_ll = u_prim_permuted[j, i, 3]299p_ll = u_prim_permuted[j, i, 4]300log_rho_ll = u_prim_permuted[j, i, 5]301log_p_ll = u_prim_permuted[j, i, 6]302303rho_rr = u_prim_permuted[j, ii, 1]304v1_rr = u_prim_permuted[j, ii, 2]305v2_rr = u_prim_permuted[j, ii, 3]306p_rr = u_prim_permuted[j, ii, 4]307log_rho_rr = u_prim_permuted[j, ii, 5]308log_p_rr = u_prim_permuted[j, ii, 6]309310# Compute required mean values311# We inline the logarithmic mean to allow LoopVectorization.jl to optimize312# it efficiently. This is equivalent to313# rho_mean = ln_mean(rho_ll, rho_rr)314x1 = rho_ll315log_x1 = log_rho_ll316y1 = rho_rr317log_y1 = log_rho_rr318x1_plus_y1 = x1 + y1319y1_minus_x1 = y1 - x1320z1 = y1_minus_x1^2 / x1_plus_y1^2321special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))322regular_path1 = y1_minus_x1 / (log_y1 - log_x1)323rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)324325# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`326# in exact arithmetic since327# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)328# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)329# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)330x2 = rho_ll * p_rr331log_x2 = log_rho_ll + log_p_rr332y2 = rho_rr * p_ll333log_y2 = log_rho_rr + log_p_ll334x2_plus_y2 = x2 + y2335y2_minus_x2 = y2 - x2336z2 = y2_minus_x2^2 / x2_plus_y2^2337special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2338regular_path2 = (log_y2 - log_x2) / y2_minus_x2339inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, 
special_path2, regular_path2)340341v1_avg = 0.5 * (v1_ll + v1_rr)342v2_avg = 0.5 * (v2_ll + v2_rr)343p_avg = 0.5 * (p_ll + p_rr)344velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)345346# Calculate fluxes depending on Cartesian orientation347f1 = rho_mean * v1_avg348f2 = f1 * v1_avg + p_avg349f3 = f1 * v2_avg350f4 = f1 *351(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +3520.5 * (p_ll * v1_rr + p_rr * v1_ll)353354# Add scaled fluxes to RHS355factor_i = alpha * derivative_split[i, ii]356du_permuted[j, i, 1] += factor_i * f1357du_permuted[j, i, 2] += factor_i * f2358du_permuted[j, i, 3] += factor_i * f3359du_permuted[j, i, 4] += factor_i * f4360361factor_ii = alpha * derivative_split[ii, i]362du_permuted[j, ii, 1] += factor_ii * f1363du_permuted[j, ii, 2] += factor_ii * f2364du_permuted[j, ii, 3] += factor_ii * f3365du_permuted[j, ii, 4] += factor_ii * f4366end367end368369@turbo for v in eachvariable(equations),370j in eachnode(dg),371i in eachnode(dg)372373du[i, j, v] = du_permuted[j, i, v]374end375376# y direction377# The memory layout is already optimal for SIMD vectorization in this loop.378for j in eachnode(dg), jj in (j + 1):nnodes(dg)379@turbo for i in eachnode(dg)380rho_ll = u_prim[i, j, 1]381v1_ll = u_prim[i, j, 2]382v2_ll = u_prim[i, j, 3]383p_ll = u_prim[i, j, 4]384log_rho_ll = u_prim[i, j, 5]385log_p_ll = u_prim[i, j, 6]386387rho_rr = u_prim[i, jj, 1]388v1_rr = u_prim[i, jj, 2]389v2_rr = u_prim[i, jj, 3]390p_rr = u_prim[i, jj, 4]391log_rho_rr = u_prim[i, jj, 5]392log_p_rr = u_prim[i, jj, 6]393394# Compute required mean values395# We inline the logarithmic mean to allow LoopVectorization.jl to optimize396# it efficiently. 
This is equivalent to397# rho_mean = ln_mean(rho_ll, rho_rr)398x1 = rho_ll399log_x1 = log_rho_ll400y1 = rho_rr401log_y1 = log_rho_rr402x1_plus_y1 = x1 + y1403y1_minus_x1 = y1 - x1404z1 = y1_minus_x1^2 / x1_plus_y1^2405special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))406regular_path1 = y1_minus_x1 / (log_y1 - log_x1)407rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)408409# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`410# in exact arithmetic since411# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)412# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)413# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)414x2 = rho_ll * p_rr415log_x2 = log_rho_ll + log_p_rr416y2 = rho_rr * p_ll417log_y2 = log_rho_rr + log_p_ll418x2_plus_y2 = x2 + y2419y2_minus_x2 = y2 - x2420z2 = y2_minus_x2^2 / x2_plus_y2^2421special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2422regular_path2 = (log_y2 - log_x2) / y2_minus_x2423inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)424425v1_avg = 0.5 * (v1_ll + v1_rr)426v2_avg = 0.5 * (v2_ll + v2_rr)427p_avg = 0.5 * (p_ll + p_rr)428velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)429430# Calculate fluxes depending on Cartesian orientation431f1 = rho_mean * v2_avg432f2 = f1 * v1_avg433f3 = f1 * v2_avg + p_avg434f4 = f1 *435(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +4360.5 * (p_ll * v2_rr + p_rr * v2_ll)437438# Add scaled fluxes to RHS439factor_j = alpha * derivative_split[j, jj]440du[i, j, 1] += factor_j * f1441du[i, j, 2] += factor_j * f2442du[i, j, 3] += factor_j * f3443du[i, j, 4] += factor_j * f4444445factor_jj = alpha * derivative_split[jj, j]446du[i, jj, 1] += factor_jj * f1447du[i, jj, 2] += factor_jj * f2448du[i, jj, 3] += factor_jj * f3449du[i, jj, 4] += factor_jj * f4450end451end452453# Finally, we add the temporary RHS computed here to the global RHS in the454# given 
`element`.455@turbo for v in eachvariable(equations),456j in eachnode(dg),457i in eachnode(dg)458459_du[v, i, j, element] += du[i, j, v]460end461end462463464