CoCalc -- dg_3d_compressible

GitHub Repository: trixi-framework/Trixi.jl
Path: blob/main/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl
⁵⁵⁹⁰ views
1

2
# From here on, this file contains specializations of DG methods on the
3
# curved 3D meshes `StructuredMesh{3}, P4estMesh{3}` to the compressible
4
# Euler equations.
5
#
6
# The specialized methods contain relatively verbose and ugly code in the sense
7
# that we do not use the common "pointwise physics" interface of Trixi.jl.
8
# However, this is currently (November 2021) necessary to get a significant
9
# speed-up by using SIMD instructions via LoopVectorization.jl.
10
#
11
# TODO: SIMD. We could get even more speed-up if we did not need to permute
12
#             array dimensions, i.e., if we changed the basic memory layout.
13
#
14
# We do not wrap this code in `@muladd begin ... end` block. Optimizations like
15
# this are handled automatically by LoopVectorization.jl.
16

17
# We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array`
18
# if LoopVectorization.jl can handle the array types. This ensures that `@turbo`
19
# works efficiently here.
20
@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
21
                                           element,
22
                                           MeshT::Type{<:Union{StructuredMesh{3},
23
                                                               P4estMesh{3}}},
24
                                           have_nonconservative_terms::False,
25
                                           equations::CompressibleEulerEquations3D,
26
                                           volume_flux::typeof(flux_shima_etal_turbo),
27
                                           dg::DGSEM, cache, alpha)
28
    @unpack derivative_split = dg.basis
29
    @unpack contravariant_vectors = cache.elements
30

31
    # Create a temporary array that will be used to store the RHS with permuted
32
    # indices `[i, j, k, v]` to allow using SIMD instructions.
33
    # `StrideArray`s with purely static dimensions do not allocate on the heap.
34
    du = StrideArray{eltype(u_cons)}(undef,
35
                                     (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,
36
                                      StaticInt(nvariables(equations))))
37

38
    # Convert conserved to primitive variables on the given `element`.
39
    u_prim = StrideArray{eltype(u_cons)}(undef,
40
                                         (ntuple(_ -> StaticInt(nnodes(dg)),
41
                                                 ndims(MeshT))...,
42
                                          StaticInt(nvariables(equations))))
43

44
    @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
45
        rho = u_cons[1, i, j, k, element]
46
        rho_v1 = u_cons[2, i, j, k, element]
47
        rho_v2 = u_cons[3, i, j, k, element]
48
        rho_v3 = u_cons[4, i, j, k, element]
49
        rho_e_total = u_cons[5, i, j, k, element]
50

51
        v1 = rho_v1 / rho
52
        v2 = rho_v2 / rho
53
        v3 = rho_v3 / rho
54
        p = (equations.gamma - 1) *
55
            (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))
56

57
        u_prim[i, j, k, 1] = rho
58
        u_prim[i, j, k, 2] = v1
59
        u_prim[i, j, k, 3] = v2
60
        u_prim[i, j, k, 4] = v3
61
        u_prim[i, j, k, 5] = p
62
    end
63

64
    # x direction
65
    # At first, we create new temporary arrays with permuted memory layout to
66
    # allow using SIMD instructions along the first dimension (which is contiguous
67
    # in memory).
68
    du_permuted = StrideArray{eltype(u_cons)}(undef,
69
                                              (StaticInt(nnodes(dg)^2),
70
                                               StaticInt(nnodes(dg)),
71
                                               StaticInt(nvariables(equations))))
72

73
    u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
74
                                                  (StaticInt(nnodes(dg)^2),
75
                                                   StaticInt(nnodes(dg)),
76
                                                   StaticInt(nvariables(equations))))
77

78
    @turbo for v in eachvariable(equations),
79
               k in eachnode(dg),
80
               j in eachnode(dg),
81
               i in eachnode(dg)
82

83
        jk = j + nnodes(dg) * (k - 1)
84
        u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
85
    end
86
    fill!(du_permuted, zero(eltype(du_permuted)))
87

88
    # We must also permute the contravariant vectors.
89
    contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,
90
                                                                         (StaticInt(nnodes(dg)^2),
91
                                                                          StaticInt(nnodes(dg)),
92
                                                                          StaticInt(ndims(MeshT))))
93

94
    @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
95
        jk = j + nnodes(dg) * (k - 1)
96
        contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k, element]
97
        contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k, element]
98
        contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k, element]
99
    end
100

101
    # Next, we basically inline the volume flux. To allow SIMD vectorization and
102
    # still use the symmetry of the volume flux and the derivative matrix, we
103
    # loop over the triangular part in an outer loop and use a plain inner loop.
104
    for i in eachnode(dg), ii in (i + 1):nnodes(dg)
105
        @turbo for jk in Base.OneTo(nnodes(dg)^2)
106
            rho_ll = u_prim_permuted[jk, i, 1]
107
            v1_ll = u_prim_permuted[jk, i, 2]
108
            v2_ll = u_prim_permuted[jk, i, 3]
109
            v3_ll = u_prim_permuted[jk, i, 4]
110
            p_ll = u_prim_permuted[jk, i, 5]
111

112
            rho_rr = u_prim_permuted[jk, ii, 1]
113
            v1_rr = u_prim_permuted[jk, ii, 2]
114
            v2_rr = u_prim_permuted[jk, ii, 3]
115
            v3_rr = u_prim_permuted[jk, ii, 4]
116
            p_rr = u_prim_permuted[jk, ii, 5]
117

118
            normal_direction_1 = 0.5 * (contravariant_vectors_x[jk, i, 1] +
119
                                  contravariant_vectors_x[jk, ii, 1])
120
            normal_direction_2 = 0.5 * (contravariant_vectors_x[jk, i, 2] +
121
                                  contravariant_vectors_x[jk, ii, 2])
122
            normal_direction_3 = 0.5 * (contravariant_vectors_x[jk, i, 3] +
123
                                  contravariant_vectors_x[jk, ii, 3])
124

125
            v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
126
                         v3_ll * normal_direction_3
127
            v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
128
                         v3_rr * normal_direction_3
129

130
            # Compute required mean values
131
            rho_avg = 0.5 * (rho_ll + rho_rr)
132
            v1_avg = 0.5 * (v1_ll + v1_rr)
133
            v2_avg = 0.5 * (v2_ll + v2_rr)
134
            v3_avg = 0.5 * (v3_ll + v3_rr)
135
            v_dot_n_avg = 0.5 * (v_dot_n_ll + v_dot_n_rr)
136
            p_avg = 0.5 * (p_ll + p_rr)
137
            velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
138

139
            # Calculate fluxes depending on normal_direction
140
            f1 = rho_avg * v_dot_n_avg
141
            f2 = f1 * v1_avg + p_avg * normal_direction_1
142
            f3 = f1 * v2_avg + p_avg * normal_direction_2
143
            f4 = f1 * v3_avg + p_avg * normal_direction_3
144
            f5 = (f1 * velocity_square_avg +
145
                  p_avg * v_dot_n_avg * equations.inv_gamma_minus_one
146
                  + 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
147

148
            # Add scaled fluxes to RHS
149
            factor_i = alpha * derivative_split[i, ii]
150
            du_permuted[jk, i, 1] += factor_i * f1
151
            du_permuted[jk, i, 2] += factor_i * f2
152
            du_permuted[jk, i, 3] += factor_i * f3
153
            du_permuted[jk, i, 4] += factor_i * f4
154
            du_permuted[jk, i, 5] += factor_i * f5
155

156
            factor_ii = alpha * derivative_split[ii, i]
157
            du_permuted[jk, ii, 1] += factor_ii * f1
158
            du_permuted[jk, ii, 2] += factor_ii * f2
159
            du_permuted[jk, ii, 3] += factor_ii * f3
160
            du_permuted[jk, ii, 4] += factor_ii * f4
161
            du_permuted[jk, ii, 5] += factor_ii * f5
162
        end
163
    end
164

165
    @turbo for v in eachvariable(equations),
166
               k in eachnode(dg),
167
               j in eachnode(dg),
168
               i in eachnode(dg)
169

170
        jk = j + nnodes(dg) * (k - 1)
171
        du[i, j, k, v] = du_permuted[jk, i, v]
172
    end
173

174
    # y direction
175
    # We must also permute the contravariant vectors.
176
    contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,
177
                                                                         (StaticInt(nnodes(dg)),
178
                                                                          StaticInt(nnodes(dg)),
179
                                                                          StaticInt(nnodes(dg)),
180
                                                                          StaticInt(ndims(MeshT))))
181

182
    @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
183
        contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, element]
184
        contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k, element]
185
        contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k, element]
186
    end
187

188
    # A possible permutation of array dimensions with improved opportunities for
189
    # SIMD vectorization appeared to be slower than the direct version used here
190
    # in preliminary numerical experiments on an AVX2 system.
191
    for j in eachnode(dg), jj in (j + 1):nnodes(dg)
192
        @turbo for k in eachnode(dg), i in eachnode(dg)
193
            rho_ll = u_prim[i, j, k, 1]
194
            v1_ll = u_prim[i, j, k, 2]
195
            v2_ll = u_prim[i, j, k, 3]
196
            v3_ll = u_prim[i, j, k, 4]
197
            p_ll = u_prim[i, j, k, 5]
198

199
            rho_rr = u_prim[i, jj, k, 1]
200
            v1_rr = u_prim[i, jj, k, 2]
201
            v2_rr = u_prim[i, jj, k, 3]
202
            v3_rr = u_prim[i, jj, k, 4]
203
            p_rr = u_prim[i, jj, k, 5]
204

205
            normal_direction_1 = 0.5 * (contravariant_vectors_y[i, j, k, 1] +
206
                                  contravariant_vectors_y[i, jj, k, 1])
207
            normal_direction_2 = 0.5 * (contravariant_vectors_y[i, j, k, 2] +
208
                                  contravariant_vectors_y[i, jj, k, 2])
209
            normal_direction_3 = 0.5 * (contravariant_vectors_y[i, j, k, 3] +
210
                                  contravariant_vectors_y[i, jj, k, 3])
211

212
            v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
213
                         v3_ll * normal_direction_3
214
            v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
215
                         v3_rr * normal_direction_3
216

217
            # Compute required mean values
218
            rho_avg = 0.5 * (rho_ll + rho_rr)
219
            v1_avg = 0.5 * (v1_ll + v1_rr)
220
            v2_avg = 0.5 * (v2_ll + v2_rr)
221
            v3_avg = 0.5 * (v3_ll + v3_rr)
222
            v_dot_n_avg = 0.5 * (v_dot_n_ll + v_dot_n_rr)
223
            p_avg = 0.5 * (p_ll + p_rr)
224
            velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
225

226
            # Calculate fluxes depending on normal_direction
227
            f1 = rho_avg * v_dot_n_avg
228
            f2 = f1 * v1_avg + p_avg * normal_direction_1
229
            f3 = f1 * v2_avg + p_avg * normal_direction_2
230
            f4 = f1 * v3_avg + p_avg * normal_direction_3
231
            f5 = (f1 * velocity_square_avg +
232
                  p_avg * v_dot_n_avg * equations.inv_gamma_minus_one
233
                  + 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
234

235
            # Add scaled fluxes to RHS
236
            factor_j = alpha * derivative_split[j, jj]
237
            du[i, j, k, 1] += factor_j * f1
238
            du[i, j, k, 2] += factor_j * f2
239
            du[i, j, k, 3] += factor_j * f3
240
            du[i, j, k, 4] += factor_j * f4
241
            du[i, j, k, 5] += factor_j * f5
242

243
            factor_jj = alpha * derivative_split[jj, j]
244
            du[i, jj, k, 1] += factor_jj * f1
245
            du[i, jj, k, 2] += factor_jj * f2
246
            du[i, jj, k, 3] += factor_jj * f3
247
            du[i, jj, k, 4] += factor_jj * f4
248
            du[i, jj, k, 5] += factor_jj * f5
249
        end
250
    end
251

252
    # z direction
253
    # The memory layout is already optimal for SIMD vectorization in this loop.
254
    # We just squeeze the first two dimensions to make the code slightly faster.
255
    GC.@preserve u_prim begin
256
        u_prim_reshaped = PtrArray(pointer(u_prim),
257
                                   (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
258
                                    StaticInt(nvariables(equations))))
259

260
        du_reshaped = PtrArray(pointer(du),
261
                               (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
262
                                StaticInt(nvariables(equations))))
263

264
        # We must also permute the contravariant vectors.
265
        contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef,
266
                                                                             (StaticInt(nnodes(dg)^2),
267
                                                                              StaticInt(nnodes(dg)),
268
                                                                              StaticInt(ndims(MeshT))))
269

270
        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
271
            ij = i + nnodes(dg) * (j - 1)
272
            contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k,
273
                                                                      element]
274
            contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k,
275
                                                                      element]
276
            contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k,
277
                                                                      element]
278
        end
279

280
        for k in eachnode(dg), kk in (k + 1):nnodes(dg)
281
            @turbo for ij in Base.OneTo(nnodes(dg)^2)
282
                rho_ll = u_prim_reshaped[ij, k, 1]
283
                v1_ll = u_prim_reshaped[ij, k, 2]
284
                v2_ll = u_prim_reshaped[ij, k, 3]
285
                v3_ll = u_prim_reshaped[ij, k, 4]
286
                p_ll = u_prim_reshaped[ij, k, 5]
287

288
                rho_rr = u_prim_reshaped[ij, kk, 1]
289
                v1_rr = u_prim_reshaped[ij, kk, 2]
290
                v2_rr = u_prim_reshaped[ij, kk, 3]
291
                v3_rr = u_prim_reshaped[ij, kk, 4]
292
                p_rr = u_prim_reshaped[ij, kk, 5]
293

294
                normal_direction_1 = 0.5 * (contravariant_vectors_z[ij, k, 1] +
295
                                      contravariant_vectors_z[ij, kk, 1])
296
                normal_direction_2 = 0.5 * (contravariant_vectors_z[ij, k, 2] +
297
                                      contravariant_vectors_z[ij, kk, 2])
298
                normal_direction_3 = 0.5 * (contravariant_vectors_z[ij, k, 3] +
299
                                      contravariant_vectors_z[ij, kk, 3])
300

301
                v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
302
                             v3_ll * normal_direction_3
303
                v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
304
                             v3_rr * normal_direction_3
305

306
                # Compute required mean values
307
                rho_avg = 0.5 * (rho_ll + rho_rr)
308
                v1_avg = 0.5 * (v1_ll + v1_rr)
309
                v2_avg = 0.5 * (v2_ll + v2_rr)
310
                v3_avg = 0.5 * (v3_ll + v3_rr)
311
                v_dot_n_avg = 0.5 * (v_dot_n_ll + v_dot_n_rr)
312
                p_avg = 0.5 * (p_ll + p_rr)
313
                velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
314

315
                # Calculate fluxes depending on normal_direction
316
                f1 = rho_avg * v_dot_n_avg
317
                f2 = f1 * v1_avg + p_avg * normal_direction_1
318
                f3 = f1 * v2_avg + p_avg * normal_direction_2
319
                f4 = f1 * v3_avg + p_avg * normal_direction_3
320
                f5 = (f1 * velocity_square_avg +
321
                      p_avg * v_dot_n_avg * equations.inv_gamma_minus_one
322
                      + 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
323

324
                # Add scaled fluxes to RHS
325
                factor_k = alpha * derivative_split[k, kk]
326
                du_reshaped[ij, k, 1] += factor_k * f1
327
                du_reshaped[ij, k, 2] += factor_k * f2
328
                du_reshaped[ij, k, 3] += factor_k * f3
329
                du_reshaped[ij, k, 4] += factor_k * f4
330
                du_reshaped[ij, k, 5] += factor_k * f5
331

332
                factor_kk = alpha * derivative_split[kk, k]
333
                du_reshaped[ij, kk, 1] += factor_kk * f1
334
                du_reshaped[ij, kk, 2] += factor_kk * f2
335
                du_reshaped[ij, kk, 3] += factor_kk * f3
336
                du_reshaped[ij, kk, 4] += factor_kk * f4
337
                du_reshaped[ij, kk, 5] += factor_kk * f5
338
            end
339
        end
340
    end # GC.@preserve u_prim begin
341

342
    # Finally, we add the temporary RHS computed here to the global RHS in the
343
    # given `element`.
344
    @turbo for v in eachvariable(equations),
345
               k in eachnode(dg),
346
               j in eachnode(dg),
347
               i in eachnode(dg)
348

349
        _du[v, i, j, k, element] += du[i, j, k, v]
350
    end
351
end
352

353
@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
354
                                           element,
355
                                           MeshT::Type{<:Union{StructuredMesh{3},
356
                                                               P4estMesh{3}}},
357
                                           have_nonconservative_terms::False,
358
                                           equations::CompressibleEulerEquations3D,
359
                                           volume_flux::typeof(flux_ranocha_turbo),
360
                                           dg::DGSEM, cache, alpha)
361
    @unpack derivative_split = dg.basis
362
    @unpack contravariant_vectors = cache.elements
363

364
    # Create a temporary array that will be used to store the RHS with permuted
365
    # indices `[i, j, k, v]` to allow using SIMD instructions.
366
    # `StrideArray`s with purely static dimensions do not allocate on the heap.
367
    du = StrideArray{eltype(u_cons)}(undef,
368
                                     (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,
369
                                      StaticInt(nvariables(equations))))
370

371
    # Convert conserved to primitive variables on the given `element`. In addition
372
    # to the usual primitive variables, we also compute logarithms of the density
373
    # and pressure to increase the performance of the required logarithmic mean
374
    # values.
375
    u_prim = StrideArray{eltype(u_cons)}(undef,
376
                                         (ntuple(_ -> StaticInt(nnodes(dg)),
377
                                                 ndims(MeshT))...,
378
                                          StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs
379

380
    @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
381
        rho = u_cons[1, i, j, k, element]
382
        rho_v1 = u_cons[2, i, j, k, element]
383
        rho_v2 = u_cons[3, i, j, k, element]
384
        rho_v3 = u_cons[4, i, j, k, element]
385
        rho_e_total = u_cons[5, i, j, k, element]
386

387
        v1 = rho_v1 / rho
388
        v2 = rho_v2 / rho
389
        v3 = rho_v3 / rho
390
        p = (equations.gamma - 1) *
391
            (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))
392

393
        u_prim[i, j, k, 1] = rho
394
        u_prim[i, j, k, 2] = v1
395
        u_prim[i, j, k, 3] = v2
396
        u_prim[i, j, k, 4] = v3
397
        u_prim[i, j, k, 5] = p
398
        u_prim[i, j, k, 6] = log(rho)
399
        u_prim[i, j, k, 7] = log(p)
400
    end
401

402
    # x direction
403
    # At first, we create new temporary arrays with permuted memory layout to
404
    # allow using SIMD instructions along the first dimension (which is contiguous
405
    # in memory).
406
    du_permuted = StrideArray{eltype(u_cons)}(undef,
407
                                              (StaticInt(nnodes(dg)^2),
408
                                               StaticInt(nnodes(dg)),
409
                                               StaticInt(nvariables(equations))))
410

411
    u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
412
                                                  (StaticInt(nnodes(dg)^2),
413
                                                   StaticInt(nnodes(dg)),
414
                                                   StaticInt(nvariables(equations) + 2)))
415

416
    @turbo for v in indices(u_prim, 4), # v in eachvariable(equations) misses +2 logs
417
               k in eachnode(dg),
418
               j in eachnode(dg),
419
               i in eachnode(dg)
420

421
        jk = j + nnodes(dg) * (k - 1)
422
        u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
423
    end
424
    fill!(du_permuted, zero(eltype(du_permuted)))
425

426
    # We must also permute the contravariant vectors.
427
    contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,
428
                                                                         (StaticInt(nnodes(dg)^2),
429
                                                                          StaticInt(nnodes(dg)),
430
                                                                          StaticInt(ndims(MeshT))))
431

432
    @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
433
        jk = j + nnodes(dg) * (k - 1)
434
        contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k, element]
435
        contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k, element]
436
        contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k, element]
437
    end
438

439
    # Next, we basically inline the volume flux. To allow SIMD vectorization and
440
    # still use the symmetry of the volume flux and the derivative matrix, we
441
    # loop over the triangular part in an outer loop and use a plain inner loop.
442
    for i in eachnode(dg), ii in (i + 1):nnodes(dg)
443
        @turbo for jk in Base.OneTo(nnodes(dg)^2)
444
            rho_ll = u_prim_permuted[jk, i, 1]
445
            v1_ll = u_prim_permuted[jk, i, 2]
446
            v2_ll = u_prim_permuted[jk, i, 3]
447
            v3_ll = u_prim_permuted[jk, i, 4]
448
            p_ll = u_prim_permuted[jk, i, 5]
449
            log_rho_ll = u_prim_permuted[jk, i, 6]
450
            log_p_ll = u_prim_permuted[jk, i, 7]
451

452
            rho_rr = u_prim_permuted[jk, ii, 1]
453
            v1_rr = u_prim_permuted[jk, ii, 2]
454
            v2_rr = u_prim_permuted[jk, ii, 3]
455
            v3_rr = u_prim_permuted[jk, ii, 4]
456
            p_rr = u_prim_permuted[jk, ii, 5]
457
            log_rho_rr = u_prim_permuted[jk, ii, 6]
458
            log_p_rr = u_prim_permuted[jk, ii, 7]
459

460
            normal_direction_1 = 0.5 * (contravariant_vectors_x[jk, i, 1] +
461
                                  contravariant_vectors_x[jk, ii, 1])
462
            normal_direction_2 = 0.5 * (contravariant_vectors_x[jk, i, 2] +
463
                                  contravariant_vectors_x[jk, ii, 2])
464
            normal_direction_3 = 0.5 * (contravariant_vectors_x[jk, i, 3] +
465
                                  contravariant_vectors_x[jk, ii, 3])
466

467
            v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
468
                         v3_ll * normal_direction_3
469
            v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
470
                         v3_rr * normal_direction_3
471

472
            # Compute required mean values
473
            # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
474
            # it efficiently. This is equivalent to
475
            #   rho_mean = ln_mean(rho_ll, rho_rr)
476
            x1 = rho_ll
477
            log_x1 = log_rho_ll
478
            y1 = rho_rr
479
            log_y1 = log_rho_rr
480
            x1_plus_y1 = x1 + y1
481
            y1_minus_x1 = y1 - x1
482
            z1 = y1_minus_x1^2 / x1_plus_y1^2
483
            special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
484
            regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
485
            rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)
486

487
            # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
488
            # in exact arithmetic since
489
            #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
490
            #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
491
            # inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
492
            x2 = rho_ll * p_rr
493
            log_x2 = log_rho_ll + log_p_rr
494
            y2 = rho_rr * p_ll
495
            log_y2 = log_rho_rr + log_p_ll
496
            x2_plus_y2 = x2 + y2
497
            y2_minus_x2 = y2 - x2
498
            z2 = y2_minus_x2^2 / x2_plus_y2^2
499
            special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
500
            regular_path2 = (log_y2 - log_x2) / y2_minus_x2
501
            inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)
502

503
            v1_avg = 0.5 * (v1_ll + v1_rr)
504
            v2_avg = 0.5 * (v2_ll + v2_rr)
505
            v3_avg = 0.5 * (v3_ll + v3_rr)
506
            p_avg = 0.5 * (p_ll + p_rr)
507
            velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
508

509
            # Calculate fluxes depending on normal_direction
510
            f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)
511
            f2 = f1 * v1_avg + p_avg * normal_direction_1
512
            f3 = f1 * v2_avg + p_avg * normal_direction_2
513
            f4 = f1 * v3_avg + p_avg * normal_direction_3
514
            f5 = (f1 *
515
                  (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)
516
                  +
517
                  0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
518

519
            # Add scaled fluxes to RHS
520
            factor_i = alpha * derivative_split[i, ii]
521
            du_permuted[jk, i, 1] += factor_i * f1
522
            du_permuted[jk, i, 2] += factor_i * f2
523
            du_permuted[jk, i, 3] += factor_i * f3
524
            du_permuted[jk, i, 4] += factor_i * f4
525
            du_permuted[jk, i, 5] += factor_i * f5
526

527
            factor_ii = alpha * derivative_split[ii, i]
528
            du_permuted[jk, ii, 1] += factor_ii * f1
529
            du_permuted[jk, ii, 2] += factor_ii * f2
530
            du_permuted[jk, ii, 3] += factor_ii * f3
531
            du_permuted[jk, ii, 4] += factor_ii * f4
532
            du_permuted[jk, ii, 5] += factor_ii * f5
533
        end
534
    end
535

536
    @turbo for v in eachvariable(equations),
537
               k in eachnode(dg),
538
               j in eachnode(dg),
539
               i in eachnode(dg)
540

541
        jk = j + nnodes(dg) * (k - 1)
542
        du[i, j, k, v] = du_permuted[jk, i, v]
543
    end
544

545
    # y direction
546
    # We must also permute the contravariant vectors.
547
    contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,
548
                                                                         (StaticInt(nnodes(dg)),
549
                                                                          StaticInt(nnodes(dg)),
550
                                                                          StaticInt(nnodes(dg)),
551
                                                                          StaticInt(ndims(MeshT))))
552

553
    @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
554
        contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, element]
555
        contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k, element]
556
        contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k, element]
557
    end
558

559
    # A possible permutation of array dimensions with improved opportunities for
560
    # SIMD vectorization appeared to be slower than the direct version used here
561
    # in preliminary numerical experiments on an AVX2 system.
562
    for j in eachnode(dg), jj in (j + 1):nnodes(dg)
563
        @turbo for k in eachnode(dg), i in eachnode(dg)
564
            rho_ll = u_prim[i, j, k, 1]
565
            v1_ll = u_prim[i, j, k, 2]
566
            v2_ll = u_prim[i, j, k, 3]
567
            v3_ll = u_prim[i, j, k, 4]
568
            p_ll = u_prim[i, j, k, 5]
569
            log_rho_ll = u_prim[i, j, k, 6]
570
            log_p_ll = u_prim[i, j, k, 7]
571

572
            rho_rr = u_prim[i, jj, k, 1]
573
            v1_rr = u_prim[i, jj, k, 2]
574
            v2_rr = u_prim[i, jj, k, 3]
575
            v3_rr = u_prim[i, jj, k, 4]
576
            p_rr = u_prim[i, jj, k, 5]
577
            log_rho_rr = u_prim[i, jj, k, 6]
578
            log_p_rr = u_prim[i, jj, k, 7]
579

580
            normal_direction_1 = 0.5 * (contravariant_vectors_y[i, j, k, 1] +
581
                                  contravariant_vectors_y[i, jj, k, 1])
582
            normal_direction_2 = 0.5 * (contravariant_vectors_y[i, j, k, 2] +
583
                                  contravariant_vectors_y[i, jj, k, 2])
584
            normal_direction_3 = 0.5 * (contravariant_vectors_y[i, j, k, 3] +
585
                                  contravariant_vectors_y[i, jj, k, 3])
586

587
            v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
588
                         v3_ll * normal_direction_3
589
            v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
590
                         v3_rr * normal_direction_3
591

592
            # Compute required mean values
593
            # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
594
            # it efficiently. This is equivalent to
595
            #   rho_mean = ln_mean(rho_ll, rho_rr)
596
            x1 = rho_ll
597
            log_x1 = log_rho_ll
598
            y1 = rho_rr
599
            log_y1 = log_rho_rr
600
            x1_plus_y1 = x1 + y1
601
            y1_minus_x1 = y1 - x1
602
            z1 = y1_minus_x1^2 / x1_plus_y1^2
603
            special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
604
            regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
605
            rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)
606

607
            # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
608
            # in exact arithmetic since
609
            #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
610
            #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
611
            # inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
612
            x2 = rho_ll * p_rr
613
            log_x2 = log_rho_ll + log_p_rr
614
            y2 = rho_rr * p_ll
615
            log_y2 = log_rho_rr + log_p_ll
616
            x2_plus_y2 = x2 + y2
617
            y2_minus_x2 = y2 - x2
618
            z2 = y2_minus_x2^2 / x2_plus_y2^2
619
            special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
620
            regular_path2 = (log_y2 - log_x2) / y2_minus_x2
621
            inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)
622

623
            v1_avg = 0.5 * (v1_ll + v1_rr)
624
            v2_avg = 0.5 * (v2_ll + v2_rr)
625
            v3_avg = 0.5 * (v3_ll + v3_rr)
626
            p_avg = 0.5 * (p_ll + p_rr)
627
            velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
628

629
            # Calculate fluxes depending on normal_direction
630
            f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)
631
            f2 = f1 * v1_avg + p_avg * normal_direction_1
632
            f3 = f1 * v2_avg + p_avg * normal_direction_2
633
            f4 = f1 * v3_avg + p_avg * normal_direction_3
634
            f5 = (f1 *
635
                  (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)
636
                  +
637
                  0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
638

639
            # Add scaled fluxes to RHS
640
            factor_j = alpha * derivative_split[j, jj]
641
            du[i, j, k, 1] += factor_j * f1
642
            du[i, j, k, 2] += factor_j * f2
643
            du[i, j, k, 3] += factor_j * f3
644
            du[i, j, k, 4] += factor_j * f4
645
            du[i, j, k, 5] += factor_j * f5
646

647
            factor_jj = alpha * derivative_split[jj, j]
648
            du[i, jj, k, 1] += factor_jj * f1
649
            du[i, jj, k, 2] += factor_jj * f2
650
            du[i, jj, k, 3] += factor_jj * f3
651
            du[i, jj, k, 4] += factor_jj * f4
652
            du[i, jj, k, 5] += factor_jj * f5
653
        end
654
    end
655

656
    # z direction
657
    # The memory layout is already optimal for SIMD vectorization in this loop.
658
    # We just squeeze the first two dimensions to make the code slightly faster.
659
    GC.@preserve u_prim begin
660
        u_prim_reshaped = PtrArray(pointer(u_prim),
661
                                   (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
662
                                    StaticInt(nvariables(equations) + 2)))
663

664
        du_reshaped = PtrArray(pointer(du),
665
                               (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
666
                                StaticInt(nvariables(equations))))
667

668
        # We must also permute the contravariant vectors.
669
        contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef,
670
                                                                             (StaticInt(nnodes(dg)^2),
671
                                                                              StaticInt(nnodes(dg)),
672
                                                                              StaticInt(ndims(MeshT))))
673

674
        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
675
            ij = i + nnodes(dg) * (j - 1)
676
            contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k,
677
                                                                      element]
678
            contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k,
679
                                                                      element]
680
            contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k,
681
                                                                      element]
682
        end
683

684
        for k in eachnode(dg), kk in (k + 1):nnodes(dg)
685
            @turbo for ij in Base.OneTo(nnodes(dg)^2)
686
                rho_ll = u_prim_reshaped[ij, k, 1]
687
                v1_ll = u_prim_reshaped[ij, k, 2]
688
                v2_ll = u_prim_reshaped[ij, k, 3]
689
                v3_ll = u_prim_reshaped[ij, k, 4]
690
                p_ll = u_prim_reshaped[ij, k, 5]
691
                log_rho_ll = u_prim_reshaped[ij, k, 6]
692
                log_p_ll = u_prim_reshaped[ij, k, 7]
693

694
                rho_rr = u_prim_reshaped[ij, kk, 1]
695
                v1_rr = u_prim_reshaped[ij, kk, 2]
696
                v2_rr = u_prim_reshaped[ij, kk, 3]
697
                v3_rr = u_prim_reshaped[ij, kk, 4]
698
                p_rr = u_prim_reshaped[ij, kk, 5]
699
                log_rho_rr = u_prim_reshaped[ij, kk, 6]
700
                log_p_rr = u_prim_reshaped[ij, kk, 7]
701

702
                normal_direction_1 = 0.5 * (contravariant_vectors_z[ij, k, 1] +
703
                                      contravariant_vectors_z[ij, kk, 1])
704
                normal_direction_2 = 0.5 * (contravariant_vectors_z[ij, k, 2] +
705
                                      contravariant_vectors_z[ij, kk, 2])
706
                normal_direction_3 = 0.5 * (contravariant_vectors_z[ij, k, 3] +
707
                                      contravariant_vectors_z[ij, kk, 3])
708

709
                v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
710
                             v3_ll * normal_direction_3
711
                v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
712
                             v3_rr * normal_direction_3
713

714
                # Compute required mean values
715
                # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
716
                # it efficiently. This is equivalent to
717
                #   rho_mean = ln_mean(rho_ll, rho_rr)
718
                x1 = rho_ll
719
                log_x1 = log_rho_ll
720
                y1 = rho_rr
721
                log_y1 = log_rho_rr
722
                x1_plus_y1 = x1 + y1
723
                y1_minus_x1 = y1 - x1
724
                z1 = y1_minus_x1^2 / x1_plus_y1^2
725
                special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
726
                regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
727
                rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)
728

729
                # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
730
                # in exact arithmetic since
731
                #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
732
                #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
733
                # inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
734
                x2 = rho_ll * p_rr
735
                log_x2 = log_rho_ll + log_p_rr
736
                y2 = rho_rr * p_ll
737
                log_y2 = log_rho_rr + log_p_ll
738
                x2_plus_y2 = x2 + y2
739
                y2_minus_x2 = y2 - x2
740
                z2 = y2_minus_x2^2 / x2_plus_y2^2
741
                special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
742
                regular_path2 = (log_y2 - log_x2) / y2_minus_x2
743
                inv_rho_p_mean = p_ll * p_rr *
744
                                 ifelse(z2 < 1.0e-4, special_path2, regular_path2)
745

746
                v1_avg = 0.5 * (v1_ll + v1_rr)
747
                v2_avg = 0.5 * (v2_ll + v2_rr)
748
                v3_avg = 0.5 * (v3_ll + v3_rr)
749
                p_avg = 0.5 * (p_ll + p_rr)
750
                velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
751

752
                # Calculate fluxes depending on normal_direction
753
                f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)
754
                f2 = f1 * v1_avg + p_avg * normal_direction_1
755
                f3 = f1 * v2_avg + p_avg * normal_direction_2
756
                f4 = f1 * v3_avg + p_avg * normal_direction_3
757
                f5 = (f1 *
758
                      (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)
759
                      +
760
                      0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
761

762
                # Add scaled fluxes to RHS
763
                factor_k = alpha * derivative_split[k, kk]
764
                du_reshaped[ij, k, 1] += factor_k * f1
765
                du_reshaped[ij, k, 2] += factor_k * f2
766
                du_reshaped[ij, k, 3] += factor_k * f3
767
                du_reshaped[ij, k, 4] += factor_k * f4
768
                du_reshaped[ij, k, 5] += factor_k * f5
769

770
                factor_kk = alpha * derivative_split[kk, k]
771
                du_reshaped[ij, kk, 1] += factor_kk * f1
772
                du_reshaped[ij, kk, 2] += factor_kk * f2
773
                du_reshaped[ij, kk, 3] += factor_kk * f3
774
                du_reshaped[ij, kk, 4] += factor_kk * f4
775
                du_reshaped[ij, kk, 5] += factor_kk * f5
776
            end
777
        end
778
    end # GC.@preserve u_prim begin
779

780
    # Finally, we add the temporary RHS computed here to the global RHS in the
781
    # given `element`.
782
    @turbo for v in eachvariable(equations),
783
               k in eachnode(dg),
784
               j in eachnode(dg),
785
               i in eachnode(dg)
786

787
        _du[v, i, j, k, element] += du[i, j, k, v]
788
    end
789
end
790

791
Product

Resources

Company