Path: blob/main/src/solvers/dgsem_structured/dg_2d_compressible_euler.jl
5590 views
1# From here on, this file contains specializations of DG methods on the2# curved 3D meshes `StructuredMesh{3}, P4estMesh{3}` to the compressible3# Euler equations.4#5# The specialized methods contain relatively verbose and ugly code in the sense6# that we do not use the common "pointwise physics" interface of Trixi.jl.7# However, this is currently (November 2021) necessary to get a significant8# speed-up by using SIMD instructions via LoopVectorization.jl.9#10# TODO: SIMD. We could get even more speed-up if we did not need to permute11# array dimensions, i.e., if we changed the basic memory layout.12#13# We do not wrap this code in `@muladd begin ... end` block. Optimizations like14# this are handled automatically by LoopVectorization.jl.1516# We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array`17# if LoopVectorization.jl can handle the array types. This ensures that `@turbo`18# works efficiently here.19@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,20element,21MeshT::Type{<:Union{StructuredMesh{2},22UnstructuredMesh2D,23P4estMesh{2}}},24have_nonconservative_terms::False,25equations::CompressibleEulerEquations2D,26volume_flux::typeof(flux_shima_etal_turbo),27dg::DGSEM, cache, alpha)28@unpack derivative_split = dg.basis29@unpack contravariant_vectors = cache.elements3031# Create a temporary array that will be used to store the RHS with permuted32# indices `[i, j, v]` to allow using SIMD instructions.33# `StrideArray`s with purely static dimensions do not allocate on the heap.34du = StrideArray{eltype(u_cons)}(undef,35(ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,36StaticInt(nvariables(equations))))3738# Convert conserved to primitive variables on the given `element`.39u_prim = StrideArray{eltype(u_cons)}(undef,40(ntuple(_ -> StaticInt(nnodes(dg)),41ndims(MeshT))...,42StaticInt(nvariables(equations))))4344@turbo for j in eachnode(dg), i in eachnode(dg)45rho = u_cons[1, i, j, element]46rho_v1 = u_cons[2, i, j, element]47rho_v2 = u_cons[3, i, j, element]48rho_e_total = u_cons[4, i, j, element]4950v1 = rho_v1 / rho51v2 = rho_v2 / rho52p = (equations.gamma - 1) * (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2))5354u_prim[i, j, 1] = rho55u_prim[i, j, 2] = v156u_prim[i, j, 3] = v257u_prim[i, j, 4] = p58end5960# x direction61# At first, we create new temporary arrays with permuted memory layout to62# allow using SIMD instructions along the first dimension (which is contiguous63# in memory).64du_permuted = StrideArray{eltype(u_cons)}(undef,65(StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),66StaticInt(nvariables(equations))))6768u_prim_permuted = StrideArray{eltype(u_cons)}(undef,69(StaticInt(nnodes(dg)),70StaticInt(nnodes(dg)),71StaticInt(nvariables(equations))))7273@turbo for v in eachvariable(equations),74j in eachnode(dg),75i in eachnode(dg)7677u_prim_permuted[j, i, v] = u_prim[i, j, v]78end79fill!(du_permuted, zero(eltype(du_permuted)))8081# We must also permute the contravariant vectors.82contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,83(StaticInt(nnodes(dg)),84StaticInt(nnodes(dg)),85StaticInt(ndims(MeshT))))8687@turbo for j in eachnode(dg), i in eachnode(dg)88contravariant_vectors_x[j, i, 1] = contravariant_vectors[1, 1, i, j, element]89contravariant_vectors_x[j, i, 2] = contravariant_vectors[2, 1, i, j, element]90end9192# Next, we basically inline the volume flux. To allow SIMD vectorization and93# still use the symmetry of the volume flux and the derivative matrix, we94# loop over the triangular part in an outer loop and use a plain inner loop.95for i in eachnode(dg), ii in (i + 1):nnodes(dg)96@turbo for j in eachnode(dg)97rho_ll = u_prim_permuted[j, i, 1]98v1_ll = u_prim_permuted[j, i, 2]99v2_ll = u_prim_permuted[j, i, 3]100p_ll = u_prim_permuted[j, i, 4]101102rho_rr = u_prim_permuted[j, ii, 1]103v1_rr = u_prim_permuted[j, ii, 2]104v2_rr = u_prim_permuted[j, ii, 3]105p_rr = u_prim_permuted[j, ii, 4]106107normal_direction_1 = 0.5 * (contravariant_vectors_x[j, i, 1] +108contravariant_vectors_x[j, ii, 1])109normal_direction_2 = 0.5 * (contravariant_vectors_x[j, i, 2] +110contravariant_vectors_x[j, ii, 2])111112v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2113v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2114115# Compute required mean values116rho_avg = 0.5 * (rho_ll + rho_rr)117v1_avg = 0.5 * (v1_ll + v1_rr)118v2_avg = 0.5 * (v2_ll + v2_rr)119v_dot_n_avg = 0.5 * (v_dot_n_ll + v_dot_n_rr)120p_avg = 0.5 * (p_ll + p_rr)121velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)122123# Calculate fluxes depending on normal_direction124f1 = rho_avg * v_dot_n_avg125f2 = f1 * v1_avg + p_avg * normal_direction_1126f3 = f1 * v2_avg + p_avg * normal_direction_2127f4 = (f1 * velocity_square_avg +128p_avg * v_dot_n_avg * equations.inv_gamma_minus_one129+ 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))130131# Add scaled fluxes to RHS132factor_i = alpha * derivative_split[i, ii]133du_permuted[j, i, 1] += factor_i * f1134du_permuted[j, i, 2] += factor_i * f2135du_permuted[j, i, 3] += factor_i * f3136du_permuted[j, i, 4] += factor_i * f4137138factor_ii = alpha * derivative_split[ii, i]139du_permuted[j, ii, 1] += factor_ii * f1140du_permuted[j, ii, 2] += factor_ii * f2141du_permuted[j, ii, 3] += factor_ii * f3142du_permuted[j, ii, 4] += factor_ii * f4143end144end145146@turbo for v in eachvariable(equations),147j in eachnode(dg),148i in eachnode(dg)149150du[i, j, v] = du_permuted[j, i, v]151end152153# y direction154# We must also permute the contravariant vectors.155contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,156(StaticInt(nnodes(dg)),157StaticInt(nnodes(dg)),158StaticInt(ndims(MeshT))))159160@turbo for j in eachnode(dg), i in eachnode(dg)161contravariant_vectors_y[i, j, 1] = contravariant_vectors[1, 2, i, j, element]162contravariant_vectors_y[i, j, 2] = contravariant_vectors[2, 2, i, j, element]163end164165# The memory layout is already optimal for SIMD vectorization in this loop.166for j in eachnode(dg), jj in (j + 1):nnodes(dg)167@turbo for i in eachnode(dg)168rho_ll = u_prim[i, j, 1]169v1_ll = u_prim[i, j, 2]170v2_ll = u_prim[i, j, 3]171p_ll = u_prim[i, j, 4]172173rho_rr = u_prim[i, jj, 1]174v1_rr = u_prim[i, jj, 2]175v2_rr = u_prim[i, jj, 3]176p_rr = u_prim[i, jj, 4]177178normal_direction_1 = 0.5 * (contravariant_vectors_y[i, j, 1] +179contravariant_vectors_y[i, jj, 1])180normal_direction_2 = 0.5 * (contravariant_vectors_y[i, j, 2] +181contravariant_vectors_y[i, jj, 2])182183v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2184v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2185186# Compute required mean values187rho_avg = 0.5 * (rho_ll + rho_rr)188v1_avg = 0.5 * (v1_ll + v1_rr)189v2_avg = 0.5 * (v2_ll + v2_rr)190v_dot_n_avg = 0.5 * (v_dot_n_ll + v_dot_n_rr)191p_avg = 0.5 * (p_ll + p_rr)192velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)193194# Calculate fluxes depending on normal_direction195f1 = rho_avg * v_dot_n_avg196f2 = f1 * v1_avg + p_avg * normal_direction_1197f3 = f1 * v2_avg + p_avg * normal_direction_2198f4 = (f1 * velocity_square_avg +199p_avg * v_dot_n_avg * equations.inv_gamma_minus_one200+ 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))201202# Add scaled fluxes to RHS203factor_j = alpha * derivative_split[j, jj]204du[i, j, 1] += factor_j * f1205du[i, j, 2] += factor_j * f2206du[i, j, 3] += factor_j * f3207du[i, j, 4] += factor_j * f4208209factor_jj = alpha * derivative_split[jj, j]210du[i, jj, 1] += factor_jj * f1211du[i, jj, 2] += factor_jj * f2212du[i, jj, 3] += factor_jj * f3213du[i, jj, 4] += factor_jj * f4214end215end216217# Finally, we add the temporary RHS computed here to the global RHS in the218# given `element`.219@turbo for v in eachvariable(equations),220j in eachnode(dg),221i in eachnode(dg)222223_du[v, i, j, element] += du[i, j, v]224end225end226227@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,228element,229MeshT::Type{<:Union{StructuredMesh{2},230UnstructuredMesh2D,231P4estMesh{2}}},232have_nonconservative_terms::False,233equations::CompressibleEulerEquations2D,234volume_flux::typeof(flux_ranocha_turbo),235dg::DGSEM, cache, alpha)236@unpack derivative_split = dg.basis237@unpack contravariant_vectors = cache.elements238239# Create a temporary array that will be used to store the RHS with permuted240# indices `[i, j, v]` to allow using SIMD instructions.241# `StrideArray`s with purely static dimensions do not allocate on the heap.242du = StrideArray{eltype(u_cons)}(undef,243(ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,244StaticInt(nvariables(equations))))245246# Convert conserved to primitive variables on the given `element`. In addition247# to the usual primitive variables, we also compute logarithms of the density248# and pressure to increase the performance of the required logarithmic mean249# values.250u_prim = StrideArray{eltype(u_cons)}(undef,251(ntuple(_ -> StaticInt(nnodes(dg)),252ndims(MeshT))...,253StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs254255@turbo for j in eachnode(dg), i in eachnode(dg)256rho = u_cons[1, i, j, element]257rho_v1 = u_cons[2, i, j, element]258rho_v2 = u_cons[3, i, j, element]259rho_e_total = u_cons[4, i, j, element]260261v1 = rho_v1 / rho262v2 = rho_v2 / rho263p = (equations.gamma - 1) * (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2))264265u_prim[i, j, 1] = rho266u_prim[i, j, 2] = v1267u_prim[i, j, 3] = v2268u_prim[i, j, 4] = p269u_prim[i, j, 5] = log(rho)270u_prim[i, j, 6] = log(p)271end272273# x direction274# At first, we create new temporary arrays with permuted memory layout to275# allow using SIMD instructions along the first dimension (which is contiguous276# in memory).277du_permuted = StrideArray{eltype(u_cons)}(undef,278(StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),279StaticInt(nvariables(equations))))280281u_prim_permuted = StrideArray{eltype(u_cons)}(undef,282(StaticInt(nnodes(dg)),283StaticInt(nnodes(dg)),284StaticInt(nvariables(equations) + 2)))285286@turbo for v in indices(u_prim, 3), # v in eachvariable(equations) misses +2 logs287j in eachnode(dg),288i in eachnode(dg)289290u_prim_permuted[j, i, v] = u_prim[i, j, v]291end292fill!(du_permuted, zero(eltype(du_permuted)))293294# We must also permute the contravariant vectors.295contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,296(StaticInt(nnodes(dg)),297StaticInt(nnodes(dg)),298StaticInt(ndims(MeshT))))299300@turbo for j in eachnode(dg), i in eachnode(dg)301contravariant_vectors_x[j, i, 1] = contravariant_vectors[1, 1, i, j, element]302contravariant_vectors_x[j, i, 2] = contravariant_vectors[2, 1, i, j, element]303end304305# Next, we basically inline the volume flux. To allow SIMD vectorization and306# still use the symmetry of the volume flux and the derivative matrix, we307# loop over the triangular part in an outer loop and use a plain inner loop.308for i in eachnode(dg), ii in (i + 1):nnodes(dg)309@turbo for j in eachnode(dg)310rho_ll = u_prim_permuted[j, i, 1]311v1_ll = u_prim_permuted[j, i, 2]312v2_ll = u_prim_permuted[j, i, 3]313p_ll = u_prim_permuted[j, i, 4]314log_rho_ll = u_prim_permuted[j, i, 5]315log_p_ll = u_prim_permuted[j, i, 6]316317rho_rr = u_prim_permuted[j, ii, 1]318v1_rr = u_prim_permuted[j, ii, 2]319v2_rr = u_prim_permuted[j, ii, 3]320p_rr = u_prim_permuted[j, ii, 4]321log_rho_rr = u_prim_permuted[j, ii, 5]322log_p_rr = u_prim_permuted[j, ii, 6]323324normal_direction_1 = 0.5 * (contravariant_vectors_x[j, i, 1] +325contravariant_vectors_x[j, ii, 1])326normal_direction_2 = 0.5 * (contravariant_vectors_x[j, i, 2] +327contravariant_vectors_x[j, ii, 2])328329v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2330v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2331332# Compute required mean values333# We inline the logarithmic mean to allow LoopVectorization.jl to optimize334# it efficiently. This is equivalent to335# rho_mean = ln_mean(rho_ll, rho_rr)336x1 = rho_ll337log_x1 = log_rho_ll338y1 = rho_rr339log_y1 = log_rho_rr340x1_plus_y1 = x1 + y1341y1_minus_x1 = y1 - x1342z1 = y1_minus_x1^2 / x1_plus_y1^2343special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))344regular_path1 = y1_minus_x1 / (log_y1 - log_x1)345rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)346347# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`348# in exact arithmetic since349# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)350# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)351# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)352x2 = rho_ll * p_rr353log_x2 = log_rho_ll + log_p_rr354y2 = rho_rr * p_ll355log_y2 = log_rho_rr + log_p_ll356x2_plus_y2 = x2 + y2357y2_minus_x2 = y2 - x2358z2 = y2_minus_x2^2 / x2_plus_y2^2359special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2360regular_path2 = (log_y2 - log_x2) / y2_minus_x2361inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)362363v1_avg = 0.5 * (v1_ll + v1_rr)364v2_avg = 0.5 * (v2_ll + v2_rr)365p_avg = 0.5 * (p_ll + p_rr)366velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)367368# Calculate fluxes depending on normal_direction369f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)370f2 = f1 * v1_avg + p_avg * normal_direction_1371f3 = f1 * v2_avg + p_avg * normal_direction_2372f4 = (f1 *373(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)374+3750.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))376377# Add scaled fluxes to RHS378factor_i = alpha * derivative_split[i, ii]379du_permuted[j, i, 1] += factor_i * f1380du_permuted[j, i, 2] += factor_i * f2381du_permuted[j, i, 3] += factor_i * f3382du_permuted[j, i, 4] += factor_i * f4383384factor_ii = alpha * derivative_split[ii, i]385du_permuted[j, ii, 1] += factor_ii * f1386du_permuted[j, ii, 2] += factor_ii * f2387du_permuted[j, ii, 3] += factor_ii * f3388du_permuted[j, ii, 4] += factor_ii * f4389end390end391392@turbo for v in eachvariable(equations),393j in eachnode(dg),394i in eachnode(dg)395396du[i, j, v] = du_permuted[j, i, v]397end398399# y direction400# We must also permute the contravariant vectors.401contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,402(StaticInt(nnodes(dg)),403StaticInt(nnodes(dg)),404StaticInt(ndims(MeshT))))405406@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)407contravariant_vectors_y[i, j, 1] = contravariant_vectors[1, 2, i, j, element]408contravariant_vectors_y[i, j, 2] = contravariant_vectors[2, 2, i, j, element]409end410411# The memory layout is already optimal for SIMD vectorization in this loop.412for j in eachnode(dg), jj in (j + 1):nnodes(dg)413@turbo for i in eachnode(dg)414rho_ll = u_prim[i, j, 1]415v1_ll = u_prim[i, j, 2]416v2_ll = u_prim[i, j, 3]417p_ll = u_prim[i, j, 4]418log_rho_ll = u_prim[i, j, 5]419log_p_ll = u_prim[i, j, 6]420421rho_rr = u_prim[i, jj, 1]422v1_rr = u_prim[i, jj, 2]423v2_rr = u_prim[i, jj, 3]424p_rr = u_prim[i, jj, 4]425log_rho_rr = u_prim[i, jj, 5]426log_p_rr = u_prim[i, jj, 6]427428normal_direction_1 = 0.5 * (contravariant_vectors_y[i, j, 1] +429contravariant_vectors_y[i, jj, 1])430normal_direction_2 = 0.5 * (contravariant_vectors_y[i, j, 2] +431contravariant_vectors_y[i, jj, 2])432433v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2434v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2435436# Compute required mean values437# We inline the logarithmic mean to allow LoopVectorization.jl to optimize438# it efficiently. This is equivalent to439# rho_mean = ln_mean(rho_ll, rho_rr)440x1 = rho_ll441log_x1 = log_rho_ll442y1 = rho_rr443log_y1 = log_rho_rr444x1_plus_y1 = x1 + y1445y1_minus_x1 = y1 - x1446z1 = y1_minus_x1^2 / x1_plus_y1^2447special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))448regular_path1 = y1_minus_x1 / (log_y1 - log_x1)449rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)450451# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`452# in exact arithmetic since453# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)454# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)455# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)456x2 = rho_ll * p_rr457log_x2 = log_rho_ll + log_p_rr458y2 = rho_rr * p_ll459log_y2 = log_rho_rr + log_p_ll460x2_plus_y2 = x2 + y2461y2_minus_x2 = y2 - x2462z2 = y2_minus_x2^2 / x2_plus_y2^2463special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2464regular_path2 = (log_y2 - log_x2) / y2_minus_x2465inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)466467v1_avg = 0.5 * (v1_ll + v1_rr)468v2_avg = 0.5 * (v2_ll + v2_rr)469p_avg = 0.5 * (p_ll + p_rr)470velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)471472# Calculate fluxes depending on normal_direction473f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)474f2 = f1 * v1_avg + p_avg * normal_direction_1475f3 = f1 * v2_avg + p_avg * normal_direction_2476f4 = (f1 *477(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)478+4790.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))480481# Add scaled fluxes to RHS482factor_j = alpha * derivative_split[j, jj]483du[i, j, 1] += factor_j * f1484du[i, j, 2] += factor_j * f2485du[i, j, 3] += factor_j * f3486du[i, j, 4] += factor_j * f4487488factor_jj = alpha * derivative_split[jj, j]489du[i, jj, 1] += factor_jj * f1490du[i, jj, 2] += factor_jj * f2491du[i, jj, 3] += factor_jj * f3492du[i, jj, 4] += factor_jj * f4493end494end495496# Finally, we add the temporary RHS computed here to the global RHS in the497# given `element`.498@turbo for v in eachvariable(equations),499j in eachnode(dg),500i in eachnode(dg)501502_du[v, i, j, element] += du[i, j, v]503end504end505506507