# Path: src/solvers/dgsem_tree/dg_3d_compressible_euler.jl
# From here on, this file contains specializations of DG methods on the
# TreeMesh3D to the compressible Euler equations.
#
# The specialized methods contain relatively verbose and ugly code in the sense
# that we do not use the common "pointwise physics" interface of Trixi.jl.
# However, this is currently (November 2021) necessary to get a significant
# speed-up by using SIMD instructions via LoopVectorization.jl.
#
# TODO: SIMD. We could get even more speed-up if we did not need to permute
# array dimensions, i.e., if we changed the basic memory layout.
#
# We do not wrap this code in `@muladd begin ... end` block. Optimizations like
# this are handled automatically by LoopVectorization.jl.

# We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array`
# if LoopVectorization.jl can handle the array types. This ensures that `@turbo`
# works efficiently here.
@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
                                           element, MeshT::Type{<:TreeMesh{3}},
                                           have_nonconservative_terms::False,
                                           equations::CompressibleEulerEquations3D,
                                           volume_flux::typeof(flux_shima_etal_turbo),
                                           dg::DGSEM, cache, alpha)
    @unpack derivative_split = dg.basis

    # Create a temporary array that will be used to store the RHS with permuted
    # indices `[i, j, k, v]` to allow using SIMD instructions.
    # `StrideArray`s with purely static dimensions do not allocate on the heap.
    du = StrideArray{eltype(u_cons)}(undef,
                                     (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,
                                      StaticInt(nvariables(equations))))

    # Convert conserved to primitive variables on the given `element`.
    u_prim = StrideArray{eltype(u_cons)}(undef,
                                         (ntuple(_ -> StaticInt(nnodes(dg)),
                                                 ndims(MeshT))...,
                                          StaticInt(nvariables(equations))))

    @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
        rho = u_cons[1, i, j, k, element]
        rho_v1 = u_cons[2, i, j, k, element]
        rho_v2 = u_cons[3, i, j, k, element]
        rho_v3 = u_cons[4, i, j, k, element]
        rho_e_total = u_cons[5, i, j, k, element]

        v1 = rho_v1 / rho
        v2 = rho_v2 / rho
        v3 = rho_v3 / rho
        p = (equations.gamma - 1) *
            (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))

        u_prim[i, j, k, 1] = rho
        u_prim[i, j, k, 2] = v1
        u_prim[i, j, k, 3] = v2
        u_prim[i, j, k, 4] = v3
        u_prim[i, j, k, 5] = p
    end

    # x direction
    # At first, we create new temporary arrays with permuted memory layout to
    # allow using SIMD instructions along the first dimension (which is contiguous
    # in memory).
    du_permuted = StrideArray{eltype(u_cons)}(undef,
                                              (StaticInt(nnodes(dg)^2),
                                               StaticInt(nnodes(dg)),
                                               StaticInt(nvariables(equations))))

    u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
                                                  (StaticInt(nnodes(dg)^2),
                                                   StaticInt(nnodes(dg)),
                                                   StaticInt(nvariables(equations))))

    @turbo for v in eachvariable(equations),
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        jk = j + nnodes(dg) * (k - 1)
        u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
    end
    fill!(du_permuted, zero(eltype(du_permuted)))

    # Next, we basically inline the volume flux. To allow SIMD vectorization and
    # still use the symmetry of the volume flux and the derivative matrix, we
    # loop over the triangular part in an outer loop and use a plain inner loop.
    for i in eachnode(dg), ii in (i + 1):nnodes(dg)
        @turbo for jk in Base.OneTo(nnodes(dg)^2)
            rho_ll = u_prim_permuted[jk, i, 1]
            v1_ll = u_prim_permuted[jk, i, 2]
            v2_ll = u_prim_permuted[jk, i, 3]
            v3_ll = u_prim_permuted[jk, i, 4]
            p_ll = u_prim_permuted[jk, i, 5]

            rho_rr = u_prim_permuted[jk, ii, 1]
            v1_rr = u_prim_permuted[jk, ii, 2]
            v2_rr = u_prim_permuted[jk, ii, 3]
            v3_rr = u_prim_permuted[jk, ii, 4]
            p_rr = u_prim_permuted[jk, ii, 5]

            # Compute required mean values
            rho_avg = 0.5 * (rho_ll + rho_rr)
            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            v3_avg = 0.5 * (v3_ll + v3_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
            pv1_avg = 0.5 * (p_ll * v1_rr + p_rr * v1_ll)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_avg * v1_avg
            f2 = f1 * v1_avg + p_avg
            f3 = f1 * v2_avg
            f4 = f1 * v3_avg
            f5 = p_avg * v1_avg * equations.inv_gamma_minus_one + f1 * kin_avg + pv1_avg

            # Add scaled fluxes to RHS
            factor_i = alpha * derivative_split[i, ii]
            du_permuted[jk, i, 1] += factor_i * f1
            du_permuted[jk, i, 2] += factor_i * f2
            du_permuted[jk, i, 3] += factor_i * f3
            du_permuted[jk, i, 4] += factor_i * f4
            du_permuted[jk, i, 5] += factor_i * f5

            factor_ii = alpha * derivative_split[ii, i]
            du_permuted[jk, ii, 1] += factor_ii * f1
            du_permuted[jk, ii, 2] += factor_ii * f2
            du_permuted[jk, ii, 3] += factor_ii * f3
            du_permuted[jk, ii, 4] += factor_ii * f4
            du_permuted[jk, ii, 5] += factor_ii * f5
        end
    end

    @turbo for v in eachvariable(equations),
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        jk = j + nnodes(dg) * (k - 1)
        du[i, j, k, v] = du_permuted[jk, i, v]
    end

    # y direction
    # A possible permutation of array dimensions with improved opportunities for
    # SIMD vectorization appeared to be slower than the direct version used here
    # in preliminary numerical experiments on an AVX2 system.
    for j in eachnode(dg), jj in (j + 1):nnodes(dg)
        @turbo for k in eachnode(dg), i in eachnode(dg)
            rho_ll = u_prim[i, j, k, 1]
            v1_ll = u_prim[i, j, k, 2]
            v2_ll = u_prim[i, j, k, 3]
            v3_ll = u_prim[i, j, k, 4]
            p_ll = u_prim[i, j, k, 5]

            rho_rr = u_prim[i, jj, k, 1]
            v1_rr = u_prim[i, jj, k, 2]
            v2_rr = u_prim[i, jj, k, 3]
            v3_rr = u_prim[i, jj, k, 4]
            p_rr = u_prim[i, jj, k, 5]

            # Compute required mean values
            rho_avg = 0.5 * (rho_ll + rho_rr)
            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            v3_avg = 0.5 * (v3_ll + v3_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
            pv2_avg = 0.5 * (p_ll * v2_rr + p_rr * v2_ll)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_avg * v2_avg
            f2 = f1 * v1_avg
            f3 = f1 * v2_avg + p_avg
            f4 = f1 * v3_avg
            f5 = p_avg * v2_avg * equations.inv_gamma_minus_one + f1 * kin_avg + pv2_avg

            # Add scaled fluxes to RHS
            factor_j = alpha * derivative_split[j, jj]
            du[i, j, k, 1] += factor_j * f1
            du[i, j, k, 2] += factor_j * f2
            du[i, j, k, 3] += factor_j * f3
            du[i, j, k, 4] += factor_j * f4
            du[i, j, k, 5] += factor_j * f5

            factor_jj = alpha * derivative_split[jj, j]
            du[i, jj, k, 1] += factor_jj * f1
            du[i, jj, k, 2] += factor_jj * f2
            du[i, jj, k, 3] += factor_jj * f3
            du[i, jj, k, 4] += factor_jj * f4
            du[i, jj, k, 5] += factor_jj * f5
        end
    end

    # z direction
    # The memory layout is already optimal for SIMD vectorization in this loop.
    # We just squeeze the first two dimensions to make the code slightly faster.
    GC.@preserve u_prim begin
        u_prim_reshaped = PtrArray(pointer(u_prim),
                                   (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
                                    StaticInt(nvariables(equations))))

        du_reshaped = PtrArray(pointer(du),
                               (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
                                StaticInt(nvariables(equations))))

        for k in eachnode(dg), kk in (k + 1):nnodes(dg)
            @turbo for ij in Base.OneTo(nnodes(dg)^2)
                rho_ll = u_prim_reshaped[ij, k, 1]
                v1_ll = u_prim_reshaped[ij, k, 2]
                v2_ll = u_prim_reshaped[ij, k, 3]
                v3_ll = u_prim_reshaped[ij, k, 4]
                p_ll = u_prim_reshaped[ij, k, 5]

                rho_rr = u_prim_reshaped[ij, kk, 1]
                v1_rr = u_prim_reshaped[ij, kk, 2]
                v2_rr = u_prim_reshaped[ij, kk, 3]
                v3_rr = u_prim_reshaped[ij, kk, 4]
                p_rr = u_prim_reshaped[ij, kk, 5]

                # Compute required mean values
                rho_avg = 0.5 * (rho_ll + rho_rr)
                v1_avg = 0.5 * (v1_ll + v1_rr)
                v2_avg = 0.5 * (v2_ll + v2_rr)
                v3_avg = 0.5 * (v3_ll + v3_rr)
                p_avg = 0.5 * (p_ll + p_rr)
                kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
                pv3_avg = 0.5 * (p_ll * v3_rr + p_rr * v3_ll)

                # Calculate fluxes depending on Cartesian orientation
                f1 = rho_avg * v3_avg
                f2 = f1 * v1_avg
                f3 = f1 * v2_avg
                f4 = f1 * v3_avg + p_avg
                f5 = p_avg * v3_avg * equations.inv_gamma_minus_one + f1 * kin_avg + pv3_avg

                # Add scaled fluxes to RHS
                factor_k = alpha * derivative_split[k, kk]
                du_reshaped[ij, k, 1] += factor_k * f1
                du_reshaped[ij, k, 2] += factor_k * f2
                du_reshaped[ij, k, 3] += factor_k * f3
                du_reshaped[ij, k, 4] += factor_k * f4
                du_reshaped[ij, k, 5] += factor_k * f5

                factor_kk = alpha * derivative_split[kk, k]
                du_reshaped[ij, kk, 1] += factor_kk * f1
                du_reshaped[ij, kk, 2] += factor_kk * f2
                du_reshaped[ij, kk, 3] += factor_kk * f3
                du_reshaped[ij, kk, 4] += factor_kk * f4
                du_reshaped[ij, kk, 5] += factor_kk * f5
            end
        end
    end # GC.@preserve u_prim

    # Finally, we add the temporary RHS computed here to the global RHS in the
    # given `element`.
    @turbo for v in eachvariable(equations),
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        _du[v, i, j, k, element] += du[i, j, k, v]
    end
end

@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
                                           element, MeshT::Type{<:TreeMesh{3}},
                                           have_nonconservative_terms::False,
                                           equations::CompressibleEulerEquations3D,
                                           volume_flux::typeof(flux_ranocha_turbo),
                                           dg::DGSEM, cache, alpha)
    @unpack derivative_split = dg.basis

    # Create a temporary array that will be used to store the RHS with permuted
    # indices `[i, j, k, v]` to allow using SIMD instructions.
    # `StrideArray`s with purely static dimensions do not allocate on the heap.
    du = StrideArray{eltype(u_cons)}(undef,
                                     (ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,
                                      StaticInt(nvariables(equations))))

    # Convert conserved to primitive variables on the given `element`. In addition
    # to the usual primitive variables, we also compute logarithms of the density
    # and pressure to increase the performance of the required logarithmic mean
    # values.
    u_prim = StrideArray{eltype(u_cons)}(undef,
                                         (ntuple(_ -> StaticInt(nnodes(dg)),
                                                 ndims(MeshT))...,
                                          StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs

    @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
        rho = u_cons[1, i, j, k, element]
        rho_v1 = u_cons[2, i, j, k, element]
        rho_v2 = u_cons[3, i, j, k, element]
        rho_v3 = u_cons[4, i, j, k, element]
        rho_e_total = u_cons[5, i, j, k, element]

        v1 = rho_v1 / rho
        v2 = rho_v2 / rho
        v3 = rho_v3 / rho
        p = (equations.gamma - 1) *
            (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))

        u_prim[i, j, k, 1] = rho
        u_prim[i, j, k, 2] = v1
        u_prim[i, j, k, 3] = v2
        u_prim[i, j, k, 4] = v3
        u_prim[i, j, k, 5] = p
        u_prim[i, j, k, 6] = log(rho)
        u_prim[i, j, k, 7] = log(p)
    end

    # x direction
    # At first, we create new temporary arrays with permuted memory layout to
    # allow using SIMD instructions along the first dimension (which is contiguous
    # in memory).
    du_permuted = StrideArray{eltype(u_cons)}(undef,
                                              (StaticInt(nnodes(dg)^2),
                                               StaticInt(nnodes(dg)),
                                               StaticInt(nvariables(equations))))

    u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
                                                  (StaticInt(nnodes(dg)^2),
                                                   StaticInt(nnodes(dg)),
                                                   StaticInt(nvariables(equations) + 2)))

    @turbo for v in indices(u_prim, 4), # v in eachvariable(equations) misses +2 logs
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        jk = j + nnodes(dg) * (k - 1)
        u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
    end
    fill!(du_permuted, zero(eltype(du_permuted)))

    # Next, we basically inline the volume flux. To allow SIMD vectorization and
    # still use the symmetry of the volume flux and the derivative matrix, we
    # loop over the triangular part in an outer loop and use a plain inner loop.
    for i in eachnode(dg), ii in (i + 1):nnodes(dg)
        @turbo for jk in Base.OneTo(nnodes(dg)^2)
            rho_ll = u_prim_permuted[jk, i, 1]
            v1_ll = u_prim_permuted[jk, i, 2]
            v2_ll = u_prim_permuted[jk, i, 3]
            v3_ll = u_prim_permuted[jk, i, 4]
            p_ll = u_prim_permuted[jk, i, 5]
            log_rho_ll = u_prim_permuted[jk, i, 6]
            log_p_ll = u_prim_permuted[jk, i, 7]

            rho_rr = u_prim_permuted[jk, ii, 1]
            v1_rr = u_prim_permuted[jk, ii, 2]
            v2_rr = u_prim_permuted[jk, ii, 3]
            v3_rr = u_prim_permuted[jk, ii, 4]
            p_rr = u_prim_permuted[jk, ii, 5]
            log_rho_rr = u_prim_permuted[jk, ii, 6]
            log_p_rr = u_prim_permuted[jk, ii, 7]

            # Compute required mean values
            # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
            # it efficiently. This is equivalent to
            #   rho_mean = ln_mean(rho_ll, rho_rr)
            x1 = rho_ll
            log_x1 = log_rho_ll
            y1 = rho_rr
            log_y1 = log_rho_rr
            x1_plus_y1 = x1 + y1
            y1_minus_x1 = y1 - x1
            z1 = y1_minus_x1^2 / x1_plus_y1^2
            special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
            regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
            rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)

            # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
            # in exact arithmetic since
            #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
            #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
            #   inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
            x2 = rho_ll * p_rr
            log_x2 = log_rho_ll + log_p_rr
            y2 = rho_rr * p_ll
            log_y2 = log_rho_rr + log_p_ll
            x2_plus_y2 = x2 + y2
            y2_minus_x2 = y2 - x2
            z2 = y2_minus_x2^2 / x2_plus_y2^2
            special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
            regular_path2 = (log_y2 - log_x2) / y2_minus_x2
            inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)

            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            v3_avg = 0.5 * (v3_ll + v3_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_mean * v1_avg
            f2 = f1 * v1_avg + p_avg
            f3 = f1 * v2_avg
            f4 = f1 * v3_avg
            f5 = f1 *
                 (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +
                 0.5 * (p_ll * v1_rr + p_rr * v1_ll)

            # Add scaled fluxes to RHS
            factor_i = alpha * derivative_split[i, ii]
            du_permuted[jk, i, 1] += factor_i * f1
            du_permuted[jk, i, 2] += factor_i * f2
            du_permuted[jk, i, 3] += factor_i * f3
            du_permuted[jk, i, 4] += factor_i * f4
            du_permuted[jk, i, 5] += factor_i * f5

            factor_ii = alpha * derivative_split[ii, i]
            du_permuted[jk, ii, 1] += factor_ii * f1
            du_permuted[jk, ii, 2] += factor_ii * f2
            du_permuted[jk, ii, 3] += factor_ii * f3
            du_permuted[jk, ii, 4] += factor_ii * f4
            du_permuted[jk, ii, 5] += factor_ii * f5
        end
    end

    @turbo for v in eachvariable(equations),
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        jk = j + nnodes(dg) * (k - 1)
        du[i, j, k, v] = du_permuted[jk, i, v]
    end

    # y direction
    # A possible permutation of array dimensions with improved opportunities for
    # SIMD vectorization appeared to be slower than the direct version used here
    # in preliminary numerical experiments on an AVX2 system.
    for j in eachnode(dg), jj in (j + 1):nnodes(dg)
        @turbo for k in eachnode(dg), i in eachnode(dg)
            rho_ll = u_prim[i, j, k, 1]
            v1_ll = u_prim[i, j, k, 2]
            v2_ll = u_prim[i, j, k, 3]
            v3_ll = u_prim[i, j, k, 4]
            p_ll = u_prim[i, j, k, 5]
            log_rho_ll = u_prim[i, j, k, 6]
            log_p_ll = u_prim[i, j, k, 7]

            rho_rr = u_prim[i, jj, k, 1]
            v1_rr = u_prim[i, jj, k, 2]
            v2_rr = u_prim[i, jj, k, 3]
            v3_rr = u_prim[i, jj, k, 4]
            p_rr = u_prim[i, jj, k, 5]
            log_rho_rr = u_prim[i, jj, k, 6]
            log_p_rr = u_prim[i, jj, k, 7]

            # Compute required mean values
            # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
            # it efficiently. This is equivalent to
            #   rho_mean = ln_mean(rho_ll, rho_rr)
            x1 = rho_ll
            log_x1 = log_rho_ll
            y1 = rho_rr
            log_y1 = log_rho_rr
            x1_plus_y1 = x1 + y1
            y1_minus_x1 = y1 - x1
            z1 = y1_minus_x1^2 / x1_plus_y1^2
            special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
            regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
            rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)

            # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
            # in exact arithmetic since
            #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
            #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
            #   inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
            x2 = rho_ll * p_rr
            log_x2 = log_rho_ll + log_p_rr
            y2 = rho_rr * p_ll
            log_y2 = log_rho_rr + log_p_ll
            x2_plus_y2 = x2 + y2
            y2_minus_x2 = y2 - x2
            z2 = y2_minus_x2^2 / x2_plus_y2^2
            special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
            regular_path2 = (log_y2 - log_x2) / y2_minus_x2
            inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)

            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            v3_avg = 0.5 * (v3_ll + v3_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_mean * v2_avg
            f2 = f1 * v1_avg
            f3 = f1 * v2_avg + p_avg
            f4 = f1 * v3_avg
            f5 = f1 *
                 (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +
                 0.5 * (p_ll * v2_rr + p_rr * v2_ll)

            # Add scaled fluxes to RHS
            factor_j = alpha * derivative_split[j, jj]
            du[i, j, k, 1] += factor_j * f1
            du[i, j, k, 2] += factor_j * f2
            du[i, j, k, 3] += factor_j * f3
            du[i, j, k, 4] += factor_j * f4
            du[i, j, k, 5] += factor_j * f5

            factor_jj = alpha * derivative_split[jj, j]
            du[i, jj, k, 1] += factor_jj * f1
            du[i, jj, k, 2] += factor_jj * f2
            du[i, jj, k, 3] += factor_jj * f3
            du[i, jj, k, 4] += factor_jj * f4
            du[i, jj, k, 5] += factor_jj * f5
        end
    end

    # z direction
    # The memory layout is already optimal for SIMD vectorization in this loop.
    # We just squeeze the first two dimensions to make the code slightly faster.
    GC.@preserve u_prim begin
        u_prim_reshaped = PtrArray(pointer(u_prim),
                                   (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
                                    StaticInt(nvariables(equations) + 2)))

        du_reshaped = PtrArray(pointer(du),
                               (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
                                StaticInt(nvariables(equations))))

        for k in eachnode(dg), kk in (k + 1):nnodes(dg)
            @turbo for ij in Base.OneTo(nnodes(dg)^2)
                rho_ll = u_prim_reshaped[ij, k, 1]
                v1_ll = u_prim_reshaped[ij, k, 2]
                v2_ll = u_prim_reshaped[ij, k, 3]
                v3_ll = u_prim_reshaped[ij, k, 4]
                p_ll = u_prim_reshaped[ij, k, 5]
                log_rho_ll = u_prim_reshaped[ij, k, 6]
                log_p_ll = u_prim_reshaped[ij, k, 7]

                rho_rr = u_prim_reshaped[ij, kk, 1]
                v1_rr = u_prim_reshaped[ij, kk, 2]
                v2_rr = u_prim_reshaped[ij, kk, 3]
                v3_rr = u_prim_reshaped[ij, kk, 4]
                p_rr = u_prim_reshaped[ij, kk, 5]
                log_rho_rr = u_prim_reshaped[ij, kk, 6]
                log_p_rr = u_prim_reshaped[ij, kk, 7]

                # Compute required mean values
                # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
                # it efficiently. This is equivalent to
                #   rho_mean = ln_mean(rho_ll, rho_rr)
                x1 = rho_ll
                log_x1 = log_rho_ll
                y1 = rho_rr
                log_y1 = log_rho_rr
                x1_plus_y1 = x1 + y1
                y1_minus_x1 = y1 - x1
                z1 = y1_minus_x1^2 / x1_plus_y1^2
                special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
                regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
                rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)

                # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
                # in exact arithmetic since
                #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
                #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
                #   inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
                x2 = rho_ll * p_rr
                log_x2 = log_rho_ll + log_p_rr
                y2 = rho_rr * p_ll
                log_y2 = log_rho_rr + log_p_ll
                x2_plus_y2 = x2 + y2
                y2_minus_x2 = y2 - x2
                z2 = y2_minus_x2^2 / x2_plus_y2^2
                special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
                regular_path2 = (log_y2 - log_x2) / y2_minus_x2
                inv_rho_p_mean = p_ll * p_rr *
                                 ifelse(z2 < 1.0e-4, special_path2, regular_path2)

                v1_avg = 0.5 * (v1_ll + v1_rr)
                v2_avg = 0.5 * (v2_ll + v2_rr)
                v3_avg = 0.5 * (v3_ll + v3_rr)
                p_avg = 0.5 * (p_ll + p_rr)
                velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)

                # Calculate fluxes depending on Cartesian orientation
                f1 = rho_mean * v3_avg
                f2 = f1 * v1_avg
                f3 = f1 * v2_avg
                f4 = f1 * v3_avg + p_avg
                f5 = f1 *
                     (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +
                     0.5 * (p_ll * v3_rr + p_rr * v3_ll)

                # Add scaled fluxes to RHS
                factor_k = alpha * derivative_split[k, kk]
                du_reshaped[ij, k, 1] += factor_k * f1
                du_reshaped[ij, k, 2] += factor_k * f2
                du_reshaped[ij, k, 3] += factor_k * f3
                du_reshaped[ij, k, 4] += factor_k * f4
                du_reshaped[ij, k, 5] += factor_k * f5

                factor_kk = alpha * derivative_split[kk, k]
                du_reshaped[ij, kk, 1] += factor_kk * f1
                du_reshaped[ij, kk, 2] += factor_kk * f2
                du_reshaped[ij, kk, 3] += factor_kk * f3
                du_reshaped[ij, kk, 4] += factor_kk * f4
                du_reshaped[ij, kk, 5] += factor_kk * f5
            end
        end
    end # GC.@preserve u_prim

    # Finally, we add the temporary RHS computed here to the global RHS in the
    # given `element`.
    @turbo for v in eachvariable(equations),
               k in eachnode(dg),
               j in eachnode(dg),
               i in eachnode(dg)

        _du[v, i, j, k, element] += du[i, j, k, v]
    end
end