Path: blob/main/src/solvers/dgsem_structured/dg_3d_compressible_euler.jl
5590 views
1# From here on, this file contains specializations of DG methods on the2# curved 3D meshes `StructuredMesh{3}, P4estMesh{3}` to the compressible3# Euler equations.4#5# The specialized methods contain relatively verbose and ugly code in the sense6# that we do not use the common "pointwise physics" interface of Trixi.jl.7# However, this is currently (November 2021) necessary to get a significant8# speed-up by using SIMD instructions via LoopVectorization.jl.9#10# TODO: SIMD. We could get even more speed-up if we did not need to permute11# array dimensions, i.e., if we changed the basic memory layout.12#13# We do not wrap this code in `@muladd begin ... end` block. Optimizations like14# this are handled automatically by LoopVectorization.jl.1516# We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array`17# if LoopVectorization.jl can handle the array types. This ensures that `@turbo`18# works efficiently here.19@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,20element,21MeshT::Type{<:Union{StructuredMesh{3},22P4estMesh{3}}},23have_nonconservative_terms::False,24equations::CompressibleEulerEquations3D,25volume_flux::typeof(flux_shima_etal_turbo),26dg::DGSEM, cache, alpha)27@unpack derivative_split = dg.basis28@unpack contravariant_vectors = cache.elements2930# Create a temporary array that will be used to store the RHS with permuted31# indices `[i, j, k, v]` to allow using SIMD instructions.32# `StrideArray`s with purely static dimensions do not allocate on the heap.33du = StrideArray{eltype(u_cons)}(undef,34(ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,35StaticInt(nvariables(equations))))3637# Convert conserved to primitive variables on the given `element`.38u_prim = StrideArray{eltype(u_cons)}(undef,39(ntuple(_ -> StaticInt(nnodes(dg)),40ndims(MeshT))...,41StaticInt(nvariables(equations))))4243@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)44rho = u_cons[1, i, j, k, element]45rho_v1 = u_cons[2, i, j, k, element]46rho_v2 = u_cons[3, i, j, k, element]47rho_v3 = u_cons[4, i, j, k, element]48rho_e_total = u_cons[5, i, j, k, element]4950v1 = rho_v1 / rho51v2 = rho_v2 / rho52v3 = rho_v3 / rho53p = (equations.gamma - 1) *54(rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))5556u_prim[i, j, k, 1] = rho57u_prim[i, j, k, 2] = v158u_prim[i, j, k, 3] = v259u_prim[i, j, k, 4] = v360u_prim[i, j, k, 5] = p61end6263# x direction64# At first, we create new temporary arrays with permuted memory layout to65# allow using SIMD instructions along the first dimension (which is contiguous66# in memory).67du_permuted = StrideArray{eltype(u_cons)}(undef,68(StaticInt(nnodes(dg)^2),69StaticInt(nnodes(dg)),70StaticInt(nvariables(equations))))7172u_prim_permuted = StrideArray{eltype(u_cons)}(undef,73(StaticInt(nnodes(dg)^2),74StaticInt(nnodes(dg)),75StaticInt(nvariables(equations))))7677@turbo for v in eachvariable(equations),78k in eachnode(dg),79j in eachnode(dg),80i in eachnode(dg)8182jk = j + nnodes(dg) * (k - 1)83u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]84end85fill!(du_permuted, zero(eltype(du_permuted)))8687# We must also permute the contravariant vectors.88contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,89(StaticInt(nnodes(dg)^2),90StaticInt(nnodes(dg)),91StaticInt(ndims(MeshT))))9293@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)94jk = j + nnodes(dg) * (k - 1)95contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k, element]96contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k, element]97contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k, element]98end99100# Next, we basically inline the volume flux. To allow SIMD vectorization and101# still use the symmetry of the volume flux and the derivative matrix, we102# loop over the triangular part in an outer loop and use a plain inner loop.103for i in eachnode(dg), ii in (i + 1):nnodes(dg)104@turbo for jk in Base.OneTo(nnodes(dg)^2)105rho_ll = u_prim_permuted[jk, i, 1]106v1_ll = u_prim_permuted[jk, i, 2]107v2_ll = u_prim_permuted[jk, i, 3]108v3_ll = u_prim_permuted[jk, i, 4]109p_ll = u_prim_permuted[jk, i, 5]110111rho_rr = u_prim_permuted[jk, ii, 1]112v1_rr = u_prim_permuted[jk, ii, 2]113v2_rr = u_prim_permuted[jk, ii, 3]114v3_rr = u_prim_permuted[jk, ii, 4]115p_rr = u_prim_permuted[jk, ii, 5]116117normal_direction_1 = 0.5 * (contravariant_vectors_x[jk, i, 1] +118contravariant_vectors_x[jk, ii, 1])119normal_direction_2 = 0.5 * (contravariant_vectors_x[jk, i, 2] +120contravariant_vectors_x[jk, ii, 2])121normal_direction_3 = 0.5 * (contravariant_vectors_x[jk, i, 3] +122contravariant_vectors_x[jk, ii, 3])123124v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +125v3_ll * normal_direction_3126v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +127v3_rr * normal_direction_3128129# Compute required mean values130rho_avg = 0.5 * (rho_ll + rho_rr)131v1_avg = 0.5 * (v1_ll + v1_rr)132v2_avg = 0.5 * (v2_ll + v2_rr)133v3_avg = 0.5 * (v3_ll + v3_rr)134v_dot_n_avg = 0.5 * (v_dot_n_ll + v_dot_n_rr)135p_avg = 0.5 * (p_ll + p_rr)136velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)137138# Calculate fluxes depending on normal_direction139f1 = rho_avg * v_dot_n_avg140f2 = f1 * v1_avg + p_avg * normal_direction_1141f3 = f1 * v2_avg + p_avg * normal_direction_2142f4 = f1 * v3_avg + p_avg * normal_direction_3143f5 = (f1 * velocity_square_avg +144p_avg * v_dot_n_avg * equations.inv_gamma_minus_one145+ 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))146147# Add scaled fluxes to RHS148factor_i = alpha * derivative_split[i, ii]149du_permuted[jk, i, 1] += factor_i * f1150du_permuted[jk, i, 2] += factor_i * f2151du_permuted[jk, i, 3] += factor_i * f3152du_permuted[jk, i, 4] += factor_i * f4153du_permuted[jk, i, 5] += factor_i * f5154155factor_ii = alpha * derivative_split[ii, i]156du_permuted[jk, ii, 1] += factor_ii * f1157du_permuted[jk, ii, 2] += factor_ii * f2158du_permuted[jk, ii, 3] += factor_ii * f3159du_permuted[jk, ii, 4] += factor_ii * f4160du_permuted[jk, ii, 5] += factor_ii * f5161end162end163164@turbo for v in eachvariable(equations),165k in eachnode(dg),166j in eachnode(dg),167i in eachnode(dg)168169jk = j + nnodes(dg) * (k - 1)170du[i, j, k, v] = du_permuted[jk, i, v]171end172173# y direction174# We must also permute the contravariant vectors.175contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,176(StaticInt(nnodes(dg)),177StaticInt(nnodes(dg)),178StaticInt(nnodes(dg)),179StaticInt(ndims(MeshT))))180181@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)182contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, element]183contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k, element]184contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k, element]185end186187# A possible permutation of array dimensions with improved opportunities for188# SIMD vectorization appeared to be slower than the direct version used here189# in preliminary numerical experiments on an AVX2 system.190for j in eachnode(dg), jj in (j + 1):nnodes(dg)191@turbo for k in eachnode(dg), i in eachnode(dg)192rho_ll = u_prim[i, j, k, 1]193v1_ll = u_prim[i, j, k, 2]194v2_ll = u_prim[i, j, k, 3]195v3_ll = u_prim[i, j, k, 4]196p_ll = u_prim[i, j, k, 5]197198rho_rr = u_prim[i, jj, k, 1]199v1_rr = u_prim[i, jj, k, 2]200v2_rr = u_prim[i, jj, k, 3]201v3_rr = u_prim[i, jj, k, 4]202p_rr = u_prim[i, jj, k, 5]203204normal_direction_1 = 0.5 * (contravariant_vectors_y[i, j, k, 1] +205contravariant_vectors_y[i, jj, k, 1])206normal_direction_2 = 0.5 * (contravariant_vectors_y[i, j, k, 2] +207contravariant_vectors_y[i, jj, k, 2])208normal_direction_3 = 0.5 * (contravariant_vectors_y[i, j, k, 3] +209contravariant_vectors_y[i, jj, k, 3])210211v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +212v3_ll * normal_direction_3213v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +214v3_rr * normal_direction_3215216# Compute required mean values217rho_avg = 0.5 * (rho_ll + rho_rr)218v1_avg = 0.5 * (v1_ll + v1_rr)219v2_avg = 0.5 * (v2_ll + v2_rr)220v3_avg = 0.5 * (v3_ll + v3_rr)221v_dot_n_avg = 0.5 * (v_dot_n_ll + v_dot_n_rr)222p_avg = 0.5 * (p_ll + p_rr)223velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)224225# Calculate fluxes depending on normal_direction226f1 = rho_avg * v_dot_n_avg227f2 = f1 * v1_avg + p_avg * normal_direction_1228f3 = f1 * v2_avg + p_avg * normal_direction_2229f4 = f1 * v3_avg + p_avg * normal_direction_3230f5 = (f1 * velocity_square_avg +231p_avg * v_dot_n_avg * equations.inv_gamma_minus_one232+ 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))233234# Add scaled fluxes to RHS235factor_j = alpha * derivative_split[j, jj]236du[i, j, k, 1] += factor_j * f1237du[i, j, k, 2] += factor_j * f2238du[i, j, k, 3] += factor_j * f3239du[i, j, k, 4] += factor_j * f4240du[i, j, k, 5] += factor_j * f5241242factor_jj = alpha * derivative_split[jj, j]243du[i, jj, k, 1] += factor_jj * f1244du[i, jj, k, 2] += factor_jj * f2245du[i, jj, k, 3] += factor_jj * f3246du[i, jj, k, 4] += factor_jj * f4247du[i, jj, k, 5] += factor_jj * f5248end249end250251# z direction252# The memory layout is already optimal for SIMD vectorization in this loop.253# We just squeeze the first two dimensions to make the code slightly faster.254GC.@preserve u_prim begin255u_prim_reshaped = PtrArray(pointer(u_prim),256(StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),257StaticInt(nvariables(equations))))258259du_reshaped = PtrArray(pointer(du),260(StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),261StaticInt(nvariables(equations))))262263# We must also permute the contravariant vectors.264contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef,265(StaticInt(nnodes(dg)^2),266StaticInt(nnodes(dg)),267StaticInt(ndims(MeshT))))268269@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)270ij = i + nnodes(dg) * (j - 1)271contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k,272element]273contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k,274element]275contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k,276element]277end278279for k in eachnode(dg), kk in (k + 1):nnodes(dg)280@turbo for ij in Base.OneTo(nnodes(dg)^2)281rho_ll = u_prim_reshaped[ij, k, 1]282v1_ll = u_prim_reshaped[ij, k, 2]283v2_ll = u_prim_reshaped[ij, k, 3]284v3_ll = u_prim_reshaped[ij, k, 4]285p_ll = u_prim_reshaped[ij, k, 5]286287rho_rr = u_prim_reshaped[ij, kk, 1]288v1_rr = u_prim_reshaped[ij, kk, 2]289v2_rr = u_prim_reshaped[ij, kk, 3]290v3_rr = u_prim_reshaped[ij, kk, 4]291p_rr = u_prim_reshaped[ij, kk, 5]292293normal_direction_1 = 0.5 * (contravariant_vectors_z[ij, k, 1] +294contravariant_vectors_z[ij, kk, 1])295normal_direction_2 = 0.5 * (contravariant_vectors_z[ij, k, 2] +296contravariant_vectors_z[ij, kk, 2])297normal_direction_3 = 0.5 * (contravariant_vectors_z[ij, k, 3] +298contravariant_vectors_z[ij, kk, 3])299300v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +301v3_ll * normal_direction_3302v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +303v3_rr * normal_direction_3304305# Compute required mean values306rho_avg = 0.5 * (rho_ll + rho_rr)307v1_avg = 0.5 * (v1_ll + v1_rr)308v2_avg = 0.5 * (v2_ll + v2_rr)309v3_avg = 0.5 * (v3_ll + v3_rr)310v_dot_n_avg = 0.5 * (v_dot_n_ll + v_dot_n_rr)311p_avg = 0.5 * (p_ll + p_rr)312velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)313314# Calculate fluxes depending on normal_direction315f1 = rho_avg * v_dot_n_avg316f2 = f1 * v1_avg + p_avg * normal_direction_1317f3 = f1 * v2_avg + p_avg * normal_direction_2318f4 = f1 * v3_avg + p_avg * normal_direction_3319f5 = (f1 * velocity_square_avg +320p_avg * v_dot_n_avg * equations.inv_gamma_minus_one321+ 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))322323# Add scaled fluxes to RHS324factor_k = alpha * derivative_split[k, kk]325du_reshaped[ij, k, 1] += factor_k * f1326du_reshaped[ij, k, 2] += factor_k * f2327du_reshaped[ij, k, 3] += factor_k * f3328du_reshaped[ij, k, 4] += factor_k * f4329du_reshaped[ij, k, 5] += factor_k * f5330331factor_kk = alpha * derivative_split[kk, k]332du_reshaped[ij, kk, 1] += factor_kk * f1333du_reshaped[ij, kk, 2] += factor_kk * f2334du_reshaped[ij, kk, 3] += factor_kk * f3335du_reshaped[ij, kk, 4] += factor_kk * f4336du_reshaped[ij, kk, 5] += factor_kk * f5337end338end339end # GC.@preserve u_prim begin340341# Finally, we add the temporary RHS computed here to the global RHS in the342# given `element`.343@turbo for v in eachvariable(equations),344k in eachnode(dg),345j in eachnode(dg),346i in eachnode(dg)347348_du[v, i, j, k, element] += du[i, j, k, v]349end350end351352@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,353element,354MeshT::Type{<:Union{StructuredMesh{3},355P4estMesh{3}}},356have_nonconservative_terms::False,357equations::CompressibleEulerEquations3D,358volume_flux::typeof(flux_ranocha_turbo),359dg::DGSEM, cache, alpha)360@unpack derivative_split = dg.basis361@unpack contravariant_vectors = cache.elements362363# Create a temporary array that will be used to store the RHS with permuted364# indices `[i, j, k, v]` to allow using SIMD instructions.365# `StrideArray`s with purely static dimensions do not allocate on the heap.366du = StrideArray{eltype(u_cons)}(undef,367(ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,368StaticInt(nvariables(equations))))369370# Convert conserved to primitive variables on the given `element`. In addition371# to the usual primitive variables, we also compute logarithms of the density372# and pressure to increase the performance of the required logarithmic mean373# values.374u_prim = StrideArray{eltype(u_cons)}(undef,375(ntuple(_ -> StaticInt(nnodes(dg)),376ndims(MeshT))...,377StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs378379@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)380rho = u_cons[1, i, j, k, element]381rho_v1 = u_cons[2, i, j, k, element]382rho_v2 = u_cons[3, i, j, k, element]383rho_v3 = u_cons[4, i, j, k, element]384rho_e_total = u_cons[5, i, j, k, element]385386v1 = rho_v1 / rho387v2 = rho_v2 / rho388v3 = rho_v3 / rho389p = (equations.gamma - 1) *390(rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))391392u_prim[i, j, k, 1] = rho393u_prim[i, j, k, 2] = v1394u_prim[i, j, k, 3] = v2395u_prim[i, j, k, 4] = v3396u_prim[i, j, k, 5] = p397u_prim[i, j, k, 6] = log(rho)398u_prim[i, j, k, 7] = log(p)399end400401# x direction402# At first, we create new temporary arrays with permuted memory layout to403# allow using SIMD instructions along the first dimension (which is contiguous404# in memory).405du_permuted = StrideArray{eltype(u_cons)}(undef,406(StaticInt(nnodes(dg)^2),407StaticInt(nnodes(dg)),408StaticInt(nvariables(equations))))409410u_prim_permuted = StrideArray{eltype(u_cons)}(undef,411(StaticInt(nnodes(dg)^2),412StaticInt(nnodes(dg)),413StaticInt(nvariables(equations) + 2)))414415@turbo for v in indices(u_prim, 4), # v in eachvariable(equations) misses +2 logs416k in eachnode(dg),417j in eachnode(dg),418i in eachnode(dg)419420jk = j + nnodes(dg) * (k - 1)421u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]422end423fill!(du_permuted, zero(eltype(du_permuted)))424425# We must also permute the contravariant vectors.426contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,427(StaticInt(nnodes(dg)^2),428StaticInt(nnodes(dg)),429StaticInt(ndims(MeshT))))430431@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)432jk = j + nnodes(dg) * (k - 1)433contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k, element]434contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k, element]435contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k, element]436end437438# Next, we basically inline the volume flux. To allow SIMD vectorization and439# still use the symmetry of the volume flux and the derivative matrix, we440# loop over the triangular part in an outer loop and use a plain inner loop.441for i in eachnode(dg), ii in (i + 1):nnodes(dg)442@turbo for jk in Base.OneTo(nnodes(dg)^2)443rho_ll = u_prim_permuted[jk, i, 1]444v1_ll = u_prim_permuted[jk, i, 2]445v2_ll = u_prim_permuted[jk, i, 3]446v3_ll = u_prim_permuted[jk, i, 4]447p_ll = u_prim_permuted[jk, i, 5]448log_rho_ll = u_prim_permuted[jk, i, 6]449log_p_ll = u_prim_permuted[jk, i, 7]450451rho_rr = u_prim_permuted[jk, ii, 1]452v1_rr = u_prim_permuted[jk, ii, 2]453v2_rr = u_prim_permuted[jk, ii, 3]454v3_rr = u_prim_permuted[jk, ii, 4]455p_rr = u_prim_permuted[jk, ii, 5]456log_rho_rr = u_prim_permuted[jk, ii, 6]457log_p_rr = u_prim_permuted[jk, ii, 7]458459normal_direction_1 = 0.5 * (contravariant_vectors_x[jk, i, 1] +460contravariant_vectors_x[jk, ii, 1])461normal_direction_2 = 0.5 * (contravariant_vectors_x[jk, i, 2] +462contravariant_vectors_x[jk, ii, 2])463normal_direction_3 = 0.5 * (contravariant_vectors_x[jk, i, 3] +464contravariant_vectors_x[jk, ii, 3])465466v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +467v3_ll * normal_direction_3468v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +469v3_rr * normal_direction_3470471# Compute required mean values472# We inline the logarithmic mean to allow LoopVectorization.jl to optimize473# it efficiently. This is equivalent to474# rho_mean = ln_mean(rho_ll, rho_rr)475x1 = rho_ll476log_x1 = log_rho_ll477y1 = rho_rr478log_y1 = log_rho_rr479x1_plus_y1 = x1 + y1480y1_minus_x1 = y1 - x1481z1 = y1_minus_x1^2 / x1_plus_y1^2482special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))483regular_path1 = y1_minus_x1 / (log_y1 - log_x1)484rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)485486# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`487# in exact arithmetic since488# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)489# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)490# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)491x2 = rho_ll * p_rr492log_x2 = log_rho_ll + log_p_rr493y2 = rho_rr * p_ll494log_y2 = log_rho_rr + log_p_ll495x2_plus_y2 = x2 + y2496y2_minus_x2 = y2 - x2497z2 = y2_minus_x2^2 / x2_plus_y2^2498special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2499regular_path2 = (log_y2 - log_x2) / y2_minus_x2500inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)501502v1_avg = 0.5 * (v1_ll + v1_rr)503v2_avg = 0.5 * (v2_ll + v2_rr)504v3_avg = 0.5 * (v3_ll + v3_rr)505p_avg = 0.5 * (p_ll + p_rr)506velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)507508# Calculate fluxes depending on normal_direction509f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)510f2 = f1 * v1_avg + p_avg * normal_direction_1511f3 = f1 * v2_avg + p_avg * normal_direction_2512f4 = f1 * v3_avg + p_avg * normal_direction_3513f5 = (f1 *514(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)515+5160.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))517518# Add scaled fluxes to RHS519factor_i = alpha * derivative_split[i, ii]520du_permuted[jk, i, 1] += factor_i * f1521du_permuted[jk, i, 2] += factor_i * f2522du_permuted[jk, i, 3] += factor_i * f3523du_permuted[jk, i, 4] += factor_i * f4524du_permuted[jk, i, 5] += factor_i * f5525526factor_ii = alpha * derivative_split[ii, i]527du_permuted[jk, ii, 1] += factor_ii * f1528du_permuted[jk, ii, 2] += factor_ii * f2529du_permuted[jk, ii, 3] += factor_ii * f3530du_permuted[jk, ii, 4] += factor_ii * f4531du_permuted[jk, ii, 5] += factor_ii * f5532end533end534535@turbo for v in eachvariable(equations),536k in eachnode(dg),537j in eachnode(dg),538i in eachnode(dg)539540jk = j + nnodes(dg) * (k - 1)541du[i, j, k, v] = du_permuted[jk, i, v]542end543544# y direction545# We must also permute the contravariant vectors.546contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,547(StaticInt(nnodes(dg)),548StaticInt(nnodes(dg)),549StaticInt(nnodes(dg)),550StaticInt(ndims(MeshT))))551552@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)553contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, element]554contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k, element]555contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k, element]556end557558# A possible permutation of array dimensions with improved opportunities for559# SIMD vectorization appeared to be slower than the direct version used here560# in preliminary numerical experiments on an AVX2 system.561for j in eachnode(dg), jj in (j + 1):nnodes(dg)562@turbo for k in eachnode(dg), i in eachnode(dg)563rho_ll = u_prim[i, j, k, 1]564v1_ll = u_prim[i, j, k, 2]565v2_ll = u_prim[i, j, k, 3]566v3_ll = u_prim[i, j, k, 4]567p_ll = u_prim[i, j, k, 5]568log_rho_ll = u_prim[i, j, k, 6]569log_p_ll = u_prim[i, j, k, 7]570571rho_rr = u_prim[i, jj, k, 1]572v1_rr = u_prim[i, jj, k, 2]573v2_rr = u_prim[i, jj, k, 3]574v3_rr = u_prim[i, jj, k, 4]575p_rr = u_prim[i, jj, k, 5]576log_rho_rr = u_prim[i, jj, k, 6]577log_p_rr = u_prim[i, jj, k, 7]578579normal_direction_1 = 0.5 * (contravariant_vectors_y[i, j, k, 1] +580contravariant_vectors_y[i, jj, k, 1])581normal_direction_2 = 0.5 * (contravariant_vectors_y[i, j, k, 2] +582contravariant_vectors_y[i, jj, k, 2])583normal_direction_3 = 0.5 * (contravariant_vectors_y[i, j, k, 3] +584contravariant_vectors_y[i, jj, k, 3])585586v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +587v3_ll * normal_direction_3588v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +589v3_rr * normal_direction_3590591# Compute required mean values592# We inline the logarithmic mean to allow LoopVectorization.jl to optimize593# it efficiently. This is equivalent to594# rho_mean = ln_mean(rho_ll, rho_rr)595x1 = rho_ll596log_x1 = log_rho_ll597y1 = rho_rr598log_y1 = log_rho_rr599x1_plus_y1 = x1 + y1600y1_minus_x1 = y1 - x1601z1 = y1_minus_x1^2 / x1_plus_y1^2602special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))603regular_path1 = y1_minus_x1 / (log_y1 - log_x1)604rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)605606# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`607# in exact arithmetic since608# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)609# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)610# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)611x2 = rho_ll * p_rr612log_x2 = log_rho_ll + log_p_rr613y2 = rho_rr * p_ll614log_y2 = log_rho_rr + log_p_ll615x2_plus_y2 = x2 + y2616y2_minus_x2 = y2 - x2617z2 = y2_minus_x2^2 / x2_plus_y2^2618special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2619regular_path2 = (log_y2 - log_x2) / y2_minus_x2620inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)621622v1_avg = 0.5 * (v1_ll + v1_rr)623v2_avg = 0.5 * (v2_ll + v2_rr)624v3_avg = 0.5 * (v3_ll + v3_rr)625p_avg = 0.5 * (p_ll + p_rr)626velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)627628# Calculate fluxes depending on normal_direction629f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)630f2 = f1 * v1_avg + p_avg * normal_direction_1631f3 = f1 * v2_avg + p_avg * normal_direction_2632f4 = f1 * v3_avg + p_avg * normal_direction_3633f5 = (f1 *634(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)635+6360.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))637638# Add scaled fluxes to RHS639factor_j = alpha * derivative_split[j, jj]640du[i, j, k, 1] += factor_j * f1641du[i, j, k, 2] += factor_j * f2642du[i, j, k, 3] += factor_j * f3643du[i, j, k, 4] += factor_j * f4644du[i, j, k, 5] += factor_j * f5645646factor_jj = alpha * derivative_split[jj, j]647du[i, jj, k, 1] += factor_jj * f1648du[i, jj, k, 2] += factor_jj * f2649du[i, jj, k, 3] += factor_jj * f3650du[i, jj, k, 4] += factor_jj * f4651du[i, jj, k, 5] += factor_jj * f5652end653end654655# z direction656# The memory layout is already optimal for SIMD vectorization in this loop.657# We just squeeze the first two dimensions to make the code slightly faster.658GC.@preserve u_prim begin659u_prim_reshaped = PtrArray(pointer(u_prim),660(StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),661StaticInt(nvariables(equations) + 2)))662663du_reshaped = PtrArray(pointer(du),664(StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),665StaticInt(nvariables(equations))))666667# We must also permute the contravariant vectors.668contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef,669(StaticInt(nnodes(dg)^2),670StaticInt(nnodes(dg)),671StaticInt(ndims(MeshT))))672673@turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)674ij = i + nnodes(dg) * (j - 1)675contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k,676element]677contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k,678element]679contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k,680element]681end682683for k in eachnode(dg), kk in (k + 1):nnodes(dg)684@turbo for ij in Base.OneTo(nnodes(dg)^2)685rho_ll = u_prim_reshaped[ij, k, 1]686v1_ll = u_prim_reshaped[ij, k, 2]687v2_ll = u_prim_reshaped[ij, k, 3]688v3_ll = u_prim_reshaped[ij, k, 4]689p_ll = u_prim_reshaped[ij, k, 5]690log_rho_ll = u_prim_reshaped[ij, k, 6]691log_p_ll = u_prim_reshaped[ij, k, 7]692693rho_rr = u_prim_reshaped[ij, kk, 1]694v1_rr = u_prim_reshaped[ij, kk, 2]695v2_rr = u_prim_reshaped[ij, kk, 3]696v3_rr = u_prim_reshaped[ij, kk, 4]697p_rr = u_prim_reshaped[ij, kk, 5]698log_rho_rr = u_prim_reshaped[ij, kk, 6]699log_p_rr = u_prim_reshaped[ij, kk, 7]700701normal_direction_1 = 0.5 * (contravariant_vectors_z[ij, k, 1] +702contravariant_vectors_z[ij, kk, 1])703normal_direction_2 = 0.5 * (contravariant_vectors_z[ij, k, 2] +704contravariant_vectors_z[ij, kk, 2])705normal_direction_3 = 0.5 * (contravariant_vectors_z[ij, k, 3] +706contravariant_vectors_z[ij, kk, 3])707708v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +709v3_ll * normal_direction_3710v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +711v3_rr * normal_direction_3712713# Compute required mean values714# We inline the logarithmic mean to allow LoopVectorization.jl to optimize715# it efficiently. This is equivalent to716# rho_mean = ln_mean(rho_ll, rho_rr)717x1 = rho_ll718log_x1 = log_rho_ll719y1 = rho_rr720log_y1 = log_rho_rr721x1_plus_y1 = x1 + y1722y1_minus_x1 = y1 - x1723z1 = y1_minus_x1^2 / x1_plus_y1^2724special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))725regular_path1 = y1_minus_x1 / (log_y1 - log_x1)726rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)727728# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`729# in exact arithmetic since730# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)731# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)732# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)733x2 = rho_ll * p_rr734log_x2 = log_rho_ll + log_p_rr735y2 = rho_rr * p_ll736log_y2 = log_rho_rr + log_p_ll737x2_plus_y2 = x2 + y2738y2_minus_x2 = y2 - x2739z2 = y2_minus_x2^2 / x2_plus_y2^2740special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2741regular_path2 = (log_y2 - log_x2) / y2_minus_x2742inv_rho_p_mean = p_ll * p_rr *743ifelse(z2 < 1.0e-4, special_path2, regular_path2)744745v1_avg = 0.5 * (v1_ll + v1_rr)746v2_avg = 0.5 * (v2_ll + v2_rr)747v3_avg = 0.5 * (v3_ll + v3_rr)748p_avg = 0.5 * (p_ll + p_rr)749velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)750751# Calculate fluxes depending on normal_direction752f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)753f2 = f1 * v1_avg + p_avg * normal_direction_1754f3 = f1 * v2_avg + p_avg * normal_direction_2755f4 = f1 * v3_avg + p_avg * normal_direction_3756f5 = (f1 *757(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)758+7590.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))760761# Add scaled fluxes to RHS762factor_k = alpha * derivative_split[k, kk]763du_reshaped[ij, k, 1] += factor_k * f1764du_reshaped[ij, k, 2] += factor_k * f2765du_reshaped[ij, k, 3] += factor_k * f3766du_reshaped[ij, k, 4] += factor_k * f4767du_reshaped[ij, k, 5] += factor_k * f5768769factor_kk = alpha * derivative_split[kk, k]770du_reshaped[ij, kk, 1] += factor_kk * f1771du_reshaped[ij, kk, 2] += factor_kk * f2772du_reshaped[ij, kk, 3] += factor_kk * f3773du_reshaped[ij, kk, 4] += factor_kk * f4774du_reshaped[ij, kk, 5] += factor_kk * f5775end776end777end # GC.@preserve u_prim begin778779# Finally, we add the temporary RHS computed here to the global RHS in the780# given `element`.781@turbo for v in eachvariable(equations),782k in eachnode(dg),783j in eachnode(dg),784i in eachnode(dg)785786_du[v, i, j, k, element] += du[i, j, k, v]787end788end789790791