# src/solvers/dgsem_tree/dg_2d_compressible_euler.jl
# By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
# Since these FMAs can increase the performance of many numerical algorithms,
# we need to opt-in explicitly.
# See https://ranocha.de/blog/Optimizing_EC_Trixi for further details.
@muladd begin
#! format: noindent

# Calculate the vorticity on a single node using the derivative matrix from the
# polynomial basis of a DGSEM solver. `u` is the solution on the whole domain.
# This function is used for calculating acoustic source terms for coupled
# Euler-acoustics simulations.
function calc_vorticity_node(u, mesh::TreeMesh{2},
                             equations::CompressibleEulerEquations2D,
                             dg::DGSEM, cache, i, j, element)
    @unpack derivative_matrix = dg.basis

    v2_x = zero(eltype(u)) # derivative of v2 in x direction
    for ii in eachnode(dg)
        rho, _, rho_v2 = get_node_vars(u, equations, dg, ii, j, element)
        v2 = rho_v2 / rho
        v2_x = v2_x + derivative_matrix[i, ii] * v2
    end

    v1_y = zero(eltype(u)) # derivative of v1 in y direction
    for jj in eachnode(dg)
        rho, rho_v1 = get_node_vars(u, equations, dg, i, jj, element)
        v1 = rho_v1 / rho
        v1_y = v1_y + derivative_matrix[j, jj] * v1
    end

    # Vorticity in 2D: dv2/dx - dv1/dy, mapped back from reference coordinates
    # via the (constant per element on a TreeMesh) inverse Jacobian.
    return (v2_x - v1_y) * cache.elements.inverse_jacobian[element]
end

# Convenience function for calculating the vorticity on the whole domain and
# storing it in a preallocated array
function calc_vorticity!(vorticity, u, mesh::TreeMesh{2},
                         equations::CompressibleEulerEquations2D,
                         dg::DGSEM, cache)
    @threaded for element in eachelement(dg, cache)
        for j in eachnode(dg), i in eachnode(dg)
            vorticity[i, j, element] = calc_vorticity_node(u, mesh, equations, dg,
                                                           cache, i, j, element)
        end
    end

    return nothing
end
end # muladd

# From here on, this file contains specializations of DG methods on the
# TreeMesh2D to the compressible Euler equations.
#
# The specialized methods contain relatively verbose and ugly code in the sense
# that we do not use the common "pointwise physics" interface of Trixi.jl.
# However,
# this is currently (November 2021) necessary to get a significant
# speed-up by using SIMD instructions via LoopVectorization.jl.
#
# TODO: SIMD. We could get even more speed-up if we did not need to permute
# array dimensions, i.e., if we changed the basic memory layout.
#
# We do not wrap this code in `@muladd begin ... end` block. Optimizations like
# this are handled automatically by LoopVectorization.jl.

# We specialize on `PtrArray` since these will be returned by `Trixi.wrap_array`
# if LoopVectorization.jl can handle the array types. This ensures that `@turbo`
# works efficiently here.
@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
                                           element, MeshT::Type{<:TreeMesh{2}},
                                           have_nonconservative_terms::False,
                                           equations::CompressibleEulerEquations2D,
                                           volume_flux::typeof(flux_shima_etal_turbo),
                                           dg::DGSEM, cache, alpha)
    @unpack derivative_split = dg.basis

    # Create a temporary array that will be used to store the RHS with permuted
    # indices `[i, j, v]` to allow using SIMD instructions.
    # `StrideArray`s with purely static dimensions do not allocate on the heap.
    du = StrideArray{eltype(u_cons)}(undef,
                                     (ntuple(_ -> StaticInt(nnodes(dg)),
                                             ndims(MeshT))...,
                                      StaticInt(nvariables(equations))))

    # Convert conserved to primitive variables on the given `element`.
    u_prim = StrideArray{eltype(u_cons)}(undef,
                                         (ntuple(_ -> StaticInt(nnodes(dg)),
                                                 ndims(MeshT))...,
                                          StaticInt(nvariables(equations))))

    @turbo for j in eachnode(dg), i in eachnode(dg)
        rho = u_cons[1, i, j, element]
        rho_v1 = u_cons[2, i, j, element]
        rho_v2 = u_cons[3, i, j, element]
        rho_e_total = u_cons[4, i, j, element]

        v1 = rho_v1 / rho
        v2 = rho_v2 / rho
        p = (equations.gamma - 1) *
            (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2))

        u_prim[i, j, 1] = rho
        u_prim[i, j, 2] = v1
        u_prim[i, j, 3] = v2
        u_prim[i, j, 4] = p
    end

    # x direction
    # At first, we create new temporary arrays with permuted memory layout to
    # allow using SIMD instructions along the first dimension (which is
    # contiguous in memory).
    du_permuted = StrideArray{eltype(u_cons)}(undef,
                                              (StaticInt(nnodes(dg)),
                                               StaticInt(nnodes(dg)),
                                               StaticInt(nvariables(equations))))

    u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
                                                  (StaticInt(nnodes(dg)),
                                                   StaticInt(nnodes(dg)),
                                                   StaticInt(nvariables(equations))))

    @turbo for v in eachvariable(equations),
               j in eachnode(dg),
               i in eachnode(dg)

        u_prim_permuted[j, i, v] = u_prim[i, j, v]
    end
    fill!(du_permuted, zero(eltype(du_permuted)))

    # Next, we basically inline the volume flux. To allow SIMD vectorization and
    # still use the symmetry of the volume flux and the derivative matrix, we
    # loop over the triangular part in an outer loop and use a plain inner loop.
    for i in eachnode(dg), ii in (i + 1):nnodes(dg)
        @turbo for j in eachnode(dg)
            rho_ll = u_prim_permuted[j, i, 1]
            v1_ll = u_prim_permuted[j, i, 2]
            v2_ll = u_prim_permuted[j, i, 3]
            p_ll = u_prim_permuted[j, i, 4]

            rho_rr = u_prim_permuted[j, ii, 1]
            v1_rr = u_prim_permuted[j, ii, 2]
            v2_rr = u_prim_permuted[j, ii, 3]
            p_rr = u_prim_permuted[j, ii, 4]

            # Compute required mean values
            rho_avg = 0.5 * (rho_ll + rho_rr)
            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)
            pv1_avg = 0.5 * (p_ll * v1_rr + p_rr * v1_ll)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_avg * v1_avg
            f2 = f1 * v1_avg + p_avg
            f3 = f1 * v2_avg
            f4 = p_avg * v1_avg * equations.inv_gamma_minus_one +
                 f1 * kin_avg + pv1_avg

            # Add scaled fluxes to RHS
            factor_i = alpha * derivative_split[i, ii]
            du_permuted[j, i, 1] += factor_i * f1
            du_permuted[j, i, 2] += factor_i * f2
            du_permuted[j, i, 3] += factor_i * f3
            du_permuted[j, i, 4] += factor_i * f4

            factor_ii = alpha * derivative_split[ii, i]
            du_permuted[j, ii, 1] += factor_ii * f1
            du_permuted[j, ii, 2] += factor_ii * f2
            du_permuted[j, ii, 3] += factor_ii * f3
            du_permuted[j, ii, 4] += factor_ii * f4
        end
    end

    @turbo for v in eachvariable(equations),
               j in eachnode(dg),
               i in eachnode(dg)

        du[i, j, v] = du_permuted[j, i, v]
    end

    # y direction
    # The memory layout is already optimal for SIMD vectorization in this loop.
    for j in eachnode(dg), jj in (j + 1):nnodes(dg)
        @turbo for i in eachnode(dg)
            rho_ll = u_prim[i, j, 1]
            v1_ll = u_prim[i, j, 2]
            v2_ll = u_prim[i, j, 3]
            p_ll = u_prim[i, j, 4]

            rho_rr = u_prim[i, jj, 1]
            v1_rr = u_prim[i, jj, 2]
            v2_rr = u_prim[i, jj, 3]
            p_rr = u_prim[i, jj, 4]

            # Compute required mean values
            rho_avg = 0.5 * (rho_ll + rho_rr)
            v1_avg = 0.5 * (v1_ll + v1_rr)
            v2_avg = 0.5 * (v2_ll + v2_rr)
            p_avg = 0.5 * (p_ll + p_rr)
            kin_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)
            pv2_avg = 0.5 * (p_ll * v2_rr + p_rr * v2_ll)

            # Calculate fluxes depending on Cartesian orientation
            f1 = rho_avg * v2_avg
            f2 = f1 * v1_avg
            f3 = f1 * v2_avg + p_avg
            f4 = p_avg * v2_avg * equations.inv_gamma_minus_one +
                 f1 * kin_avg + pv2_avg

            # Add scaled fluxes to RHS
            factor_j = alpha * derivative_split[j, jj]
            du[i, j, 1] += factor_j * f1
            du[i, j, 2] += factor_j * f2
            du[i, j, 3] += factor_j * f3
            du[i, j, 4] += factor_j * f4

            factor_jj = alpha * derivative_split[jj, j]
            du[i, jj, 1] += factor_jj * f1
            du[i, jj, 2] += factor_jj * f2
            du[i, jj, 3] += factor_jj * f3
            du[i, jj, 4] += factor_jj * f4
        end
    end

    # Finally, we add the temporary RHS computed here to the global RHS in the
    # given `element`.
    @turbo for v in eachvariable(equations),
               j in eachnode(dg),
               i in eachnode(dg)

        _du[v, i, j, element] += du[i, j, v]
    end
end

@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray,
                                           element, MeshT::Type{<:TreeMesh{2}},
                                           have_nonconservative_terms::False,
                                           equations::CompressibleEulerEquations2D,
                                           volume_flux::typeof(flux_ranocha_turbo),
                                           dg::DGSEM, cache,
alpha)234@unpack derivative_split = dg.basis235236# Create a temporary array that will be used to store the RHS with permuted237# indices `[i, j, v]` to allow using SIMD instructions.238# `StrideArray`s with purely static dimensions do not allocate on the heap.239du = StrideArray{eltype(u_cons)}(undef,240(ntuple(_ -> StaticInt(nnodes(dg)), ndims(MeshT))...,241StaticInt(nvariables(equations))))242243# Convert conserved to primitive variables on the given `element`. In addition244# to the usual primitive variables, we also compute logarithms of the density245# and pressure to increase the performance of the required logarithmic mean246# values.247u_prim = StrideArray{eltype(u_cons)}(undef,248(ntuple(_ -> StaticInt(nnodes(dg)),249ndims(MeshT))...,250StaticInt(nvariables(equations) + 2))) # We also compute "+ 2" logs251252@turbo for j in eachnode(dg), i in eachnode(dg)253rho = u_cons[1, i, j, element]254rho_v1 = u_cons[2, i, j, element]255rho_v2 = u_cons[3, i, j, element]256rho_e_total = u_cons[4, i, j, element]257258v1 = rho_v1 / rho259v2 = rho_v2 / rho260p = (equations.gamma - 1) * (rho_e_total - 0.5 * (rho_v1 * v1 + rho_v2 * v2))261262u_prim[i, j, 1] = rho263u_prim[i, j, 2] = v1264u_prim[i, j, 3] = v2265u_prim[i, j, 4] = p266u_prim[i, j, 5] = log(rho)267u_prim[i, j, 6] = log(p)268end269270# x direction271# At first, we create new temporary arrays with permuted memory layout to272# allow using SIMD instructions along the first dimension (which is contiguous273# in memory).274du_permuted = StrideArray{eltype(u_cons)}(undef,275(StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),276StaticInt(nvariables(equations))))277278u_prim_permuted = StrideArray{eltype(u_cons)}(undef,279(StaticInt(nnodes(dg)),280StaticInt(nnodes(dg)),281StaticInt(nvariables(equations) + 2)))282283@turbo for v in indices(u_prim, 3), # v in eachvariable(equations) misses +2 logs284j in eachnode(dg),285i in eachnode(dg)286287u_prim_permuted[j, i, v] = u_prim[i, j, v]288end289fill!(du_permuted, 
zero(eltype(du_permuted)))290291# Next, we basically inline the volume flux. To allow SIMD vectorization and292# still use the symmetry of the volume flux and the derivative matrix, we293# loop over the triangular part in an outer loop and use a plain inner loop.294for i in eachnode(dg), ii in (i + 1):nnodes(dg)295@turbo for j in eachnode(dg)296rho_ll = u_prim_permuted[j, i, 1]297v1_ll = u_prim_permuted[j, i, 2]298v2_ll = u_prim_permuted[j, i, 3]299p_ll = u_prim_permuted[j, i, 4]300log_rho_ll = u_prim_permuted[j, i, 5]301log_p_ll = u_prim_permuted[j, i, 6]302303rho_rr = u_prim_permuted[j, ii, 1]304v1_rr = u_prim_permuted[j, ii, 2]305v2_rr = u_prim_permuted[j, ii, 3]306p_rr = u_prim_permuted[j, ii, 4]307log_rho_rr = u_prim_permuted[j, ii, 5]308log_p_rr = u_prim_permuted[j, ii, 6]309310# Compute required mean values311# We inline the logarithmic mean to allow LoopVectorization.jl to optimize312# it efficiently. This is equivalent to313# rho_mean = ln_mean(rho_ll, rho_rr)314x1 = rho_ll315log_x1 = log_rho_ll316y1 = rho_rr317log_y1 = log_rho_rr318x1_plus_y1 = x1 + y1319y1_minus_x1 = y1 - x1320z1 = y1_minus_x1^2 / x1_plus_y1^2321special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))322regular_path1 = y1_minus_x1 / (log_y1 - log_x1)323rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)324325# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`326# in exact arithmetic since327# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)328# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)329# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)330x2 = rho_ll * p_rr331log_x2 = log_rho_ll + log_p_rr332y2 = rho_rr * p_ll333log_y2 = log_rho_rr + log_p_ll334x2_plus_y2 = x2 + y2335y2_minus_x2 = y2 - x2336z2 = y2_minus_x2^2 / x2_plus_y2^2337special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2338regular_path2 = (log_y2 - log_x2) / y2_minus_x2339inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, 
special_path2, regular_path2)340341v1_avg = 0.5 * (v1_ll + v1_rr)342v2_avg = 0.5 * (v2_ll + v2_rr)343p_avg = 0.5 * (p_ll + p_rr)344velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)345346# Calculate fluxes depending on Cartesian orientation347f1 = rho_mean * v1_avg348f2 = f1 * v1_avg + p_avg349f3 = f1 * v2_avg350f4 = f1 *351(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +3520.5 * (p_ll * v1_rr + p_rr * v1_ll)353354# Add scaled fluxes to RHS355factor_i = alpha * derivative_split[i, ii]356du_permuted[j, i, 1] += factor_i * f1357du_permuted[j, i, 2] += factor_i * f2358du_permuted[j, i, 3] += factor_i * f3359du_permuted[j, i, 4] += factor_i * f4360361factor_ii = alpha * derivative_split[ii, i]362du_permuted[j, ii, 1] += factor_ii * f1363du_permuted[j, ii, 2] += factor_ii * f2364du_permuted[j, ii, 3] += factor_ii * f3365du_permuted[j, ii, 4] += factor_ii * f4366end367end368369@turbo for v in eachvariable(equations),370j in eachnode(dg),371i in eachnode(dg)372373du[i, j, v] = du_permuted[j, i, v]374end375376# y direction377# The memory layout is already optimal for SIMD vectorization in this loop.378for j in eachnode(dg), jj in (j + 1):nnodes(dg)379@turbo for i in eachnode(dg)380rho_ll = u_prim[i, j, 1]381v1_ll = u_prim[i, j, 2]382v2_ll = u_prim[i, j, 3]383p_ll = u_prim[i, j, 4]384log_rho_ll = u_prim[i, j, 5]385log_p_ll = u_prim[i, j, 6]386387rho_rr = u_prim[i, jj, 1]388v1_rr = u_prim[i, jj, 2]389v2_rr = u_prim[i, jj, 3]390p_rr = u_prim[i, jj, 4]391log_rho_rr = u_prim[i, jj, 5]392log_p_rr = u_prim[i, jj, 6]393394# Compute required mean values395# We inline the logarithmic mean to allow LoopVectorization.jl to optimize396# it efficiently. 
This is equivalent to397# rho_mean = ln_mean(rho_ll, rho_rr)398x1 = rho_ll399log_x1 = log_rho_ll400y1 = rho_rr401log_y1 = log_rho_rr402x1_plus_y1 = x1 + y1403y1_minus_x1 = y1 - x1404z1 = y1_minus_x1^2 / x1_plus_y1^2405special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))406regular_path1 = y1_minus_x1 / (log_y1 - log_x1)407rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)408409# Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`410# in exact arithmetic since411# log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)412# = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)413# inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)414x2 = rho_ll * p_rr415log_x2 = log_rho_ll + log_p_rr416y2 = rho_rr * p_ll417log_y2 = log_rho_rr + log_p_ll418x2_plus_y2 = x2 + y2419y2_minus_x2 = y2 - x2420z2 = y2_minus_x2^2 / x2_plus_y2^2421special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2422regular_path2 = (log_y2 - log_x2) / y2_minus_x2423inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)424425v1_avg = 0.5 * (v1_ll + v1_rr)426v2_avg = 0.5 * (v2_ll + v2_rr)427p_avg = 0.5 * (p_ll + p_rr)428velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr)429430# Calculate fluxes depending on Cartesian orientation431f1 = rho_mean * v2_avg432f2 = f1 * v1_avg433f3 = f1 * v2_avg + p_avg434f4 = f1 *435(velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) +4360.5 * (p_ll * v2_rr + p_rr * v2_ll)437438# Add scaled fluxes to RHS439factor_j = alpha * derivative_split[j, jj]440du[i, j, 1] += factor_j * f1441du[i, j, 2] += factor_j * f2442du[i, j, 3] += factor_j * f3443du[i, j, 4] += factor_j * f4444445factor_jj = alpha * derivative_split[jj, j]446du[i, jj, 1] += factor_jj * f1447du[i, jj, 2] += factor_jj * f2448du[i, jj, 3] += factor_jj * f3449du[i, jj, 4] += factor_jj * f4450end451end452453# Finally, we add the temporary RHS computed here to the global RHS in the454# given 
`element`.455@turbo for v in eachvariable(equations),456j in eachnode(dg),457i in eachnode(dg)458459_du[v, i, j, element] += du[i, j, v]460end461end462463464