! Copyright (c) 2012-2013 Manuel Hasert <m.hasert@grs-sim.de>
! Copyright (c) 2012-2017, 2019-2020 Kannan Masilamani <kannan.masilamani@uni-siegen.de>
! Copyright (c) 2012-2013 Simon Zimny <s.zimny@grs-sim.de>
! Copyright (c) 2014-2016 Jiaxing Qi <jiaxing.qi@uni-siegen.de>
! Copyright (c) 2016 Tobias Schneider <tobias1.schneider@student.uni-siegen.de>
! Copyright (c) 2017, 2019 Harald Klimach <harald.klimach@uni-siegen.de>
! Copyright (c) 2018 Raphael Haupt <raphael.haupt@uni-siegen.de>
! Copyright (c) 2020 Peter Vitt <peter.vitt2@uni-siegen.de>
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions are met:
!
! 1. Redistributions of source code must retain the above copyright notice,
! this list of conditions and the following disclaimer.
!
! 2. Redistributions in binary form must reproduce the above copyright notice,
! this list of conditions and the following disclaimer in the documentation
! and/or other materials provided with the distribution.
!
! THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF SIEGEN “AS IS” AND ANY EXPRESS
! OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
! OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
! IN NO EVENT SHALL UNIVERSITY OF SIEGEN OR CONTRIBUTORS BE LIABLE FOR ANY
! DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
! (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
! ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
! SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
! ****************************************************************************** !
!> author: Jiaxing Qi
!! Routines and parameter definitions for the standard D3Q27 model
?? include 'header/lbm_macros.inc'
?? include 'header/lbm_d3q27Macros.inc'
module mus_d3q27_module
  use iso_c_binding,            only: c_f_pointer

  ! include treelm modules
  use env_module,            only: rk
  use tem_varSys_module,     only: tem_varSys_type, tem_varSys_op_type
  use tem_param_module,      only: div1_2, div1_3, div1_36, div1_8, div3_4h, &
    &                              div2_3, cs2inv, cs4inv, t2cs2inv, t2cs4inv, &
    &                              rho0
  use tem_logging_module,    only: logUnit

  ! include musubi modules
  use mus_field_prop_module,    only: mus_field_prop_type
  use mus_scheme_layout_module, only: mus_scheme_layout_type
  use mus_param_module,         only: mus_param_type
  use mus_varSys_module,        only: mus_varSys_data_type
  use mus_derVarPos_module,     only: mus_derVarPos_type

  implicit none

  private

  public :: feq_d3q27
  public :: bgk_improved_advRel_d3q27
  public :: bgk_advRel_d3q27
  public :: bgk_advRel_d3q27_incomp
  public :: central_moment
  public :: central_moment_split
  public :: cm_to_pdf
  public :: cumulant_d3q27
  public :: cascaded_d3q27
  public :: trt_advRel_d3q27

  integer,parameter :: QQ   = 27  !< number of pdf directions
?? copy :: q27

contains

! ****************************************************************************** !
  !> D3Q27 equilibirium definition in product form.
  !! Given in paper: The cumulant lattice Boltzmann equation in three dimensions:
  !! Theory and validation, Martin Geier .et al,
  !! Computers and Mathematics with Applicaitons, 2015.
  !! @todo: this can be optimized
  pure function feq_d3q27( i, j, k, rho, u, v, w ) result ( eq )
    !> i, j, k are direction index, taking the value of -1, 0, 1
    integer,       intent(in) :: i, j, k
    !> density and velocity
    real(kind=rk), intent(in) :: rho, u, v, w
    !> equilibirium density
    real(kind=rk)             :: eq
    real(kind=rk)             :: rho_part, u_part, v_part, w_part, cs

    cs = div1_3

    rho_part = -rho * (-1)**(abs(i)+abs(j)+abs(k)) &
      &        / ( (abs(i)+1) * (abs(j)+1) * (abs(k)+1) )

    u_part = abs(i) - 1 + cs + i*u + u*u
    v_part = abs(j) - 1 + cs + j*v + v*v
    w_part = abs(k) - 1 + cs + k*w + w*w

    eq = rho_part * u_part * v_part * w_part

  end function feq_d3q27
! ****************************************************************************** !


! ****************************************************************************** !
  !> Improved BGK model (with Galilean correction term)
  !! taken from Martin Geier cumulent paper 2015
  !! Geier, M., Schönherr, M., Pasquali, A., & Krafczyk, M. (2015).
  !! The cumulant lattice Boltzmann equation in three dimensions : Theory and
  !! validation. Computers and Mathematics with Applications.
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine bgk_improved_advRel_d3q27( fieldProp, inState, outState,    &
    &                                   auxField, neigh, nElems, nSolve, &
    &                                   level, layout, params, varSys,   &
    &                                   derVarPos                        )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer :: iElem
    integer :: nScalars
    real(kind=rk) :: f(-1:1,-1:1,-1:1)
    real(kind=rk) :: u, v, w , u2, v2, w2
    real(kind=rk) :: rho, rho_omg
    real(kind=rk) :: inv_rho
    real(kind=rk) :: omega, cmpl_o, fac
    real(kind=rk) :: sumX1, sumXN, X0, X1, XN
    real(kind=rk) :: sumY1, sumYN, Y0, Y1, YN
    real(kind=rk) :: sumZ1, sumZN, Z0, Z1, ZN
    real(kind=rk) :: m200, m020, m002
    real(kind=rk) :: Gx, Gy, Gz
    real(kind=rk) :: x0y0, x0y1, x0yn, x1y0, xny0, x1y1, x1yn, xny1, xnyn
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

    nScalars = varSys%nScalars

!$omp do schedule(static)
    !NEC$ ivdep
?? IF (SOA) THEN
?? copy :: dir_vector
?? ELSE
?? copy :: dir_novec
?? END IF
    nodeloop: do iElem = 1, nSolve

      f(-1, 0, 0) = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0,-1, 0) = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 0,-1) = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0, 0) = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1, 0) = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 0, 1) = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))

      f( 0,-1,-1) = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0,-1, 1) = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1,-1) = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1, 1) = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 0,-1) = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0,-1) = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 0, 1) = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0, 1) = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1,-1, 0) = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1, 0) = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1, 0) = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1, 0) = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))

      f(-1,-1,-1) = inState(?FETCH?( qNNN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1,-1, 1) = inState(?FETCH?( qNN1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1,-1) = inState(?FETCH?( qN1N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1, 1) = inState(?FETCH?( qN11, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1,-1) = inState(?FETCH?( q1NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1, 1) = inState(?FETCH?( q1N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1,-1) = inState(?FETCH?( q11N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1, 1) = inState(?FETCH?( q111, 1, iElem, QQ, nScalars, nElems,neigh ))

      f( 0, 0, 0) = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      u = auxField(elemOff + vel_pos(1))
      v = auxField(elemOff + vel_pos(2))
      w = auxField(elemOff + vel_pos(3))

      ! u v square
      u2 = u*u
      v2 = v*v
      w2 = w*w

      sumX1 = sum(f( 1, :, :))
      sumXN = sum(f(-1, :, :))

      sumY1 = sum(f(:, 1, :))
      sumYN = sum(f(:,-1, :))

      sumZ1 = sum(f(:, :, 1))
      sumZN = sum(f(:, :,-1))

      ! second moments, by equation A.7, A.8 and A.9
      inv_rho = 1.0_rk / rho
      m200 = (sumX1 + sumXN) * inv_rho
      m020 = (sumY1 + sumYN) * inv_rho
      m002 = (sumZ1 + sumZN) * inv_rho

      ! relaxation rate
      omega  = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
      cmpl_o = 1._rk - omega
      fac    = 4.5_rk - 2.25_rk * omega

      !----------------------------------------------------------------
      ! calculate Galilean correction term, by equation A.13, A.14, A.15
      Gx = fac * u2 * ( m200 - div1_3 - u2 )
      Gy = fac * v2 * ( m020 - div1_3 - v2 )
      Gz = fac * w2 * ( m002 - div1_3 - w2 )
      !----------------------------------------------------------------

      ! X Y Z components of eq
      ! by equation A.19 - A.21
      X0 = -div2_3 + u2 + Gx
      X1 = - ( X0 + 1.0_rk + u ) * 0.5_rk
      XN = X1 + u

      Y0 = -div2_3 + v2 + Gy
      Y1 = - ( Y0 + 1.0_rk + v ) * 0.5_rk
      YN = Y1 + v

      Z0 = -div2_3 + w2 + Gz
      Z1 = - ( Z0 + 1.0_rk + w ) * 0.5_rk
      ZN = Z1 + w

      ! rho * omega
      rho_omg = rho * omega
      X0 = -rho_omg * X0
      X1 = -rho_omg * X1
      XN = -rho_omg * XN

! fEq000 = X0 * Y0 * Z0
! fEq00N = X0 * Y0 * ZN
! fEq001 = X0 * Y0 * Z1
X0Y0 = X0 * Y0
outState( ?SAVE?( q000,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 0, 0, 0) + X0Y0*Z0
outState( ?SAVE?( q00N,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 0, 0,-1) + X0Y0*ZN
outState( ?SAVE?( q001,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 0, 0, 1) + X0Y0*Z1

! fEq010 = X0 * Y1 * Z0
! fEq011 = X0 * Y1 * Z1
! fEq01N = X0 * Y1 * ZN
X0Y1 = X0 * Y1
outState( ?SAVE?( q010,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 0, 1, 0) + X0Y1*Z0
outState( ?SAVE?( q011,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 0, 1, 1) + X0Y1*Z1
outState( ?SAVE?( q01N,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 0, 1,-1) + X0Y1*ZN

! fEq0N0 = X0 * YN * Z0
! fEq0N1 = X0 * YN * Z1
! fEq0NN = X0 * YN * ZN
X0YN = X0 * YN
outState( ?SAVE?( q0N0,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 0,-1, 0) + X0YN*Z0
outState( ?SAVE?( q0N1,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 0,-1, 1) + X0YN*Z1
outState( ?SAVE?( q0NN,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 0,-1,-1) + X0YN*ZN

! fEq100 = X1 * Y0 * Z0
! fEq10N = X1 * Y0 * ZN
! fEq101 = X1 * Y0 * Z1
X1Y0 = X1 * Y0
outState( ?SAVE?( q100,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 1, 0, 0) + X1Y0*Z0
outState( ?SAVE?( q101,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 1, 0, 1) + X1Y0*Z1
outState( ?SAVE?( q10N,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 1, 0,-1) + X1Y0*ZN

! fEqN00 = XN * Y0 * Z0
! fEqN01 = XN * Y0 * Z1
! fEqN0N = XN * Y0 * ZN
XNY0 = XN * Y0
outState( ?SAVE?( qN00,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1, 0, 0) + XNY0*Z0
outState( ?SAVE?( qN01,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1, 0, 1) + XNY0*Z1
outState( ?SAVE?( qN0N,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1, 0,-1) + XNY0*ZN

! fEq110 = X1 * Y1 * Z0
! fEq111 = X1 * Y1 * Z1
! fEq11N = X1 * Y1 * ZN
X1Y1 = X1 * Y1
outState( ?SAVE?( q110,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 1, 1, 0) + X1Y1*Z0
outState( ?SAVE?( q111,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 1, 1, 1) + X1Y1*Z1
outState( ?SAVE?( q11N,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 1, 1,-1) + X1Y1*ZN

! fEq1N0 = X1 * YN * Z0
! fEq1N1 = X1 * YN * Z1
! fEq1NN = X1 * YN * ZN
X1YN = X1 * YN
outState( ?SAVE?( q1N0,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 1,-1, 0) + X1YN*Z0
outState( ?SAVE?( q1N1,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 1,-1, 1) + X1YN*Z1
outState( ?SAVE?( q1NN,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f( 1,-1,-1) + X1YN*ZN

! fEqN10 = XN * Y1 * Z0
! fEqN11 = XN * Y1 * Z1
! fEqN1N = XN * Y1 * ZN
XNY1 = XN * Y1
outState( ?SAVE?( qN10,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1, 1, 0) + XNY1*Z0
outState( ?SAVE?( qN11,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1, 1, 1) + XNY1*Z1
outState( ?SAVE?( qN1N,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1, 1,-1) + XNY1*ZN

! fEqNN0 = XN * YN * Z0
! fEqNN1 = XN * YN * Z1
! fEqNNN = XN * YN * ZN
XNYN = XN * YN
outState( ?SAVE?( qNN0,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1,-1, 0) + XNYN*Z0
outState( ?SAVE?( qNN1,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1,-1, 1) + XNYN*Z1
outState( ?SAVE?( qNNN,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1,-1,-1) + XNYN*ZN

    end do nodeloop
!$omp end do nowait

  end subroutine bgk_improved_advRel_d3q27
! ****************************************************************************** !


! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q27 model with BGK with
  !! standard equilibrium function
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine bgk_advRel_d3q27( fieldProp, inState, outState, auxField, &
    &                          neigh, nElems, nSolve, level, layout,   &
    &                          params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer :: iElem, iDir, nScalars
    real(kind=rk) :: pdfTmp(27)
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    ! derived constants
    real(kind=rk) :: usq, ucx
    real(kind=rk) :: fEq(27) !< equilibrium distribution
    real(kind=rk) :: omega
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

    nScalars = varSys%nScalars

    !$omp do schedule(static)

    !NEC$ ivdep
?? IF (SOA) THEN
?? copy :: dir_vector
?? ELSE
?? copy :: dir_novec
?? END IF

    nodeloop: do iElem = 1,nSolve
      ! First load all local values into temp array
      ! Generic! PUSH+PULL is possible
      pdfTmp( qN00 ) = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q0N0 ) = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q00N ) = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q100 ) = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q010 ) = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q001 ) = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))

      pdfTmp( q0NN ) = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q0N1 ) = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q01N ) = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q011 ) = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qN0N ) = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q10N ) = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qN01 ) = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q101 ) = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qNN0 ) = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qN10 ) = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q1N0 ) = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q110 ) = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))

      pdfTmp( qNNN ) = inState(?FETCH?( qNNN, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qNN1 ) = inState(?FETCH?( qNN1, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qN1N ) = inState(?FETCH?( qN1N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qN11 ) = inState(?FETCH?( qN11, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q1NN ) = inState(?FETCH?( q1NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q1N1 ) = inState(?FETCH?( q1N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q11N ) = inState(?FETCH?( q11N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q111 ) = inState(?FETCH?( q111, 1, iElem, QQ, nScalars, nElems,neigh ))

      pdfTmp( q000 ) = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      u_x = auxField(elemOff + vel_pos(1))
      u_y = auxField(elemOff + vel_pos(2))
      u_z = auxField(elemOff + vel_pos(3))


      ! Calculate the equilibrium distribution function
      !fEq(:) = getEquilibrium( density = rho, velocity = (/u_x, u_y, u_z /), layout = layout)
      ! square of velocity
      usq = u_x*u_x + u_y*u_y + u_z*u_z
      do iDir = 1, QQ

        ! velocity times lattice unit velocity
        ucx =   layout%fStencil%cxDirRK(1,iDir) * u_x       &
          &   + layout%fStencil%cxDirRK(2,iDir) * u_y       &
          &   + layout%fStencil%cxDirRK(3,iDir) * u_z

        ! calculate equilibrium density
        fEq( iDir ) = layout%weight( iDir ) * rho * ( 1._rk + ucx*cs2inv        &
          &         + ucx*ucx*t2cs4inv - usq*t2cs2inv )

      enddo

      ! omega
      omega = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)

      ! relaxation
      outState(?SAVE?(qN00,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( qN00 ) - omega*(pdfTmp( qN00 ) - fEq( qN00 ))
      outState(?SAVE?(q0N0,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( q0N0 ) - omega*(pdfTmp( q0N0 ) - fEq( q0N0 ))
      outState(?SAVE?(q00N,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( q00N ) - omega*(pdfTmp( q00N ) - fEq( q00N ))
      outState(?SAVE?(q100,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( q100 ) - omega*(pdfTmp( q100 ) - fEq( q100 ))
      outState(?SAVE?(q010,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( q010 ) - omega*(pdfTmp( q010 ) - fEq( q010 ))
      outState(?SAVE?(q001,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( q001 ) - omega*(pdfTmp( q001 ) - fEq( q001 ))
      outState(?SAVE?(q0NN,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q0NN ) - omega*(pdfTmp( q0NN ) - fEq( q0NN ))
      outState(?SAVE?(q0N1,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q0N1 ) - omega*(pdfTmp( q0N1 ) - fEq( q0N1 ))
      outState(?SAVE?(q01N,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q01N ) - omega*(pdfTmp( q01N ) - fEq( q01N ))
      outState(?SAVE?(q011,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q011 ) - omega*(pdfTmp( q011 ) - fEq( q011 ))
      outState(?SAVE?(qN0N,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qN0N ) - omega*(pdfTmp( qN0N ) - fEq( qN0N ))
      outState(?SAVE?(q10N,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q10N ) - omega*(pdfTmp( q10N ) - fEq( q10N ))
      outState(?SAVE?(qN01,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qN01 ) - omega*(pdfTmp( qN01 ) - fEq( qN01 ))
      outState(?SAVE?(q101,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q101 ) - omega*(pdfTmp( q101 ) - fEq( q101 ))
      outState(?SAVE?(qNN0,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qNN0 ) - omega*(pdfTmp( qNN0 ) - fEq( qNN0 ))
      outState(?SAVE?(qN10,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qN10 ) - omega*(pdfTmp( qN10 ) - fEq( qN10 ))
      outState(?SAVE?(q1N0,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q1N0 ) - omega*(pdfTmp( q1N0 ) - fEq( q1N0 ))
      outState(?SAVE?(q110,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q110 ) - omega*(pdfTmp( q110 ) - fEq( q110 ))

      outState(?SAVE?(qNNN,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qNNN ) - omega*(pdfTmp( qNNN ) - fEq( qNNN ))
      outState(?SAVE?(qNN1,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qNN1 ) - omega*(pdfTmp( qNN1 ) - fEq( qNN1 ))
      outState(?SAVE?(qN1N,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qN1N ) - omega*(pdfTmp( qN1N ) - fEq( qN1N ))
      outState(?SAVE?(qN11,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qN11 ) - omega*(pdfTmp( qN11 ) - fEq( qN11 ))
      outState(?SAVE?(q1NN,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q1NN ) - omega*(pdfTmp( q1NN ) - fEq( q1NN ))
      outState(?SAVE?(q1N1,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q1N1 ) - omega*(pdfTmp( q1N1 ) - fEq( q1N1 ))
      outState(?SAVE?(q11N,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q11N ) - omega*(pdfTmp( q11N ) - fEq( q11N ))
      outState(?SAVE?(q111,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q111 ) - omega*(pdfTmp( q111 ) - fEq( q111 ))

      outState(?SAVE?(q000,1,iElem,QQ,QQ,nElems,neigh)) =                      &
        &      pdfTmp( q000 ) - omega*(pdfTmp( q000 ) - fEq( q000 ))
    enddo nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d3q27
! ****************************************************************************** !

! ****************************************************************************** !
  !> Advection relaxation routine for the D3Q27 model with BGK with
  !! standard equilibrium function for incompressible model
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine bgk_advRel_d3q27_incomp( fieldProp, inState, outState, auxField, &
    &                                 neigh, nElems, nSolve, level, layout,   &
    &                                 params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer :: iElem, iDir, nScalars
    real(kind=rk) :: pdfTmp(27)
    real(kind=rk) :: rho     ! local density
    real(kind=rk) :: u_x     ! local x-velocity
    real(kind=rk) :: u_y     ! local y-velocity
    real(kind=rk) :: u_z     ! local z-velocity
    ! derived constants
    real(kind=rk) :: usq, ucx
    real(kind=rk) :: fEq(27) !< equilibrium distribution
    real(kind=rk) :: omega
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

    nScalars = varSys%nScalars

    !$omp do schedule(static)

    !NEC$ ivdep
?? IF (SOA) THEN
?? copy :: dir_vector
?? ELSE
?? copy :: dir_novec
?? END IF

    nodeloop: do iElem = 1,nSolve
      ! First load all local values into temp array
      ! Generic! PUSH+PULL is possible
      pdfTmp( qN00 ) = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q0N0 ) = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q00N ) = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q100 ) = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q010 ) = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q001 ) = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))

      pdfTmp( q0NN ) = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q0N1 ) = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q01N ) = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q011 ) = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qN0N ) = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q10N ) = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qN01 ) = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q101 ) = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qNN0 ) = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qN10 ) = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q1N0 ) = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q110 ) = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))

      pdfTmp( qNNN ) = inState(?FETCH?( qNNN, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qNN1 ) = inState(?FETCH?( qNN1, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qN1N ) = inState(?FETCH?( qN1N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( qN11 ) = inState(?FETCH?( qN11, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q1NN ) = inState(?FETCH?( q1NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q1N1 ) = inState(?FETCH?( q1N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q11N ) = inState(?FETCH?( q11N, 1, iElem, QQ, nScalars, nElems,neigh ))
      pdfTmp( q111 ) = inState(?FETCH?( q111, 1, iElem, QQ, nScalars, nElems,neigh ))

      pdfTmp( q000 ) = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      u_x = auxField(elemOff + vel_pos(1))
      u_y = auxField(elemOff + vel_pos(2))
      u_z = auxField(elemOff + vel_pos(3))


      ! Calculate the equilibrium distribution function
      !fEq(:) = getEquilibrium( density = rho, velocity = (/u_x, u_y, u_z /), layout = layout)
      ! square of velocity
      usq = u_x*u_x + u_y*u_y + u_z*u_z
      do iDir = 1, QQ

        ! velocity times lattice unit velocity
        ucx =   layout%fStencil%cxDirRK(1,iDir) * u_x       &
          &   + layout%fStencil%cxDirRK(2,iDir) * u_y       &
          &   + layout%fStencil%cxDirRK(3,iDir) * u_z

        ! calculate equilibrium density
        fEq( iDir ) = layout%weight( iDir ) * ( rho + rho0*(ucx*cs2inv        &
          &         + ucx*ucx*t2cs4inv - usq*t2cs2inv ) )

      enddo

      ! omega
      omega = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)

      ! relaxation
      outState(?SAVE?(qN00,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( qN00 ) - omega*(pdfTmp( qN00 ) - fEq( qN00 ))
      outState(?SAVE?(q0N0,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( q0N0 ) - omega*(pdfTmp( q0N0 ) - fEq( q0N0 ))
      outState(?SAVE?(q00N,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( q00N ) - omega*(pdfTmp( q00N ) - fEq( q00N ))
      outState(?SAVE?(q100,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( q100 ) - omega*(pdfTmp( q100 ) - fEq( q100 ))
      outState(?SAVE?(q010,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( q010 ) - omega*(pdfTmp( q010 ) - fEq( q010 ))
      outState(?SAVE?(q001,1,iElem,QQ,QQ,nElems,neigh )) =                     &
        &      pdfTmp( q001 ) - omega*(pdfTmp( q001 ) - fEq( q001 ))
      outState(?SAVE?(q0NN,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q0NN ) - omega*(pdfTmp( q0NN ) - fEq( q0NN ))
      outState(?SAVE?(q0N1,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q0N1 ) - omega*(pdfTmp( q0N1 ) - fEq( q0N1 ))
      outState(?SAVE?(q01N,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q01N ) - omega*(pdfTmp( q01N ) - fEq( q01N ))
      outState(?SAVE?(q011,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q011 ) - omega*(pdfTmp( q011 ) - fEq( q011 ))
      outState(?SAVE?(qN0N,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qN0N ) - omega*(pdfTmp( qN0N ) - fEq( qN0N ))
      outState(?SAVE?(q10N,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q10N ) - omega*(pdfTmp( q10N ) - fEq( q10N ))
      outState(?SAVE?(qN01,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qN01 ) - omega*(pdfTmp( qN01 ) - fEq( qN01 ))
      outState(?SAVE?(q101,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q101 ) - omega*(pdfTmp( q101 ) - fEq( q101 ))
      outState(?SAVE?(qNN0,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qNN0 ) - omega*(pdfTmp( qNN0 ) - fEq( qNN0 ))
      outState(?SAVE?(qN10,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qN10 ) - omega*(pdfTmp( qN10 ) - fEq( qN10 ))
      outState(?SAVE?(q1N0,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q1N0 ) - omega*(pdfTmp( q1N0 ) - fEq( q1N0 ))
      outState(?SAVE?(q110,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q110 ) - omega*(pdfTmp( q110 ) - fEq( q110 ))

      outState(?SAVE?(qNNN,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qNNN ) - omega*(pdfTmp( qNNN ) - fEq( qNNN ))
      outState(?SAVE?(qNN1,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qNN1 ) - omega*(pdfTmp( qNN1 ) - fEq( qNN1 ))
      outState(?SAVE?(qN1N,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qN1N ) - omega*(pdfTmp( qN1N ) - fEq( qN1N ))
      outState(?SAVE?(qN11,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( qN11 ) - omega*(pdfTmp( qN11 ) - fEq( qN11 ))
      outState(?SAVE?(q1NN,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q1NN ) - omega*(pdfTmp( q1NN ) - fEq( q1NN ))
      outState(?SAVE?(q1N1,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q1N1 ) - omega*(pdfTmp( q1N1 ) - fEq( q1N1 ))
      outState(?SAVE?(q11N,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q11N ) - omega*(pdfTmp( q11N ) - fEq( q11N ))
      outState(?SAVE?(q111,1,iElem,QQ,QQ,nElems,neigh))  =                     &
        &      pdfTmp( q111 ) - omega*(pdfTmp( q111 ) - fEq( q111 ))

      outState(?SAVE?(q000,1,iElem,QQ,QQ,nElems,neigh)) =                      &
        &      pdfTmp( q000 ) - omega*(pdfTmp( q000 ) - fEq( q000 ))
    enddo nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d3q27_incomp
! ****************************************************************************** !

! ****************************************************************************** !
  !> Calculating central moment by spliting among directions.
  !! This follows equations 43, 44, 45 in cumulent paper (Geier .et al 2015)
  !! We first do x direction for better performance.
  pure function central_moment_split( f, a, b, g, ux, uy, uz ) result ( kappa )
    !> PDF
    real(kind=rk), intent(in) :: f(-1:1, -1:1, -1:1)
    !> order of central moments
    integer, intent(in) :: a, b, g
    real(kind=rk), intent(in) :: ux, uy, uz
    ! ---------------------------------------------------------------------------
    real(kind=rk) :: ka(-1:1, -1:1), kb(-1:1)
    real(kind=rk) :: kappa
    integer :: ii, jj, kk
    real(kind=rk), parameter :: ii_rk(-1:1) = [ -1._rk, 0._rk, 1._rk ]
    ! ---------------------------------------------------------------------------

    ka = 0.0_rk
    kb = 0.0_rk
    kappa = 0.0_rk

    do kk = -1, 1
      do jj = -1, 1
        do ii = -1, 1
          ka(jj,kk) = ka(jj,kk) + f( ii, jj, kk ) * ( ( ii_rk(ii) - ux ) ** a )
        end do
        kb(kk) = kb(kk) + ka(jj,kk) * ( ( ii_rk(jj) - uy ) ** b )
      end do
      kappa = kappa + kb(kk) * ( ( ii_rk(kk) - uz ) ** g )
    end do

  end function
! ****************************************************************************** !

! ****************************************************************************** !
  !> Calculating central moment.
  !! This follows equations 21 in cumulent paper ( Geier .et al 2015 )
  pure function central_moment( f, a, b, g, ux, uy, uz ) result ( kappa )
    !> PDF
    real(kind=rk), intent(in) :: f(-1:1, -1:1, -1:1)
    !> order of central moments
    integer, intent(in) :: a, b, g
    real(kind=rk), intent(in) :: ux, uy, uz
    ! ---------------------------------------------------------------------------
    real(kind=rk) :: kappa
    integer :: ii, jj, kk
    real(kind=rk), parameter :: ii_rk(-1:1) = [ -1._rk, 0._rk, 1._rk ]
    ! ---------------------------------------------------------------------------

    kappa = 0.0_rk

    do kk = -1, 1
      do jj = -1, 1
        do ii = -1, 1
          kappa = kappa + f( ii, jj, kk ) * ( ( ii_rk(ii) - ux ) ** a )  &
            &                             * ( ( ii_rk(jj) - uy ) ** b )  &
            &                             * ( ( ii_rk(kk) - uz ) ** g )
        end do
      end do
    end do

  end function
! ****************************************************************************** !

! ****************************************************************************** !
  !> No comment yet!
  !!
  !! TODO Add comment!
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine cumulant_d3q27( fieldProp, inState, outState, auxField, &
    &                        neigh, nElems, nSolve, level, layout,   &
    &                        params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer :: iElem, ii, jj, kk, nScalars
    real(kind=rk) :: ux, uy, uz, rho, inv_rho, omega
    real(kind=rk) :: dxu, dyv, dzw, AA, BB, CC
    ! PDF
    real(kind=rk) :: f(-1:1,-1:1,-1:1)
    ! k = central moment, c = cumulant
    real(kind=rk) :: k(0:2,0:2,0:2), c(0:2,0:2,0:2)
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

    nScalars = varSys%nScalars

    nodeloop: do iElem = 1, nSolve

      ! perform streaming step
      f(-1, 0, 0) = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0,-1, 0) = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 0,-1) = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0, 0) = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1, 0) = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 0, 1) = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0,-1,-1) = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0,-1, 1) = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1,-1) = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1, 1) = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 0,-1) = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0,-1) = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 0, 1) = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0, 1) = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1,-1, 0) = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1, 0) = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1, 0) = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1, 0) = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1,-1,-1) = inState(?FETCH?( qNNN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1,-1, 1) = inState(?FETCH?( qNN1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1,-1) = inState(?FETCH?( qN1N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1, 1) = inState(?FETCH?( qN11, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1,-1) = inState(?FETCH?( q1NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1, 1) = inState(?FETCH?( q1N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1,-1) = inState(?FETCH?( q11N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1, 1) = inState(?FETCH?( q111, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 0, 0) = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      ! access rho and velocity from auxField-----------------------------------
      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      inv_rho = 1.0_rk / rho
      ! local x-, y- and z-velocity
      ux = auxField(elemOff + vel_pos(1))
      uy = auxField(elemOff + vel_pos(2))
      uz = auxField(elemOff + vel_pos(3))
      ! ------------------------------------------------------------------------

      ! Central moments ---------------------------------------------------------
      ! eq 43 - 45
      do kk = 0,2
        do jj = 0,2
          do ii = 0,2
            k( ii, jj, kk ) = central_moment_split( f, ii, jj, kk, ux, uy, uz )
          end do
        end do
      end do
      ! Central moments ---------------------------------------------------------


      ! this omega refers to the w1 in the paper
      ! other relaxation parameters are assumed to be 1, thus omitted
      omega = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)

      ! Cumulants and Collision -------------------------------------------------
      ! Not sure how to calcuate the post collision value for 0th and 1st order
      c(0,0,0) = k(0,0,0)
      c(1,0,0) = k(1,0,0)
      c(0,1,0) = k(0,1,0)
      c(0,0,1) = k(0,0,1)
      ! Relaxation for 2nd order terms
      c(1,1,0) = ( 1.0_rk - omega ) * k(1,1,0)
      c(1,0,1) = ( 1.0_rk - omega ) * k(1,0,1)
      c(0,1,1) = ( 1.0_rk - omega ) * k(0,1,1)

      ! eq 58 - 60
      Dxu = -omega * 0.5_rk * inv_rho * ( 2.0_rk * k(2,0,0) - k(0,2,0) - k(0,0,2))&
        &   - 0.5_rk * inv_rho * ( k(2,0,0) + k(0,2,0) + k(0,0,2) - k(0,0,0) )
      Dyv = Dxu + 3.0_rk * omega * 0.5_rk * inv_rho * ( k(2,0,0) - k(0,2,0) )
      Dzw = Dxu + 3.0_rk * omega * 0.5_rk * inv_rho * ( k(2,0,0) - k(0,0,2) )

      ! eq 61 - 63
      ! first calculate the rights parts
      AA = ( 1.0_rk - omega ) * ( k(2,0,0) - k(0,2,0) ) &
        & - 3.0_rk * rho * ( 1.0_rk - omega*0.5_rk ) * ( ux*ux*Dxu - uy*uy*Dyv )
      BB = ( 1.0_rk - omega ) * ( k(2,0,0) - k(0,0,2) ) &
        & - 3.0_rk * rho * ( 1.0_rk - omega*0.5_rk ) * ( ux*ux*Dxu - uz*uz*Dzw )
      CC = k(0,0,0) - 1.5_rk * rho * ( ux*ux*Dxu + uy*uy*Dyv + uz*uz*Dzw )

      ! then solve the three moments
      c(2,0,0) = (AA+BB+CC) * div1_3
      c(0,2,0) = (BB+CC) * div1_3 - AA * div2_3
      c(0,0,2) = (AA+CC) * div1_3 - BB * div2_3

      ! for 3rd order or higher, as relaxation == 1, they become simply 0
      c(2,1,0) = 0.0_rk ! 3rd order
      c(2,0,1) = 0.0_rk
      c(1,2,0) = 0.0_rk
      c(0,2,1) = 0.0_rk
      c(1,0,2) = 0.0_rk
      c(0,1,2) = 0.0_rk
      c(1,1,1) = 0.0_rk

      c(2,1,1) = 0.0_rk ! 4th order
      c(1,2,1) = 0.0_rk
      c(1,1,2) = 0.0_rk
      c(2,2,0) = 0.0_rk
      c(2,0,2) = 0.0_rk
      c(0,2,2) = 0.0_rk

      c(1,2,2) = 0.0_rk ! 5th order
      c(2,1,2) = 0.0_rk
      c(2,2,1) = 0.0_rk
      c(2,2,2) = 0.0_rk ! 6th order
      ! Collision --------------------------------------------------------------

      ! Back to central moment -------------------------------------------------
      k = c
      ! k(1,0,0) = -k(1,0,0)
      ! k(0,1,0) = -k(0,1,0)
      ! k(0,0,1) = -k(0,0,1)
      ! get 4th and 6th order from 2nd order
      k(2,1,1) = ( k(2,0,0)*k(0,1,1) + 2.0_rk*k(1,1,0)*k(1,0,1) ) * inv_rho
      k(1,2,1) = ( k(0,2,0)*k(1,0,1) + 2.0_rk*k(1,1,0)*k(0,1,1) ) * inv_rho
      k(1,1,2) = ( k(0,0,2)*k(1,1,0) + 2.0_rk*k(1,0,1)*k(0,1,1) ) * inv_rho
      k(2,2,0) = ( k(2,0,0)*k(0,2,0) + 2.0_rk*k(1,1,0)*k(1,1,0) ) * inv_rho
      k(2,0,2) = ( k(2,0,0)*k(0,0,2) + 2.0_rk*k(1,0,1)*k(1,0,1) ) * inv_rho
      k(0,2,2) = ( k(0,2,0)*k(0,0,2) + 2.0_rk*k(0,1,1)*k(0,1,1) ) * inv_rho
      k(2,2,2) = - (   16.0_rk*k(1,1,0)*k(1,0,1)*k(0,1,1) &
        &          + 4.0_rk*(k(1,0,1)**2*k(0,2,0) + k(0,1,1)**2*k(2,0,0) + k(1,1,0)**2*k(0,0,2) ) &
        &          + 2.0_rk*k(2,0,0)*k(0,2,0)*k(0,0,2) ) * inv_rho * inv_rho
      ! Back to central moment -------------------------------------------------

      ! Back to PDF ------------------------------------------------------------
      f = cm_to_pdf( k, ux, uy, uz )
      ! Back to PDF ------------------------------------------------------------

      ! write to state array
      outState( ?SAVE?( q000,1,iElem,QQ,QQ,nElems,neigh )) = f( 0, 0, 0)
      outState( ?SAVE?( qN00,1,iElem,QQ,QQ,nElems,neigh )) = f(-1, 0, 0)
      outState( ?SAVE?( q100,1,iElem,QQ,QQ,nElems,neigh )) = f( 1, 0, 0)
      outState( ?SAVE?( q0N0,1,iElem,QQ,QQ,nElems,neigh )) = f( 0,-1, 0)
      outState( ?SAVE?( q010,1,iElem,QQ,QQ,nElems,neigh )) = f( 0, 1, 0)
      outState( ?SAVE?( q00N,1,iElem,QQ,QQ,nElems,neigh )) = f( 0, 0,-1)
      outState( ?SAVE?( q001,1,iElem,QQ,QQ,nElems,neigh )) = f( 0, 0, 1)
      outState( ?SAVE?( q0NN,1,iElem,QQ,QQ,nElems,neigh )) = f( 0,-1,-1)
      outState( ?SAVE?( q01N,1,iElem,QQ,QQ,nElems,neigh )) = f( 0, 1,-1)
      outState( ?SAVE?( q0N1,1,iElem,QQ,QQ,nElems,neigh )) = f( 0,-1, 1)
      outState( ?SAVE?( q011,1,iElem,QQ,QQ,nElems,neigh )) = f( 0, 1, 1)
      outState( ?SAVE?( qN0N,1,iElem,QQ,QQ,nElems,neigh )) = f(-1, 0,-1)
      outState( ?SAVE?( q10N,1,iElem,QQ,QQ,nElems,neigh )) = f( 1, 0,-1)
      outState( ?SAVE?( qN01,1,iElem,QQ,QQ,nElems,neigh )) = f(-1, 0, 1)
      outState( ?SAVE?( q101,1,iElem,QQ,QQ,nElems,neigh )) = f( 1, 0, 1)
      outState( ?SAVE?( qNN0,1,iElem,QQ,QQ,nElems,neigh )) = f(-1,-1, 0)
      outState( ?SAVE?( q1N0,1,iElem,QQ,QQ,nElems,neigh )) = f( 1,-1, 0)
      outState( ?SAVE?( qN10,1,iElem,QQ,QQ,nElems,neigh )) = f(-1, 1, 0)
      outState( ?SAVE?( q110,1,iElem,QQ,QQ,nElems,neigh )) = f( 1, 1, 0)
      outState( ?SAVE?( q1NN,1,iElem,QQ,QQ,nElems,neigh )) = f( 1,-1,-1)
      outState( ?SAVE?( q11N,1,iElem,QQ,QQ,nElems,neigh )) = f( 1, 1,-1)
      outState( ?SAVE?( q1N1,1,iElem,QQ,QQ,nElems,neigh )) = f( 1,-1, 1)
      outState( ?SAVE?( q111,1,iElem,QQ,QQ,nElems,neigh )) = f( 1, 1, 1)
      outState( ?SAVE?( qNNN,1,iElem,QQ,QQ,nElems,neigh )) = f(-1,-1,-1)
      outState( ?SAVE?( qN1N,1,iElem,QQ,QQ,nElems,neigh )) = f(-1, 1,-1)
      outState( ?SAVE?( qNN1,1,iElem,QQ,QQ,nElems,neigh )) = f(-1,-1, 1)
      outState( ?SAVE?( qN11,1,iElem,QQ,QQ,nElems,neigh )) = f(-1, 1, 1)

    end do nodeloop

  end subroutine cumulant_d3q27
! ****************************************************************************** !

! ****************************************************************************** !
  pure function cm_to_pdf( k, ux, uy, uz ) result ( f )
    real(kind=rk), intent(in) :: k(0:2,0:2,0:2)
    real(kind=rk), intent(in) :: ux, uy, uz

    real(kind=rk) :: f(-1:1,-1:1,-1:1)
    real(kind=rk) :: g(-1:1, 0:2, 0:2)
    real(kind=rk) :: h(-1:1,-1:1, 0:2)
    integer :: ii, jj
    real(kind=rk) :: uu, mm, nn, pp, qq, rr, u2

    uu = ux*ux
    mm = 1.0_rk - uu
    nn = uu + ux
    pp = uu - ux
    u2 = 2.0_rk * ux
    qq = u2 - 1.0_rk
    rr = u2 + 1.0_rk
    do jj = 0,2
      do ii = 0,2
        g( 0,ii,jj) =   k(0,ii,jj)*mm - k(1,ii,jj)*u2 - k(2,ii,jj)
        g(-1,ii,jj) = ( k(0,ii,jj)*pp + k(1,ii,jj)*qq + k(2,ii,jj) ) * div1_2
        g( 1,ii,jj) = ( k(0,ii,jj)*nn + k(1,ii,jj)*rr + k(2,ii,jj) ) * div1_2
      end do
    end do

    uu = uy*uy
    mm = 1.0_rk - uu
    nn = uu + uy
    pp = uu - uy
    u2 = 2.0_rk * uy
    qq = u2 - 1.0_rk
    rr = u2 + 1.0_rk
    do jj = 0,2
      do ii = -1,1
        h(ii, 0,jj) =   g(ii,0,jj)*mm - g(ii,1,jj)*u2 - g(ii,2,jj)
        h(ii,-1,jj) = ( g(ii,0,jj)*pp + g(ii,1,jj)*qq + g(ii,2,jj) ) * div1_2
        h(ii, 1,jj) = ( g(ii,0,jj)*nn + g(ii,1,jj)*rr + g(ii,2,jj) ) * div1_2
      end do
    end do

    uu = uz*uz
    mm = 1.0_rk - uu
    nn = uu + uz
    pp = uu - uz
    u2 = 2.0_rk * uz
    qq = u2 - 1.0_rk
    rr = u2 + 1.0_rk
    do jj = -1,1
      do ii = -1,1
        f(ii, jj, 0) =   h(ii,jj,0)*mm - h(ii,jj,1)*u2 - h(ii,jj,2)
        f(ii, jj,-1) = ( h(ii,jj,0)*pp + h(ii,jj,1)*qq + h(ii,jj,2) ) * div1_2
        f(ii, jj, 1) = ( h(ii,jj,0)*nn + h(ii,jj,1)*rr + h(ii,jj,2) ) * div1_2
      end do
    end do
  end function
! ****************************************************************************** !

! ****************************************************************************** !
  !> No comment yet!
  !!
  !! TODO Add coment!
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine cascaded_d3q27( fieldProp, inState, outState, auxField, &
    &                        neigh, nElems, nSolve, level, layout,   &
    &                        params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer :: iElem, ii, jj, kk, nScalars
    real(kind=rk) :: ux, uy, uz, rho, omega, inv_rho
    real(kind=rk) :: dxu, dyv, dzw, AA, BB, CC
    real(kind=rk) ::  f(-1:1,-1:1,-1:1)
    real(kind=rk) :: ff(-1:1,-1:1,-1:1)
    ! k = central moment, c = cumulant
    real(kind=rk) :: k(0:2,0:2,0:2), c(0:2,0:2,0:2)
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

    nScalars = varSys%nScalars

    nodeloop: do iElem = 1, nSolve

      f(-1, 0, 0) = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0,-1, 0) = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 0,-1) = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0, 0) = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1, 0) = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 0, 1) = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0,-1,-1) = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0,-1, 1) = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1,-1) = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1, 1) = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 0,-1) = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0,-1) = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 0, 1) = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0, 1) = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1,-1, 0) = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1, 0) = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1, 0) = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1, 0) = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1,-1,-1) = inState(?FETCH?( qNNN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1,-1, 1) = inState(?FETCH?( qNN1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1,-1) = inState(?FETCH?( qN1N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1, 1) = inState(?FETCH?( qN11, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1,-1) = inState(?FETCH?( q1NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1, 1) = inState(?FETCH?( q1N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1,-1) = inState(?FETCH?( q11N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1, 1) = inState(?FETCH?( q111, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 0, 0) = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))

      ! calculate rho and velocity ----------------------------------------------
      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      inv_rho = 1.0_rk / rho
      ! local x-, y- and z-velocity
      ux = auxField(elemOff + vel_pos(1))
      uy = auxField(elemOff + vel_pos(2))
      uz = auxField(elemOff + vel_pos(3))
      ! calculate rho and velocity ----------------------------------------------

      ! get all central moments
      ! eq 43 - 45
      do kk = 0,2
        do jj = 0,2
          do ii = 0,2
            k( ii, jj, kk ) = central_moment_split( f, ii, jj, kk, ux, uy, uz )
          end do
        end do
      end do

      omega = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)

      ! Collision --------------------------------------------------------------
      c(0,0,0) = k(0,0,0)
      c(1,0,0) = k(1,0,0)
      c(0,1,0) = k(0,1,0)
      c(0,0,1) = k(0,0,1)
      c(1,1,0) = ( 1.0_rk - omega ) * k(1,1,0)
      c(1,0,1) = ( 1.0_rk - omega ) * k(1,0,1)
      c(0,1,1) = ( 1.0_rk - omega ) * k(0,1,1)

      ! eq D.1 - D.3
      Dxu = -omega * 0.5_rk * inv_rho * ( 2.0_rk * k(2,0,0) - k(0,2,0) - k(0,0,2))&
        &   - 0.5_rk * inv_rho * ( k(2,0,0) + k(0,2,0) + k(0,0,2) - rho )
      Dyv = Dxu + 3.0_rk * omega * 0.5_rk * inv_rho * ( k(2,0,0) - k(0,2,0) )
      Dzw = Dxu + 3.0_rk * omega * 0.5_rk * inv_rho * ( k(2,0,0) - k(0,0,2) )

      ! eq D.4 - D.6
      ! first calculate the rights parts
      AA = ( 1.0_rk - omega ) * ( k(2,0,0) - k(0,2,0) ) &
        & - 3.0_rk * rho * ( 1.0_rk - omega*0.5_rk ) * ( ux*ux*Dxu - uy*uy*Dyv )
      BB = ( 1.0_rk - omega ) * ( k(2,0,0) - k(0,0,2) ) &
        & - 3.0_rk * rho * ( 1.0_rk - omega*0.5_rk ) * ( ux*ux*Dxu - uz*uz*Dzw )
      CC = rho - 1.5_rk * rho * ( ux*ux*Dxu + uy*uy*Dyv + uz*uz*Dzw )

      ! then solve the three moments
      c(2,0,0) = (AA+BB+CC) * div1_3
      c(0,2,0) = (BB+CC) * div1_3 - AA * div2_3
      c(0,0,2) = (AA+CC) * div1_3 - BB * div2_3

      ! for 3rd order or higher, set to 0
      c(2,1,0) = 0.0_rk ! 3rd order
      c(2,0,1) = 0.0_rk
      c(1,2,0) = 0.0_rk
      c(0,2,1) = 0.0_rk
      c(1,0,2) = 0.0_rk
      c(0,1,2) = 0.0_rk
      c(1,1,1) = 0.0_rk

      c(2,1,1) = 0.0_rk ! 4th order
      c(1,2,1) = 0.0_rk
      c(1,1,2) = 0.0_rk
      c(2,2,0) = rho * div1_3 * div1_3
      c(2,0,2) = rho * div1_3 * div1_3
      c(0,2,2) = rho * div1_3 * div1_3

      c(1,2,2) = 0.0_rk ! 5th order
      c(2,1,2) = 0.0_rk
      c(2,2,1) = 0.0_rk
      c(2,2,2) = rho * div1_3 * div1_3 * div1_3 ! 6th order
      ! Collision --------------------------------------------------------------

      ! Back to PDF ------------------------------------------------------------
      ff = cm_to_pdf( c, ux, uy, uz )
      ! Back to PDF ------------------------------------------------------------

      ! write to state array
outState( ?SAVE?( q000,1,iElem,QQ,QQ,nElems,neigh )) = ff( 0,  0,  0)
outState( ?SAVE?( qN00,1,iElem,QQ,QQ,nElems,neigh )) = ff(-1,  0,  0)
outState( ?SAVE?( q100,1,iElem,QQ,QQ,nElems,neigh )) = ff( 1,  0,  0)
outState( ?SAVE?( q0N0,1,iElem,QQ,QQ,nElems,neigh )) = ff( 0, -1,  0)
outState( ?SAVE?( q010,1,iElem,QQ,QQ,nElems,neigh )) = ff( 0,  1,  0)
outState( ?SAVE?( q00N,1,iElem,QQ,QQ,nElems,neigh )) = ff( 0,  0, -1)
outState( ?SAVE?( q001,1,iElem,QQ,QQ,nElems,neigh )) = ff( 0,  0,  1)
outState( ?SAVE?( q0NN,1,iElem,QQ,QQ,nElems,neigh )) = ff( 0, -1, -1)
outState( ?SAVE?( q01N,1,iElem,QQ,QQ,nElems,neigh )) = ff( 0,  1, -1)
outState( ?SAVE?( q0N1,1,iElem,QQ,QQ,nElems,neigh )) = ff( 0, -1,  1)
outState( ?SAVE?( q011,1,iElem,QQ,QQ,nElems,neigh )) = ff( 0,  1,  1)
outState( ?SAVE?( qN0N,1,iElem,QQ,QQ,nElems,neigh )) = ff(-1,  0, -1)
outState( ?SAVE?( q10N,1,iElem,QQ,QQ,nElems,neigh )) = ff( 1,  0, -1)
outState( ?SAVE?( qN01,1,iElem,QQ,QQ,nElems,neigh )) = ff(-1,  0,  1)
outState( ?SAVE?( q101,1,iElem,QQ,QQ,nElems,neigh )) = ff( 1,  0,  1)
outState( ?SAVE?( qNN0,1,iElem,QQ,QQ,nElems,neigh )) = ff(-1, -1,  0)
outState( ?SAVE?( q1N0,1,iElem,QQ,QQ,nElems,neigh )) = ff( 1, -1,  0)
outState( ?SAVE?( qN10,1,iElem,QQ,QQ,nElems,neigh )) = ff(-1,  1,  0)
outState( ?SAVE?( q110,1,iElem,QQ,QQ,nElems,neigh )) = ff( 1,  1,  0)
outState( ?SAVE?( q1NN,1,iElem,QQ,QQ,nElems,neigh )) = ff( 1, -1, -1)
outState( ?SAVE?( q11N,1,iElem,QQ,QQ,nElems,neigh )) = ff( 1,  1, -1)
outState( ?SAVE?( q1N1,1,iElem,QQ,QQ,nElems,neigh )) = ff( 1, -1,  1)
outState( ?SAVE?( q111,1,iElem,QQ,QQ,nElems,neigh )) = ff( 1,  1,  1)
outState( ?SAVE?( qNNN,1,iElem,QQ,QQ,nElems,neigh )) = ff(-1, -1, -1)
outState( ?SAVE?( qN1N,1,iElem,QQ,QQ,nElems,neigh )) = ff(-1,  1, -1)
outState( ?SAVE?( qNN1,1,iElem,QQ,QQ,nElems,neigh )) = ff(-1, -1,  1)
outState( ?SAVE?( qN11,1,iElem,QQ,QQ,nElems,neigh )) = ff(-1,  1,  1)

    end do nodeloop

  end subroutine cascaded_d3q27
! ****************************************************************************** !

! ****************************************************************************** !
  !> No comment yet!
  !!
  !! TODO add comment
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  ! via [[mus_scheme_type:compute]] function pointer.
  subroutine trt_advRel_d3q27( fieldProp, inState, outState, auxField, &
    &                          neigh, nElems, nSolve, level, layout,   &
    &                          params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer :: iElem
    integer :: nScalars
    real(kind=rk) :: f(-1:1,-1:1,-1:1)
    real(kind=rk) :: u, v, w, u2, v2, w2
    real(kind=rk) :: rho, inv_rho!, rho_omg
    real(kind=rk) :: wP, wN, p_part, n_part
    real(kind=rk) :: X0, X1, XN
    real(kind=rk) :: Y0, Y1, YN
    real(kind=rk) :: Z0, Z1, ZN
    integer :: dens_pos, vel_pos(3), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:3)

    nScalars = varSys%nScalars

    !NEC$ ivdep
?? IF (SOA) THEN
?? copy :: dir_vector
?? ELSE
?? copy :: dir_novec
?? END IF
    nodeloop: do iElem = 1, nSolve

      f(-1, 0, 0) = inState(?FETCH?( qN00, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0,-1, 0) = inState(?FETCH?( q0N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 0,-1) = inState(?FETCH?( q00N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0, 0) = inState(?FETCH?( q100, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1, 0) = inState(?FETCH?( q010, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 0, 1) = inState(?FETCH?( q001, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0,-1,-1) = inState(?FETCH?( q0NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0,-1, 1) = inState(?FETCH?( q0N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1,-1) = inState(?FETCH?( q01N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 1, 1) = inState(?FETCH?( q011, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 0,-1) = inState(?FETCH?( qN0N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0,-1) = inState(?FETCH?( q10N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 0, 1) = inState(?FETCH?( qN01, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 0, 1) = inState(?FETCH?( q101, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1,-1, 0) = inState(?FETCH?( qNN0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1, 0) = inState(?FETCH?( qN10, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1, 0) = inState(?FETCH?( q1N0, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1, 0) = inState(?FETCH?( q110, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1,-1,-1) = inState(?FETCH?( qNNN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1,-1, 1) = inState(?FETCH?( qNN1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1,-1) = inState(?FETCH?( qN1N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f(-1, 1, 1) = inState(?FETCH?( qN11, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1,-1) = inState(?FETCH?( q1NN, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1,-1, 1) = inState(?FETCH?( q1N1, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1,-1) = inState(?FETCH?( q11N, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 1, 1, 1) = inState(?FETCH?( q111, 1, iElem, QQ, nScalars, nElems,neigh ))
      f( 0, 0, 0) = inState(?FETCH?( q000, 1, iElem, QQ, nScalars, nElems,neigh ))


      ! calculate rho and velocity ----------------------------------------------
      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      inv_rho = 1.0_rk / rho
      ! local x-, y- and z-velocity
      u = auxField(elemOff + vel_pos(1))
      v = auxField(elemOff + vel_pos(2))
      w = auxField(elemOff + vel_pos(3))
      ! rho_omg = rho * omega
      ! calculate rho and velocity ----------------------------------------------

      ! u v square
      u2 = u*u
      v2 = v*v
      w2 = w*w

      ! X Y Z components of eq
      ! by equation A.19 - A.21
      X0 = -div2_3 + u2
      X1 = - ( X0 + 1.0_rk + u ) * 0.5_rk
      XN = X1 + u

      Y0 = -div2_3 + v2
      Y1 = - ( Y0 + 1.0_rk + v ) * 0.5_rk
      YN = Y1 + v

      Z0 = -div2_3 + w2
      Z1 = - ( Z0 + 1.0_rk + w ) * 0.5_rk
      ZN = Z1 + w

      ! X0 = -rho_omg * X0
      ! X1 = -rho_omg * X1
      ! XN = -rho_omg * XN

      wP = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
      wN = 1.0_rk / ( 0.5_rk + fieldProp(1)%fluid%lambda &
        &                   / ( 1.0_rk/wP - 0.5_rk )     )

! fEq = -rho * X * Y * Z
! fEq000 = X0 * Y0 * Z0
outState( ?SAVE?( q000,1,iElem,QQ,QQ,nElems,neigh )) = (1.0_rk - wP)*f(0,0,0) &
  &                                                  - rho*wP*X0*Y0*Z0

p_part = wP * ( ( f(1,0,0) + f(-1,0,0) ) - (-rho*X1*Y0*Z0 - rho*XN*Y0*Z0) ) * div1_2
n_part = wN * ( ( f(1,0,0) - f(-1,0,0) ) - (-rho*X1*Y0*Z0 + rho*XN*Y0*Z0) ) * div1_2
outState( ?SAVE?( q100,1,iElem,QQ,QQ,nElems,neigh )) = f( 1,0,0) - p_part - n_part
outState( ?SAVE?( qN00,1,iElem,QQ,QQ,nElems,neigh )) = f(-1,0,0) - p_part + n_part

p_part = wP * ( ( f(0,1,0) + f(0,-1,0) ) - (-rho*X0*Y1*Z0 - rho*X0*YN*Z0) ) * div1_2
n_part = wN * ( ( f(0,1,0) - f(0,-1,0) ) - (-rho*X0*Y1*Z0 + rho*X0*YN*Z0) ) * div1_2
outState( ?SAVE?( q010,1,iElem,QQ,QQ,nElems,neigh )) = f(0, 1,0) - p_part - n_part
outState( ?SAVE?( q0N0,1,iElem,QQ,QQ,nElems,neigh )) = f(0,-1,0) - p_part + n_part

p_part = wP * ( ( f(0,0,1) + f(0,0,-1) ) - (-rho*X0*Y0*Z1 - rho*X0*Y0*ZN) ) * div1_2
n_part = wN * ( ( f(0,0,1) - f(0,0,-1) ) - (-rho*X0*Y0*Z1 + rho*X0*Y0*ZN) ) * div1_2
outState( ?SAVE?( q001,1,iElem,QQ,QQ,nElems,neigh )) = f(0,0, 1) - p_part - n_part
outState( ?SAVE?( q00N,1,iElem,QQ,QQ,nElems,neigh )) = f(0,0,-1) - p_part + n_part

p_part = wP * ( ( f(0,1,1) + f(0,-1,-1) ) - (-rho*X0*Y1*Z1 - rho*X0*YN*ZN) ) * div1_2
n_part = wN * ( ( f(0,1,1) - f(0,-1,-1) ) - (-rho*X0*Y1*Z1 + rho*X0*YN*ZN) ) * div1_2
outState( ?SAVE?( q011,1,iElem,QQ,QQ,nElems,neigh )) = f(0, 1, 1) - p_part - n_part
outState( ?SAVE?( q0NN,1,iElem,QQ,QQ,nElems,neigh )) = f(0,-1,-1) - p_part + n_part

p_part = wP * ( ( f(0,1,-1) + f(0,-1,1) ) - (-rho*X0*Y1*ZN - rho*X0*YN*Z1) ) * div1_2
n_part = wN * ( ( f(0,1,-1) - f(0,-1,1) ) - (-rho*X0*Y1*ZN + rho*X0*YN*Z1) ) * div1_2
outState( ?SAVE?( q01N,1,iElem,QQ,QQ,nElems,neigh )) = f(0, 1,-1) - p_part - n_part
outState( ?SAVE?( q0N1,1,iElem,QQ,QQ,nElems,neigh )) = f(0,-1, 1) - p_part + n_part

p_part = wP * ( ( f(1,0,1) + f(-1,0,-1) ) - (-rho*X1*Y0*Z1 - rho*XN*Y0*ZN) ) * div1_2
n_part = wN * ( ( f(1,0,1) - f(-1,0,-1) ) - (-rho*X1*Y0*Z1 + rho*XN*Y0*ZN) ) * div1_2
outState( ?SAVE?( q101,1,iElem,QQ,QQ,nElems,neigh )) = f( 1,0, 1) - p_part - n_part
outState( ?SAVE?( qN0N,1,iElem,QQ,QQ,nElems,neigh )) = f(-1,0,-1) - p_part + n_part

p_part = wP * ( ( f(1,0,-1) + f(-1,0,1) ) - (-rho*X1*Y0*ZN - rho*XN*Y0*Z1) ) * div1_2
n_part = wN * ( ( f(1,0,-1) - f(-1,0,1) ) - (-rho*X1*Y0*ZN + rho*XN*Y0*Z1) ) * div1_2
outState( ?SAVE?( q10N,1,iElem,QQ,QQ,nElems,neigh )) = f( 1,0,-1) - p_part - n_part
outState( ?SAVE?( qN01,1,iElem,QQ,QQ,nElems,neigh )) = f(-1,0, 1) - p_part + n_part

p_part = wP * ( ( f(1,1,0) + f(-1,-1,0) ) - (-rho*X1*Y1*Z0 - rho*XN*YN*Z0) ) * div1_2
n_part = wN * ( ( f(1,1,0) - f(-1,-1,0) ) - (-rho*X1*Y1*Z0 + rho*XN*YN*Z0) ) * div1_2
outState( ?SAVE?( q110,1,iElem,QQ,QQ,nElems,neigh )) = f( 1, 1,0) - p_part - n_part
outState( ?SAVE?( qNN0,1,iElem,QQ,QQ,nElems,neigh )) = f(-1,-1,0) - p_part + n_part

p_part = wP * ( ( f(1,-1,0) + f(-1,1,0) ) - (-rho*X1*YN*Z0 - rho*XN*Y1*Z0) ) * div1_2
n_part = wN * ( ( f(1,-1,0) - f(-1,1,0) ) - (-rho*X1*YN*Z0 + rho*XN*Y1*Z0) ) * div1_2
outState( ?SAVE?( q1N0,1,iElem,QQ,QQ,nElems,neigh )) = f( 1,-1,0) - p_part - n_part
outState( ?SAVE?( qN10,1,iElem,QQ,QQ,nElems,neigh )) = f(-1, 1,0) - p_part + n_part

p_part = wP * ( ( f(1,-1,-1) + f(-1,1,1) ) - (-rho*X1*YN*ZN - rho*XN*Y1*Z1) ) * div1_2
n_part = wN * ( ( f(1,-1,-1) - f(-1,1,1) ) - (-rho*X1*YN*ZN + rho*XN*Y1*Z1) ) * div1_2
outState( ?SAVE?( q1NN,1,iElem,QQ,QQ,nElems,neigh )) = f( 1,-1,-1) - p_part - n_part
outState( ?SAVE?( qN11,1,iElem,QQ,QQ,nElems,neigh )) = f(-1, 1, 1) - p_part + n_part

p_part = wP * ( ( f(1,1,-1) + f(-1,-1,1) ) - (-rho*X1*Y1*ZN - rho*XN*YN*Z1) ) * div1_2
n_part = wN * ( ( f(1,1,-1) - f(-1,-1,1) ) - (-rho*X1*Y1*ZN + rho*XN*YN*Z1) ) * div1_2
outState( ?SAVE?( q11N,1,iElem,QQ,QQ,nElems,neigh )) = f( 1, 1,-1) - p_part - n_part
outState( ?SAVE?( qNN1,1,iElem,QQ,QQ,nElems,neigh )) = f(-1,-1, 1) - p_part + n_part

p_part = wP * ( ( f(1,-1,1) + f(-1,1,-1) ) - (-rho*X1*YN*Z1 - rho*XN*Y1*ZN) ) * div1_2
n_part = wN * ( ( f(1,-1,1) - f(-1,1,-1) ) - (-rho*X1*YN*Z1 + rho*XN*Y1*ZN) ) * div1_2
outState( ?SAVE?( q1N1,1,iElem,QQ,QQ,nElems,neigh )) = f( 1,-1, 1) - p_part - n_part
outState( ?SAVE?( qN1N,1,iElem,QQ,QQ,nElems,neigh )) = f(-1, 1,-1) - p_part + n_part

p_part = wP * ( ( f(1,1,1) + f(-1,-1,-1) ) - (-rho*X1*Y1*Z1 - rho*XN*YN*ZN) ) * div1_2
n_part = wN * ( ( f(1,1,1) - f(-1,-1,-1) ) - (-rho*X1*Y1*Z1 + rho*XN*YN*ZN) ) * div1_2
outState( ?SAVE?( q111,1,iElem,QQ,QQ,nElems,neigh )) = f( 1, 1, 1) - p_part - n_part
outState( ?SAVE?( qNNN,1,iElem,QQ,QQ,nElems,neigh )) = f(-1,-1,-1) - p_part + n_part

    end do nodeloop

  end subroutine trt_advRel_d3q27
! ****************************************************************************** !

end module mus_d3q27_module
! ****************************************************************************** !
