! Copyright (c) 2011-2013 Manuel Hasert <m.hasert@grs-sim.de>
! Copyright (c) 2011-2013 Simon Zimny <s.zimny@grs-sim.de>
! Copyright (c) 2011, 2017, 2019 Harald Klimach <harald.klimach@uni-siegen.de>
! Copyright (c) 2011-2016, 2018-2020 Kannan Masilamani <kannan.masilamani@uni-siegen.de>
! Copyright (c) 2011 Jan Hueckelheim <j.hueckelheim@grs-sim.de>
! Copyright (c) 2012, 2014-2016 Jiaxing Qi <jiaxing.qi@uni-siegen.de>
! Copyright (c) 2012 Kartik Jain <kartik.jain@uni-siegen.de>
! Copyright (c) 2012 Sathish Krishnan P S <s.krishnan@grs-sim.de>
! Copyright (c) 2016 Tobias Schneider <tobias1.schneider@student.uni-siegen.de>
! Copyright (c) 2020 Peter Vitt <peter.vitt2@uni-siegen.de>
!
! Redistribution and use in source and binary forms, with or without
! modification, are permitted provided that the following conditions are met:
!
! 1. Redistributions of source code must retain the above copyright notice,
! this list of conditions and the following disclaimer.
!
! 2. Redistributions in binary form must reproduce the above copyright notice,
! this list of conditions and the following disclaimer in the documentation
! and/or other materials provided with the distribution.
!
! THIS SOFTWARE IS PROVIDED BY THE UNIVERSITY OF SIEGEN “AS IS” AND ANY EXPRESS
! OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
! OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
! IN NO EVENT SHALL UNIVERSITY OF SIEGEN OR CONTRIBUTORS BE LIABLE FOR ANY
! DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
! (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
! LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
! ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
! (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
! SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
! ****************************************************************************** !
!> author: Jiaxing Qi
!! Routines and parameter definitions for the standard D2Q9 model
!!
?? include 'header/lbm_macros.inc'
?? include 'header/lbm_d2q9Macros.inc'
module mus_d2q9_module
  use iso_c_binding, only: c_f_pointer

  ! include treelm modules
  use env_module,            only: rk
  use tem_varSys_module,     only: tem_varSys_type, tem_varSys_op_type
  use tem_param_module,      only: cs2inv, t2cs2inv, t2cs4inv, div1_12, &
    &                              div1_4, div1_9, div1_6, div1_18, div1_36, &
    &                              div1_2, div4_9, div2_9, div1_3, div2_3, rho0
  use tem_aux_module,        only: tem_abort

  ! include musubi modules
  use mus_field_prop_module,    only: mus_field_prop_type
  use mus_scheme_layout_module, only: mus_scheme_layout_type
  use mus_derVarPos_module,     only: mus_derVarPos_type
  use mus_param_module,         only: mus_param_type
  use mus_varSys_module,        only: mus_varSys_data_type
  use mus_mrtRelaxation_module, only: mrt_d2q9

  implicit none

  private

  public :: bgk_improved_advRel_d2q9
  public :: bgk_advRel_d2q9
  public :: mrt_advRel_d2q9
  public :: mrt_advRel_d2q9_incomp
  public :: bgk_advRel_d2q9_incomp

  ! =============================================================================
  ! D2Q9 flow model
  ! =============================================================================
  !> Definition of the discrete velocity set
  integer, parameter :: QQ   = 9   !< number of pdf directions

  integer, parameter :: q__W = 1     !< west             x-
  integer, parameter :: q__S = 2     !< south            y-
  integer, parameter :: q__E = 3     !< east             x+
  integer, parameter :: q__N = 4     !< north            y+
  integer, parameter :: q_SW = 5     !<                  y-,x-
  integer, parameter :: q_NW = 6     !<                  y+,x-
  integer, parameter :: q_SE = 7     !<                  y-,x+
  integer, parameter :: q_NE = 8     !<                  y+,x+
  integer, parameter :: q__0 = 9     !< rest density is last

  ! General direction index
  integer,parameter :: qN0 = 1  !< west             x-
  integer,parameter :: q0N = 2  !< south            y-
  integer,parameter :: q10 = 3  !< east             x+
  integer,parameter :: q01 = 4  !< north            y+
  integer,parameter :: qNN = 5  !<                  y-,x-
  integer,parameter :: qN1 = 6  !<                  y+,x-
  integer,parameter :: q1N = 7  !<                  y-,x+
  integer,parameter :: q11 = 8  !<                  y+,x+
  integer,parameter :: q00 = 9  !< rest

  ! D2Q9 MRT pdf -> moment transformation matrix
  ! How to use:
  ! do iDir = 1, QQ
  !   moment(iDir) = sum( PDF(:) * MMtrD2Q9(:,iDir) )
  ! end do
  !     W          S          E          N         SW         NW         SE         NE          0
  real(kind=rk), dimension(9,9),parameter  :: MMtrD2Q9 = &
  reshape((/ &
    1._rk,     1._rk,     1._rk,     1._rk,     1._rk,     1._rk,     1._rk,     1._rk,     1._rk,& !
   -1._rk,    -1._rk,    -1._rk,    -1._rk,     2._rk,     2._rk,     2._rk,     2._rk,    -4._rk,& !mom2
   -2._rk,    -2._rk,    -2._rk,    -2._rk,     1._rk,     1._rk,     1._rk,     1._rk,     4._rk,& !mom3
   -1._rk,     0._rk,     1._rk,     0._rk,    -1._rk,    -1._rk,     1._rk,     1._rk,     0._rk,&
    2._rk,     0._rk,    -2._rk,     0._rk,    -1._rk,    -1._rk,     1._rk,     1._rk,     0._rk,& !mom5
    0._rk,    -1._rk,     0._rk,     1._rk,    -1._rk,     1._rk,    -1._rk,     1._rk,     0._rk,&
    0._rk,     2._rk,     0._rk,    -2._rk,    -1._rk,     1._rk,    -1._rk,     1._rk,     0._rk,& !mom7
    1._rk,    -1._rk,     1._rk,    -1._rk,     0._rk,     0._rk,     0._rk,     0._rk,     0._rk,& !mom8
    0._rk,     0._rk,     0._rk,     0._rk,     1._rk,    -1._rk,    -1._rk,     1._rk,     0._rk & !mom9
   /),(/9,9/))

  ! D2Q9 MRT moment -> pdf transformation matrix
  ! How to use:
  ! do iDir = 1, QQ
  !   pdf(iDir) = sum( moment(:) * MMIvD2Q9(iDir,:) )
  ! end do
  real(kind=rk), dimension(9,9), parameter  :: MMIvD2Q9 =  &
            reshape((/ &
!     1         2         3        4         5         6         7        8        9
 div1_9, -div1_36, -div1_18, -div1_6,   div1_6,    0._rk,    0._rk,  div1_4,   0._rk,  & ! W
 div1_9, -div1_36, -div1_18,   0._rk,    0._rk,  -div1_6,   div1_6, -div1_4,   0._rk,  & ! S
 div1_9, -div1_36, -div1_18,  div1_6,  -div1_6,    0._rk,    0._rk,  div1_4,   0._rk,  & ! E
 div1_9, -div1_36, -div1_18,   0._rk,    0._rk,   div1_6,  -div1_6, -div1_4,   0._rk,  & ! N
 div1_9,  div1_18,  div1_36, -div1_6, -div1_12,  -div1_6, -div1_12,   0._rk,  div1_4,  & ! SW
 div1_9,  div1_18,  div1_36, -div1_6, -div1_12,   div1_6,  div1_12,   0._rk, -div1_4,  & ! NW
 div1_9,  div1_18,  div1_36,  div1_6,  div1_12,  -div1_6, -div1_12,   0._rk, -div1_4,  & ! SE
 div1_9,  div1_18,  div1_36,  div1_6,  div1_12,   div1_6,  div1_12,   0._rk,  div1_4,  & ! NE
 div1_9,  -div1_9,   div1_9,   0._rk,    0._rk,    0._rk,    0._rk,   0._rk,   0._rk   & ! 0
   /),(/9,9/), order=(/2,1/))


contains

! ****************************************************************************** !
  !> Improved BGK model (with Galilean correction term)
  !! taken from Martin Geier cumulent paper 2015
  !! Geier, M., Schönherr, M., Pasquali, A., & Krafczyk, M. (2015).
  !! The cumulant lattice Boltzmann equation in three dimensions : Theory and
  !! validation. Computers and Mathematics with Applications.
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type;compute]] function pointer.
  subroutine bgk_improved_advRel_d2q9( fieldProp, inState, outState, auxField, &
    &                                  neigh, nElems, nSolve, level, layout,   &
    &                                  params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer       :: iElem
    real(kind=rk) :: f(-1:1,-1:1)
    real(kind=rk) :: u,v,u2,v2
    real(kind=rk) :: rho, rho_omg
    real(kind=rk) :: inv_rho
    real(kind=rk) :: omega, cmpl_o, fac
    real(kind=rk) :: m20, m02
    real(kind=rk) :: sumX1, sumXN, sumY1, sumYN
    real(kind=rk) :: Gx, Gy
    real(kind=rk) :: x0, x1, xn, y0, y1, yn
    integer :: dens_pos, vel_pos(2), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:2)


!$omp do schedule(static)
    !NEC$ ivdep
    !DIR$ NOVECTOR
    nodeloop: do iElem = 1, nSolve
      f(-1, 0) = inState( ?FETCH?( qN0, 1, iElem, QQ, QQ, nElems,neigh))
      f( 0,-1) = inState( ?FETCH?( q0N, 1, iElem, QQ, QQ, nElems,neigh))
      f( 1, 0) = inState( ?FETCH?( q10, 1, iElem, QQ, QQ, nElems,neigh))
      f( 0, 1) = inState( ?FETCH?( q01, 1, iElem, QQ, QQ, nElems,neigh))
      f(-1,-1) = inState( ?FETCH?( qNN, 1, iElem, QQ, QQ, nElems,neigh))
      f(-1, 1) = inState( ?FETCH?( qN1, 1, iElem, QQ, QQ, nElems,neigh))
      f( 1,-1) = inState( ?FETCH?( q1N, 1, iElem, QQ, QQ, nElems,neigh))
      f( 1, 1) = inState( ?FETCH?( q11, 1, iElem, QQ, QQ, nElems,neigh))
      f( 0, 0) = inState( ?FETCH?( q00, 1, iElem, QQ, QQ, nElems,neigh))

      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      u = auxField(elemOff + vel_pos(1))
      v = auxField(elemOff + vel_pos(2))

      ! u v square
      u2 = u*u
      v2 = v*v

      sumX1 = sum(f( 1,:))
      sumXN = sum(f(-1,:))

      sumY1 = sum(f(:, 1))
      sumYN = sum(f(:,-1))

      inv_rho = 1.0_rk / rho
      ! second moments, by equation A.7 and A.8
      m20 = ( sumX1 + sumXN ) * inv_rho
      m02 = ( sumY1 + sumYN ) * inv_rho

      ! relaxation rate
      omega  = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
      cmpl_o = 1._rk - omega
      fac    = 4.5_rk - 2.25_rk * omega

      ! calculate Galilean correction term, by equation A.13 and A.14
      Gx = fac * u2 * ( m20 - div1_3 - u2 )
      Gy = fac * v2 * ( m02 - div1_3 - v2 )

      ! X Y components of eq
      ! by equation A.19 - A.21
      X0 = -div2_3 + u2 + Gx
      X1 = - ( X0 + 1.0_rk + u ) * 0.5_rk
      XN = X1 + u

      Y0 = -div2_3 + v2 + Gy
      Y1 = - ( Y0 + 1.0_rk + v ) * 0.5_rk
      YN = Y1 + v

      rho_omg = rho * omega
      X0 = X0 * rho_omg
      X1 = X1 * rho_omg
      XN = XN * rho_omg

      outState( ?SAVE?( q00,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(0,0) + X0*Y0
      outState( ?SAVE?( qN0,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1,0) + XN*Y0
      outState( ?SAVE?( q10,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(1,0) + X1*Y0
      outState( ?SAVE?( q0N,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(0,-1) + X0*YN
      outState( ?SAVE?( q01,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(0,1) + X0*Y1
      outState( ?SAVE?( qNN,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1,-1) + XN*YN
      outState( ?SAVE?( q11,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(1,1) + X1*Y1
      outState( ?SAVE?( q1N,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(1,-1) + X1*YN
      outState( ?SAVE?( qN1,1,iElem,QQ,QQ,nElems,neigh )) = cmpl_o*f(-1,1) + XN*Y1

    end do nodeloop
!$omp end do nowait

  end subroutine bgk_improved_advRel_d2q9
! ****************************************************************************** !

! ****************************************************************************** !
  !> No comment yet!
  !!
  !! TODO add comment
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine bgk_advRel_d2q9( fieldProp, inState, outState, auxField, &
    &                         neigh, nElems, nSolve, level, layout,   &
    &                         params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer :: iElem
    ! temporary distribution variables
?? copy :: var9( f )
?? copy :: var9( fEq )
    real(kind=rk) :: u_x, u_y
    real(kind=rk) :: rho, usq, ucx
    real(kind=rk) :: omega, cmpl_o
    real(kind=rk) :: c0, c1, c2, c3
    integer :: dens_pos, vel_pos(2), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:2)

!$omp do schedule(static)
    !NEC$ ivdep
    !DIR$ NOVECTOR
    nodeloop: do iElem = 1, nSolve

      f1 = inState( ?FETCH?( 1, 1, iElem, 9, 9, nElems,neigh))
      f2 = inState( ?FETCH?( 2, 1, iElem, 9, 9, nElems,neigh))
      f3 = inState( ?FETCH?( 3, 1, iElem, 9, 9, nElems,neigh))
      f4 = inState( ?FETCH?( 4, 1, iElem, 9, 9, nElems,neigh))
      f5 = inState( ?FETCH?( 5, 1, iElem, 9, 9, nElems,neigh))
      f6 = inState( ?FETCH?( 6, 1, iElem, 9, 9, nElems,neigh))
      f7 = inState( ?FETCH?( 7, 1, iElem, 9, 9, nElems,neigh))
      f8 = inState( ?FETCH?( 8, 1, iElem, 9, 9, nElems,neigh))
      f9 = inState( ?FETCH?( 9, 1, iElem, 9, 9, nElems,neigh))

      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      u_x = auxField(elemOff + vel_pos(1))
      u_y = auxField(elemOff + vel_pos(2))

      ! calculate fEq
?? copy :: fEq_d2q9_s(fEq, u_x, u_y, rho, rho, usq, ucx, c0, c1, c2, c3)

      omega  = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
      cmpl_o = 1._rk - omega

      outState( ?SAVE?( 9,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f9 + omega*fEq9
      outState( ?SAVE?( 1,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f1 + omega*fEq1
      outState( ?SAVE?( 2,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f2 + omega*fEq2
      outState( ?SAVE?( 3,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f3 + omega*fEq3
      outState( ?SAVE?( 4,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f4 + omega*fEq4
      outState( ?SAVE?( 5,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f5 + omega*fEq5
      outState( ?SAVE?( 6,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f6 + omega*fEq6
      outState( ?SAVE?( 7,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f7 + omega*fEq7
      outState( ?SAVE?( 8,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f8 + omega*fEq8

    end do nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d2q9
! ****************************************************************************** !

! ****************************************************************************** !
  !> Advection relaxation routine for the D2Q9 MRT model
  !! f( x+c, t+1 ) = f(x,t) - M^(-1)S( m - meq )
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine mrt_advRel_d2q9( fieldProp, inState, outState, auxField, &
    &                         neigh, nElems, nSolve, level, layout,   &
    &                         params, varSys, derVarPos )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer       :: iElem
?? copy :: var9( f )
?? copy :: var9( fNeq )
    real(kind=rk) :: mom2, mom3, mom5, mom7, mom8, mom9
    real(kind=rk) :: meq2, meq3, meq8, meq9
    real(kind=rk) :: mneq2, mneq3, mneq5, mneq7, mneq8, mneq9
    real(kind=rk) :: s_mrt(9)
    real(kind=rk) :: rho ! local density
    real(kind=rk) :: ux, uy, u2, v2
    integer :: dens_pos, vel_pos(2), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:2)

    ! overwrite omegaKine indices 8 and 9 in element loop
    s_mrt = mrt_d2q9(omegaKine=1.0_rk, omegaBulk=1.0_rk,  QQ=9)

!$omp do schedule(static)
    !NEC$ ivdep
    !DIR$ NOVECTOR
    nodeloop: do iElem = 1, nSolve
      ! First load all local values into temp array
      f1 = inState( ?FETCH?( 1, 1, iElem, QQ, QQ, nElems,neigh))
      f2 = inState( ?FETCH?( 2, 1, iElem, QQ, QQ, nElems,neigh))
      f3 = inState( ?FETCH?( 3, 1, iElem, QQ, QQ, nElems,neigh))
      f4 = inState( ?FETCH?( 4, 1, iElem, QQ, QQ, nElems,neigh))
      f5 = inState( ?FETCH?( 5, 1, iElem, QQ, QQ, nElems,neigh))
      f6 = inState( ?FETCH?( 6, 1, iElem, QQ, QQ, nElems,neigh))
      f7 = inState( ?FETCH?( 7, 1, iElem, QQ, QQ, nElems,neigh))
      f8 = inState( ?FETCH?( 8, 1, iElem, QQ, QQ, nElems,neigh))
      f9 = inState( ?FETCH?( 9, 1, iElem, QQ, QQ, nElems,neigh))

      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      ux = auxField(elemOff + vel_pos(1))
      uy = auxField(elemOff + vel_pos(2))

      ! square of velocity
      u2 = ux * ux
      v2 = uy * uy
      ! meq1 =  rho ! rho
      meq2 =  -2._rk*rho + 3._rk*rho*(u2+v2) ! e
      meq3 =  rho - 3._rk*rho*(u2+v2) ! eps
      ! meq4 =  ux ! jx
      ! meq5 = -ux ! qx
      ! meq6 =  uy ! jy
      ! meq7 = -uy ! qy
      meq8 =  rho*(u2 - v2) ! pxx
      meq9 =  rho*ux*uy ! pxy

      ! convert pdf into moment
      mom2 = - f1 - f2 - f3 - f4 + 2.0_rk * ( f5 + f6 + f7 + f8 ) - 4.0_rk * f9
      mom3 = -2.0_rk * (f1 + f2 + f3 + f4) + f5 + f6 + f7 + f8 + 4.0_rk * f9
      mom5 = 2.0_rk * ( f1 - f3 ) - f5 - f6 + f7 + f8
      mom7 = 2.0_rk * ( f2 - f4 ) - f5 + f6 - f7 + f8
      mom8 = f1 - f2 + f3 - f4
      mom9 = f5 - f6 - f7 + f8

      ! omega
      s_mrt(8) = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
      s_mrt(9) = s_mrt(8)

      ! compute neq moment
      ! mneq1 = s_mrt1 * ( mom1 - meq1 ) = 0
      mneq2 = s_mrt(2) * ( mom2 - meq2 )
      mneq3 = s_mrt(3) * ( mom3 - meq3 )
      ! mneq4 = s_mrt4 * ( mom4 - meq4 ) = 0
      mneq5 = s_mrt(5) * ( mom5 + rho*ux ) ! meq5 = -ux
      ! mneq6 = s_mrt6 * ( mom6 - meq6 ) = 0
      mneq7 = s_mrt(7) * ( mom7 + rho*uy ) ! meq7 = -uy
      mneq8 = s_mrt(8) * ( mom8 - meq8 )
      mneq9 = s_mrt(9) * ( mom9 - meq9 )

      ! compute fNeq
      ! do iDir = 1, 9
      !   fneq(iDir) = sum( MMIvD2Q9(iDir,:) * mneq(:) )
      ! end do
      ! mneq1 = mneq4 = mneq6 = 0
      fNeq1 = - div1_36*mneq2 - div1_18*mneq3 + div1_6*mneq5 + div1_4*mneq8
      fNeq2 = - div1_36*mneq2 - div1_18*mneq3 + div1_6*mneq7 - div1_4*mneq8
      fNeq3 = - div1_36*mneq2 - div1_18*mneq3 - div1_6*mneq5 + div1_4*mneq8
      fNeq4 = - div1_36*mneq2 - div1_18*mneq3 - div1_6*mneq7 - div1_4*mneq8
      fNeq5 = + div1_18*mneq2 + div1_36*mneq3 - div1_12*(mneq5+mneq7) + div1_4*mneq9
      fNeq6 = + div1_18*mneq2 + div1_36*mneq3 - div1_12*(mneq5-mneq7) - div1_4*mneq9
      fNeq7 = + div1_18*mneq2 + div1_36*mneq3 + div1_12*(mneq5-mneq7) - div1_4*mneq9
      fNeq8 = + div1_18*mneq2 + div1_36*mneq3 + div1_12*(mneq5+mneq7) + div1_4*mneq9
      fNeq9 = div1_9*(-mneq2+mneq3)

      outState( ?SAVE?( 1,1, iElem, 9, 9,nElems,neigh )) = f1 - fNeq1
      outState( ?SAVE?( 2,1, iElem, 9, 9,nElems,neigh )) = f2 - fNeq2
      outState( ?SAVE?( 3,1, iElem, 9, 9,nElems,neigh )) = f3 - fNeq3
      outState( ?SAVE?( 4,1, iElem, 9, 9,nElems,neigh )) = f4 - fNeq4
      outState( ?SAVE?( 5,1, iElem, 9, 9,nElems,neigh )) = f5 - fNeq5
      outState( ?SAVE?( 6,1, iElem, 9, 9,nElems,neigh )) = f6 - fNeq6
      outState( ?SAVE?( 7,1, iElem, 9, 9,nElems,neigh )) = f7 - fNeq7
      outState( ?SAVE?( 8,1, iElem, 9, 9,nElems,neigh )) = f8 - fNeq8
      outState( ?SAVE?( 9,1, iElem, 9, 9,nElems,neigh )) = f9 - fNeq9

    enddo nodeloop
!$omp end do nowait

  end subroutine mrt_advRel_d2q9
! ****************************************************************************** !

! ****************************************************************************** !
  !> Advection relaxation routine for the D2Q9 MRT model
  !! f( x+c, t+1 ) = f(x,t) - M^(-1)S( m - meq )
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine mrt_advRel_d2q9_incomp( fieldProp, inState, outState, auxField, &
    &                                neigh, nElems, nSolve, level, layout,   &
    &                                params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer       :: iElem
?? copy :: var9( f )
?? copy :: var9( fNeq )
    real(kind=rk) :: mom2, mom3, mom5, mom7, mom8, mom9
    real(kind=rk) :: meq2, meq3, meq8, meq9
    real(kind=rk) :: mneq2, mneq3, mneq5, mneq7, mneq8, mneq9
    real(kind=rk) :: s_mrt(9)
    real(kind=rk) :: rho ! local density
    real(kind=rk) :: ux, uy, u2, v2
    integer :: dens_pos, vel_pos(2), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:2)

    ! overwrite omegaKine indices 8 and 9 in element loop
    s_mrt = mrt_d2q9(omegaKine=1.0_rk, omegaBulk=1.0_rk,  QQ=9)

!$omp do schedule(static)
    !NEC$ ivdep
    !DIR$ NOVECTOR
    nodeloop: do iElem = 1, nSolve
      ! First load all local values into temp array
      f1 = inState( ?FETCH?( 1, 1, iElem, QQ, QQ, nElems,neigh))
      f2 = inState( ?FETCH?( 2, 1, iElem, QQ, QQ, nElems,neigh))
      f3 = inState( ?FETCH?( 3, 1, iElem, QQ, QQ, nElems,neigh))
      f4 = inState( ?FETCH?( 4, 1, iElem, QQ, QQ, nElems,neigh))
      f5 = inState( ?FETCH?( 5, 1, iElem, QQ, QQ, nElems,neigh))
      f6 = inState( ?FETCH?( 6, 1, iElem, QQ, QQ, nElems,neigh))
      f7 = inState( ?FETCH?( 7, 1, iElem, QQ, QQ, nElems,neigh))
      f8 = inState( ?FETCH?( 8, 1, iElem, QQ, QQ, nElems,neigh))
      f9 = inState( ?FETCH?( 9, 1, iElem, QQ, QQ, nElems,neigh))

      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      ux = auxField(elemOff + vel_pos(1))
      uy = auxField(elemOff + vel_pos(2))

      u2 = ux * ux
      v2 = uy * uy
      ! meq1 =  rho ! rho
      meq2 =  -2._rk*rho + 3._rk*rho0*(u2+v2) ! e
      meq3 =  rho - 3._rk*rho0*(u2+v2) ! eps
      ! meq4 =  ux ! jx
      ! meq5 = -ux ! qx
      ! meq6 =  uy ! jy
      ! meq7 = -uy ! qy
      meq8 =  rho0*(u2 - v2) ! pxx
      meq9 =  rho0*ux*uy ! pxy

      ! convert pdf into moment
      mom2 = - f1 - f2 - f3 - f4 + 2.0_rk * ( f5 + f6 + f7 + f8 ) - 4.0_rk * f9
      mom3 = -2.0_rk * (f1 + f2 + f3 + f4) + f5 + f6 + f7 + f8 + 4.0_rk * f9
      mom5 = 2.0_rk * ( f1 - f3 ) - f5 - f6 + f7 + f8
      mom7 = 2.0_rk * ( f2 - f4 ) - f5 + f6 - f7 + f8
      mom8 = f1 - f2 + f3 - f4
      mom9 = f5 - f6 - f7 + f8

      ! omega
      s_mrt(8) = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
      s_mrt(9) = s_mrt(8)

      ! compute neq moment
      ! mneq1 = s_mrt1 * ( mom1 - meq1 ) = 0
      mneq2 = s_mrt(2) * ( mom2 - meq2 )
      mneq3 = s_mrt(3) * ( mom3 - meq3 )
      ! mneq4 = s_mrt4 * ( mom4 - meq4 ) = 0
      mneq5 = s_mrt(5) * ( mom5 + rho0*ux ) ! meq5 = -ux
      ! mneq6 = s_mrt6 * ( mom6 - meq6 ) = 0
      mneq7 = s_mrt(7) * ( mom7 + rho0*uy ) ! meq7 = -uy
      mneq8 = s_mrt(8) * ( mom8 - meq8 )
      mneq9 = s_mrt(9) * ( mom9 - meq9 )

      ! compute fNeq
      ! do iDir = 1, 9
      !   fneq(iDir) = sum( MMIvD2Q9(iDir,:) * mneq(:) )
      ! end do
      ! mneq1 = mneq4 = mneq6 = 0
      fNeq1 = - div1_36*mneq2 - div1_18*mneq3 + div1_6*mneq5 + div1_4*mneq8
      fNeq2 = - div1_36*mneq2 - div1_18*mneq3 + div1_6*mneq7 - div1_4*mneq8
      fNeq3 = - div1_36*mneq2 - div1_18*mneq3 - div1_6*mneq5 + div1_4*mneq8
      fNeq4 = - div1_36*mneq2 - div1_18*mneq3 - div1_6*mneq7 - div1_4*mneq8
      fNeq5 = + div1_18*mneq2 + div1_36*mneq3 - div1_12*(mneq5+mneq7) + div1_4*mneq9
      fNeq6 = + div1_18*mneq2 + div1_36*mneq3 - div1_12*(mneq5-mneq7) - div1_4*mneq9
      fNeq7 = + div1_18*mneq2 + div1_36*mneq3 + div1_12*(mneq5-mneq7) - div1_4*mneq9
      fNeq8 = + div1_18*mneq2 + div1_36*mneq3 + div1_12*(mneq5+mneq7) + div1_4*mneq9
      fNeq9 = div1_9*(-mneq2+mneq3)

      outState( ?SAVE?( 1,1, iElem, 9, 9,nElems,neigh )) = f1 - fNeq1
      outState( ?SAVE?( 2,1, iElem, 9, 9,nElems,neigh )) = f2 - fNeq2
      outState( ?SAVE?( 3,1, iElem, 9, 9,nElems,neigh )) = f3 - fNeq3
      outState( ?SAVE?( 4,1, iElem, 9, 9,nElems,neigh )) = f4 - fNeq4
      outState( ?SAVE?( 5,1, iElem, 9, 9,nElems,neigh )) = f5 - fNeq5
      outState( ?SAVE?( 6,1, iElem, 9, 9,nElems,neigh )) = f6 - fNeq6
      outState( ?SAVE?( 7,1, iElem, 9, 9,nElems,neigh )) = f7 - fNeq7
      outState( ?SAVE?( 8,1, iElem, 9, 9,nElems,neigh )) = f8 - fNeq8
      outState( ?SAVE?( 9,1, iElem, 9, 9,nElems,neigh )) = f9 - fNeq9

    enddo nodeloop
!$omp end do nowait

  end subroutine mrt_advRel_d2q9_incomp
! ****************************************************************************** !

! ****************************************************************************** !
  !> No comment yet!
  !!
  !! TODO add comment
  !!
  !! This subroutine interface must match the abstract interface definition
  !! [[kernel]] in scheme/[[mus_scheme_type_module]].f90 in order to be callable
  !! via [[mus_scheme_type:compute]] function pointer.
  subroutine bgk_advRel_d2q9_incomp( fieldProp, inState, outState, auxField, &
    &                                neigh, nElems, nSolve, level, layout,   &
    &                                params, varSys, derVarPos               )
    ! -------------------------------------------------------------------- !
    !> Array of field properties (fluid or species)
    type(mus_field_prop_type), intent(in) :: fieldProp(:)
    !> variable system definition
    type(tem_varSys_type), intent(in) :: varSys
    !> current layout
    type(mus_scheme_layout_type), intent(in) :: layout
    !> number of elements in state Array
    integer, intent(in) :: nElems
    !> input  pdf vector
    real(kind=rk), intent(in)  ::  inState(nElems * varSys%nScalars)
    !> output pdf vector
    real(kind=rk), intent(out) :: outState(nElems * varSys%nScalars)
    !> Auxiliary field computed from pre-collision state
    !! Is updated with correct velocity field for multicomponent models
    real(kind=rk), intent(inout) :: auxField(nElems * varSys%nAuxScalars)
    !> connectivity vector
    integer, intent(in) :: neigh(nElems * layout%fStencil%QQ)
    !> number of elements solved in kernel
    integer, intent(in) :: nSolve
    !> current level
    integer,intent(in) :: level
    !> global parameters
    type(mus_param_type),intent(in) :: params
    !> position of derived quantities in varsys for all fields
    type( mus_derVarPos_type ), intent(in) :: derVarPos(:)
    ! -------------------------------------------------------------------- !
    integer :: iElem
    ! temporary distribution variables
?? copy :: var9( f )
?? copy :: var9( fEq )
    real(kind=rk) :: u_x, u_y
    real(kind=rk) :: rho, usq, ucx
    real(kind=rk) :: omega, cmpl_o
    real(kind=rk) :: c0, c1, c2, c3
    integer :: dens_pos, vel_pos(2), elemOff
    ! ---------------------------------------------------------------------------
    dens_pos = varSys%method%val(derVarPos(1)%density)%auxField_varPos(1)
    vel_pos = varSys%method%val(derVarPos(1)%velocity)%auxField_varPos(1:2)

!$omp do schedule(static)
    !NEC$ ivdep
    !DIR$ NOVECTOR
    nodeloop: do iElem = 1, nSolve

      f1 = inState( ?FETCH?( 1, 1, iElem, 9, 9, nElems,neigh))
      f2 = inState( ?FETCH?( 2, 1, iElem, 9, 9, nElems,neigh))
      f3 = inState( ?FETCH?( 3, 1, iElem, 9, 9, nElems,neigh))
      f4 = inState( ?FETCH?( 4, 1, iElem, 9, 9, nElems,neigh))
      f5 = inState( ?FETCH?( 5, 1, iElem, 9, 9, nElems,neigh))
      f6 = inState( ?FETCH?( 6, 1, iElem, 9, 9, nElems,neigh))
      f7 = inState( ?FETCH?( 7, 1, iElem, 9, 9, nElems,neigh))
      f8 = inState( ?FETCH?( 8, 1, iElem, 9, 9, nElems,neigh))
      f9 = inState( ?FETCH?( 9, 1, iElem, 9, 9, nElems,neigh))

      ! element offset for auxField array
      elemOff = (iElem-1)*varSys%nAuxScalars
      ! local density
      rho = auxField(elemOff + dens_pos)
      ! local x-, y- and z-velocity
      u_x = auxField(elemOff + vel_pos(1))
      u_y = auxField(elemOff + vel_pos(2))

      ! calculate fEq
?? copy :: fEq_d2q9_s(fEq, u_x, u_y, rho, rho0, usq, ucx, c0, c1, c2, c3)

      omega  = fieldProp(1)%fluid%viscKine%omLvl(level)%val(iElem)
      cmpl_o = 1._rk - omega

      outState( ?SAVE?( 9,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f9 + omega*fEq9
      outState( ?SAVE?( 1,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f1 + omega*fEq1
      outState( ?SAVE?( 2,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f2 + omega*fEq2
      outState( ?SAVE?( 3,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f3 + omega*fEq3
      outState( ?SAVE?( 4,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f4 + omega*fEq4
      outState( ?SAVE?( 5,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f5 + omega*fEq5
      outState( ?SAVE?( 6,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f6 + omega*fEq6
      outState( ?SAVE?( 7,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f7 + omega*fEq7
      outState( ?SAVE?( 8,1, iElem, 9, 9,nElems,neigh )) = cmpl_o*f8 + omega*fEq8

    end do nodeloop
!$omp end do nowait

  end subroutine bgk_advRel_d2q9_incomp
! ****************************************************************************** !

end module mus_d2q9_module
! ****************************************************************************** !
