!WRF:MODEL_LAYER:DYNAMICS
!

#ifdef STANDALONE
#  define TENTIN
#  define TENTINOUT
#  define TENTOUT
#  define OPTY
#else
#  define TENTIN      ,INTENT(IN)
#  define TENTINOUT   ,INTENT(INOUT)
#  define TENTOUT     ,INTENT(OUT)
#  define OPTY   ,OPTIONAL
#endif

#ifdef PINNING
! from http://developer.download.nvidia.com/compute/cuda/1_1/Fortran_Cuda_Blas.tgz
module cuda_alloc
 interface 
! cudaMallocHost
 integer (C_INT) function cudaMallocHost(buffer, size)  bind(C,name="cudaMallocHost")
  use iso_c_binding
  implicit none
  type (C_PTR)  :: buffer
  integer (C_SIZE_T), value :: size
 end function cudaMallocHost
! cudaFreeHost
 integer (C_INT) function cudaFreeHost(buffer)  bind(C,name="cudaFreeHost")
  use iso_c_binding
  implicit none
  type (C_PTR), value :: buffer
 end function cudaFreeHost
 end interface 
end module cuda_alloc
#endif

MODULE module_em

!!!USE module_model_constants
!!!USE module_advect_em
#ifndef RUN_ON_GPU
USE module_advect_em_test
#endif
!!!USE module_big_step_utilities_em
!!!USE module_state_description
!!!USE module_damping_em

CONTAINS

! imported from module_big_step_utilities
SUBROUTINE zero_tend ( tendency,                     &
                       ids, ide, jds, jde, kds, kde, &
                       ims, ime, jms, jme, kms, kme, &
                       its, ite, jts, jte, kts, kte )


   IMPLICIT NONE

   ! Input data

   INTEGER ,                                   INTENT(IN   ) :: ids, ide, jds, jde, kds, kde, &
                                                                ims, ime, jms, jme, kms, kme, &
                                                                its, ite, jts, jte, kts, kte

   REAL , DIMENSION( ims:ime , kms:kme , jms:jme ) , INTENT(INOUT) :: tendency

   ! Local data

   INTEGER :: i, j, k, itf, jtf, ktf

!<DESCRIPTION>
!
!  zero_tend sets the input tendency array to zero.
!
!</DESCRIPTION>

      DO j = jts, jte
      DO k = kts, kte
      DO i = its, ite
        tendency(i,k,j) = 0.
      ENDDO
      ENDDO
      ENDDO

END SUBROUTINE zero_tend


SUBROUTINE rk_scalar_tend ( scs, sce, &
                            ids, ide, jds, jde, kds, kde,    &
                            ims, ime, jms, jme, kms, kme,    &
                            its, ite, jts, jte, kts, kte    )

#ifdef PINNING
use iso_c_binding
use cuda_alloc
#endif
   IMPLICIT NONE

   !  Input data.

!!!TYPE(grid_config_rec_type   ) TENTIN :: config_flags

   INTEGER  TENTIN :: rk_step, scs, sce
   INTEGER  TENTIN :: ids, ide, jds, jde, kds, kde, &
                                             ims, ime, jms, jme, kms, kme, &
                                             its, ite, jts, jte, kts, kte

   LOGICAL TENTIN :: moist_step

   REAL, DIMENSION(ims:ime, kms:kme, jms:jme , scs:sce )                &
                                         TENTIN  :: scalar_in, scalar_old_in

   REAL, DIMENSION(ims:ime, kms:kme, jms:jme , scs:sce )                      &
                                         TENTINOUT  :: scalar_tends_in
                                                    
#ifndef TEST_ON_GPU_RK
# define IM_DEF
#else
# define IM_DEF ,scs:sce
#endif
   REAL, DIMENSION(ims:ime, kms:kme, jms:jme  IM_DEF ) TENTINOUT :: advect_tend_in

#ifdef PINNING
   REAL, DIMENSION(:,:,:,:), POINTER :: scalar ; type(C_PTR) :: cptr_scalar
   REAL, DIMENSION(:,:,:,:), POINTER :: scalar_old ; type(C_PTR) :: cptr_scalar_old
   REAL, DIMENSION(:,:,:,:), POINTER :: scalar_tends ; type(C_PTR) :: cptr_scalar_tends
#else
   REAL, ALLOCATABLE :: scalar(:,:,:,:)
   REAL, ALLOCATABLE :: scalar_old(:,:,:,:)
   REAL, ALLOCATABLE :: scalar_tends(:,:,:,:)
#endif
#ifndef TEST_ON_GPU_RK
   REAL, ALLOCATABLE :: advect_tend(:,:,:)
#else
   REAL, ALLOCATABLE :: advect_tend(:,:,:,:)
#endif

   REAL, DIMENSION(ims:ime, kms:kme, jms:jme  ) TENTOUT :: RQVFTEN

#ifdef PINNING
   REAL, DIMENSION(:,:,:), POINTER :: ru ; type(C_PTR) :: cptr_ru
   REAL, DIMENSION(:,:,:), POINTER :: rv ; type(C_PTR) :: cptr_rv
   REAL, DIMENSION(:,:,:), POINTER :: ww ; type(C_PTR) :: cptr_ww
   REAL, DIMENSION(:,:,:), POINTER :: xkmhd ; type(C_PTR) :: cptr_xkmhd
   REAL, DIMENSION(:,:,:), POINTER :: alt ; type(C_PTR) :: cptr_alt
   REAL, DIMENSION(ims:ime, kms:kme, jms:jme  ) TENTIN ::     ru_in,  &
                                                                      rv_in,  &
                                                                      ww_in,  &
                                                                      xkmhd_in,  &
                                                                      alt_in
#else
   REAL, DIMENSION(ims:ime, kms:kme, jms:jme  ) TENTIN ::     ru,  &
                                                                      rv,  &
                                                                      ww,  &
                                                                      xkmhd,  &
                                                                      alt
#endif



   REAL , DIMENSION( kms:kme ) TENTIN :: fnm,  &
                                                                  fnp,  &
                                                                  rdn,  &
                                                                  rdnw, &
                                                                  base

   REAL , DIMENSION( ims:ime , jms:jme ) TENTIN :: msfux,    &
                                                                  msfuy,    &
                                                                  msfvx,    &
                                                                  msfvx_inv,    &
                                                                  msfvy,    &
                                                                  msftx,    &
                                                                  msfty,    &
                                                                  mub,     &
                                                                  mut,     &
                                                                  mu_new,  &
                                                                  mu_old

   REAL TENTIN :: rdx,     &
                                                                  rdy,     &
                                                                  khdif,   &
                                                                  kvdif

   INTEGER TENTIN  :: diff_6th_opt
   REAL TENTIN  :: diff_6th_factor

   REAL  TENTIN  :: dt

   LOGICAL TENTIN  :: pd_advection

! added with diffusion and scalar update
REAL TENTIN :: g
INTEGER TENTIN :: spec_zone 

   ! Local data
  
   INTEGER :: im, i,j,k, pd, ounit, res
   INTEGER :: time_step
   DOUBLE PRECISION dmean

#ifdef PINNING
integer, parameter :: fp_kind = kind(0.0)
#endif

   REAL    :: khdq, kvdq, tendency

! from module_model_constants
   REAL    , PARAMETER ::  prandtl = 1./3.0

#ifdef TEST_ON_GPU_RK
    INTEGER s, e, et(300)
    INTEGER, EXTERNAL :: RSL_INTERNAL_MICROCLOCK
# ifdef STANDALONE
    CHARACTER*256 fname
    INTEGER :: thisstep, sim_num_scalars
    COMMON /rk_scalar_tend_block/ thisstep,sim_num_scalars

# else
    CHARACTER*256 fname
    INTEGER, SAVE :: thisstep = 0
# endif

# define IM_INDEX ,im
!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
    moist_step = .true.
    if ( moist_step )then
      thisstep = thisstep + 1
# ifdef OUTPUT_SNAPSHOTS
      write(0,*)__FILE__,__LINE__,thisstep
      write(fname,'("rk_scalar_tend_in_",3i3.3)')thisstep,scs,sce
      open(45,file=fname,form='UNFORMATTED')
      write(45) ids,ide,jds,jde,kds,kde,ims,ime,jms,jme,kms,kme,its,ite,jts,jte,kts,kte
!!!   write(45) config_flags
      write(45) rk_step,scs,sce
      write(45) dt,rdx,rdy,khdif,kvdif
      write(45) g   ! added
      write(45) spec_zone   ! added
      !INOUT 4D
      write(45) scalar(its:ite,kts:kte,jts:jte,scs:sce)
      write(45) scalar_old(its:ite,kts:kte,jts:jte,scs:sce)
      write(45) scalar_tends(its:ite,kts:kte,jts:jte,scs:sce)
      !INOUT 3D
      write(45) advect_tend(its:ite,kts:kte,jts:jte,scs:sce)
      write(45) rqvften(its:ite,kts:kte,jts:jte)
      !IN 1D
      write(45) fnm(kms:kme)
      write(45) fnp(kms:kme)
      write(45) rdn(kms:kme)
      write(45) rdnw(kms:kme)
      write(45) base(kms:kme)
      !IN 2D
      write(45) msfux(its:ite,jts:jte)
      write(45) msfuy(its:ite,jts:jte)
      write(45) msfvx(its:ite,jts:jte)
      write(45) msfvx_inv(its:ite,jts:jte)
      write(45) msfvy(its:ite,jts:jte)
      write(45) msftx(its:ite,jts:jte)
      write(45) msfty(its:ite,jts:jte)
      write(45) mub(its:ite,jts:jte)
      write(45) mut(its:ite,jts:jte)
      write(45) mu_new(its:ite,jts:jte)   ! added
      write(45) mu_old(its:ite,jts:jte)
      !IN 3D
      write(45) ru(its:ite,kts:kte,jts:jte)
      write(45) rv(its:ite,kts:kte,jts:jte)
      write(45) ww(its:ite,kts:kte,jts:jte)
      write(45) xkmhd(its:ite,kts:kte,jts:jte)
      write(45) alt(its:ite,kts:kte,jts:jte)
      close(45)
# endif
# ifdef STANDALONE
      write(fname,'("rk_scalar_tend_in_",3i3.3)')thisstep,scs,sce
write(0,*)'opening again ',fname
      open(45,file=fname,form='UNFORMATTED',err=239)
      goto 240
 239  write(0,*)'cannot open fname ', fname
      stop
 240  continue


!      write(45) ids,ide,jds,jde,kds,kde,ims,ime,jms,jme,kms,kme,ips,ipe,jps,jpe,kps,kpe
!      write(45) rk_step,1, num_3d_m+1-PARAM_FIRST_SCALAR
!      write(45) grid%dt,grid%rdx,grid%rdy,grid%khdif,grid%kvdif
!      write(45) g   ! added
!      write(45) grid%spec_zone   ! added

      read(45) ids,ide,jds,jde,kds,kde,ims,ime,jms,jme,kms,kme,its,ite,jts,jte,kts,kte
      read(45) rk_step,scs,sce
      read(45) dt,rdx,rdy,khdif,kvdif
      read(45) g   ! added
      read(45) spec_zone   ! added
      !INOUT 4D
      read(45) scalar_in(its:ite,kts:kte,jts:jte,scs:sce)
      read(45) scalar_old_in(its:ite,kts:kte,jts:jte,scs:sce)
      read(45) scalar_tends_in(its:ite,kts:kte,jts:jte,scs:sce)

      sim_num_scalars = sce-scs+1
write(0,*)'allocating big scalar arrays for ',sim_num_scalars,' scalars '
#ifdef PINNING
      res = cudaMallocHost( cptr_scalar , (ime-ims+1)*(jme-jms+1)*(kme-kms+1)*sim_num_scalars*sizeof(fp_kind) )
      res = cudaMallocHost( cptr_scalar_old , (ime-ims+1)*(jme-jms+1)*(kme-kms+1)*sim_num_scalars*sizeof(fp_kind) )
      res = cudaMallocHost( cptr_scalar_tends , (ime-ims+1)*(jme-jms+1)*(kme-kms+1)*sim_num_scalars*sizeof(fp_kind) )
# ifdef XPOSE_INPUT
      call c_f_pointer( cptr_scalar , scalar             , (/ime-ims+1,jme-jms+1,kme-kms+1,sim_num_scalars/) )
      call c_f_pointer( cptr_scalar_old , scalar_old     , (/ime-ims+1,jme-jms+1,kme-kms+1,sim_num_scalars/) )
      call c_f_pointer( cptr_scalar_tends , scalar_tends , (/ime-ims+1,jme-jms+1,kme-kms+1,sim_num_scalars/) )

      DO i = 1,sim_num_scalars
        call ikj2ijk(scalar(1,1,1,i),scalar_in(:,:,:,i)            ,ims,ime,jms,jme,kms,kme)
        call ikj2ijk(scalar_old(1,1,1,i),scalar_old_in(:,:,:,i)    ,ims,ime,jms,jme,kms,kme)
        call ikj2ijk(scalar_tends(1,1,1,i),scalar_tends_in(:,:,:,i),ims,ime,jms,jme,kms,kme)
      ENDDO
# else
      call c_f_pointer( cptr_scalar , scalar             , (/ime-ims+1,kme-kms+1,jme-jms+1,sim_num_scalars/) )
      call c_f_pointer( cptr_scalar_old , scalar_old     , (/ime-ims+1,kme-kms+1,jme-jms+1,sim_num_scalars/) )
      call c_f_pointer( cptr_scalar_tends , scalar_tends , (/ime-ims+1,kme-kms+1,jme-jms+1,sim_num_scalars/) )
      scalar      =scalar_in
      scalar_old  =scalar_old_in
      scalar_tends=scalar_tends_in
# endif
#else
      ALLOCATE(scalar_tends(ims:ime,kms:kme,jms:jme,sim_num_scalars))
      ALLOCATE(scalar(ims:ime,kms:kme,jms:jme,sim_num_scalars))
      ALLOCATE(scalar_old(ims:ime,kms:kme,jms:jme,sim_num_scalars))
      ALLOCATE(advect_tend(ims:ime,kms:kme,jms:jme,sim_num_scalars))
      DO i = 1,sim_num_scalars
        scalar(:,:,:,i) = scalar_in(:,:,:,i)
        scalar_old(:,:,:,i) = scalar_old_in(:,:,:,i)
        scalar_tends(:,:,:,i) = scalar_tends_in(:,:,:,i)
        advect_tend(:,:,:,i) = advect_tend_in(:,:,:,i)
      ENDDO
#endif

      !INOUT 3D
      read(45) rqvften(its:ite,kts:kte,jts:jte)
      !IN 1D
      read(45) fnm(kms:kme)
      read(45) fnp(kms:kme)
      read(45) rdn(kms:kme)
      read(45) rdnw(kms:kme)
      read(45) base(kms:kme)
      !IN 2D
      read(45) msfux(its:ite,jts:jte)
      read(45) msfuy(its:ite,jts:jte)
      read(45) msfvx(its:ite,jts:jte)
      read(45) msfvx_inv(its:ite,jts:jte)
      read(45) msfvy(its:ite,jts:jte)
      read(45) msftx(its:ite,jts:jte)
      read(45) msfty(its:ite,jts:jte)
      read(45) mub(its:ite,jts:jte)
      read(45) mut(its:ite,jts:jte)
      read(45) mu_new(its:ite,jts:jte)   ! added
      read(45) mu_old(its:ite,jts:jte)
      !IN 3D
#ifdef PINNING
      read(45) ru_in(its:ite,kts:kte,jts:jte)
      read(45) rv_in(its:ite,kts:kte,jts:jte)
      read(45) ww_in(its:ite,kts:kte,jts:jte)
      read(45) xkmhd_in(its:ite,kts:kte,jts:jte)
      read(45) alt_in(its:ite,kts:kte,jts:jte)

      res = cudaMallocHost( cptr_ru , (ime-ims+1)*(jme-jms+1)*(kme-kms+1)*sizeof(fp_kind) )
      res = cudaMallocHost( cptr_rv , (ime-ims+1)*(jme-jms+1)*(kme-kms+1)*sizeof(fp_kind) )
      res = cudaMallocHost( cptr_ww , (ime-ims+1)*(jme-jms+1)*(kme-kms+1)*sizeof(fp_kind) )
      res = cudaMallocHost( cptr_xkmhd , (ime-ims+1)*(jme-jms+1)*(kme-kms+1)*sizeof(fp_kind) )
      res = cudaMallocHost( cptr_alt , (ime-ims+1)*(jme-jms+1)*(kme-kms+1)*sizeof(fp_kind) )
# ifdef XPOSE_INPUT
      call c_f_pointer( cptr_ru    , ru    , (/ime-ims+1,jme-jms+1,kme-kms+1/) )
      call c_f_pointer( cptr_rv    , rv    , (/ime-ims+1,jme-jms+1,kme-kms+1/) )
      call c_f_pointer( cptr_ww    , ww    , (/ime-ims+1,jme-jms+1,kme-kms+1/) )
      call c_f_pointer( cptr_xkmhd , xkmhd , (/ime-ims+1,jme-jms+1,kme-kms+1/) )
      call c_f_pointer( cptr_alt   , alt   , (/ime-ims+1,jme-jms+1,kme-kms+1/) )

      call ikj2ijk(ru(1,1,1),ru_in,ims,ime,jms,jme,kms,kme)
      call ikj2ijk(rv(1,1,1),rv_in,ims,ime,jms,jme,kms,kme)
      call ikj2ijk(ww(1,1,1),ww_in,ims,ime,jms,jme,kms,kme)
      call ikj2ijk(xkmhd(1,1,1),xkmhd_in,ims,ime,jms,jme,kms,kme)
      call ikj2ijk(alt(1,1,1),alt_in,ims,ime,jms,jme,kms,kme)
# else
      call c_f_pointer( cptr_ru    , ru    , (/ime-ims+1,kme-kms+1,jme-jms+1/) )
      call c_f_pointer( cptr_rv    , rv    , (/ime-ims+1,kme-kms+1,jme-jms+1/) )
      call c_f_pointer( cptr_ww    , ww    , (/ime-ims+1,kme-kms+1,jme-jms+1/) )
      call c_f_pointer( cptr_xkmhd , xkmhd , (/ime-ims+1,kme-kms+1,jme-jms+1/) )
      call c_f_pointer( cptr_alt   , alt   , (/ime-ims+1,kme-kms+1,jme-jms+1/) )
      ru    = ru_in
      rv    = rv_in
      ww    = ww_in
      xkmhd = xkmhd_in
      alt   = alt_in
# endif
#else
      read(45) ru(its:ite,kts:kte,jts:jte)
      read(45) rv(its:ite,kts:kte,jts:jte)
      read(45) ww(its:ite,kts:kte,jts:jte)
      read(45) xkmhd(its:ite,kts:kte,jts:jte)
      read(45) alt(its:ite,kts:kte,jts:jte)
#endif
      close(45)
# endif
    endif
#else
# define IM_INDEX
#endif



write(0,*)'rk_step ', rk_step
write(0,*)'spec_zone ', spec_zone
write(0,*)'moist_step : ', moist_step

! hard code a couple of things that need to be set that did not get passed 
! in through the files or that do not necessary get passed correctly because 
! of how logicals are represented or cosmic rays (shrug)
   time_step = 1  ! just has to be positive

   pd_advection = .true.

!<DESCRIPTION>
!
! rk_scalar_tend calls routines that computes scalar tendency from advection 
! and 3D mixing (TKE or fixed eddy viscosities).
!
!</DESCRIPTION>

   khdq = khdif/prandtl
   kvdq = kvdif/prandtl

   advect_tend = 0.

   scs = 1 
   sce = sim_num_scalars

et = 0
#ifdef RUN_ON_GPU

   ounit = 91
   CALL gpu_init

#ifdef PINNING
# define REF4D
#else
# define REF4D (ims,kms,jms,1)
#endif


write(0,*)'  call to initialize device memory '
! this call only initializes device memory 
   CALL  rk_scalar_tend_host (                                   &
                    scalar REF4D                                 &
                   ,scalar_old REF4D                             &
                   ,ru, rv, ww, alt, xkmhd                       &
                   , mut, mub                                    &
                   , mu_new , mu_old                             &
                   ,msfux, msfuy, msfvx, msfvy                   &
                   ,msftx, msfty, fnm, fnp                       &
                   ,rdn, rdnw                                    &
                   ,g ,rdx, rdy, dt                              &
                   ,khdif , kvdif                                &
                   ,rk_step , spec_zone                          &
                   ,sce - scs + 1, 1                             &
                   ,1,1 &  ! malloc or update
                   ,ids, ide, jds, jde, kds, kde                 &
                   ,ims, ime, jms, jme, kms, kme                 &
                   ,its, ite, jts, jte, kts, kte                 &
                              )


#if 0
   pd = 1
s = rsl_internal_microclock()

   CALL  rk_scalar_tend_host (                                   &
                    scalar REF4D                                 &
                   ,scalar_old REF4D                             &
                   ,ru, rv, ww, alt, xkmhd                       &
                   , mut, mub                                    &
                   , mu_new , mu_old                             &
                   ,msfux, msfuy, msfvx, msfvy                   &
                   ,msftx, msfty, fnm, fnp                       &
                   ,rdn, rdnw                                    &
                   ,g ,rdx, rdy, dt                              &
                   ,khdif , kvdif                                &
                   ,rk_step , spec_zone                          &
                   ,sce - scs + 1, pd                            &
                   ,0,0 &  ! do not malloc or update
                   ,ids, ide, jds, jde, kds, kde                 &
                   ,ims, ime, jms, jme, kms, kme                 &
                   ,its, ite, jts, jte, kts, kte                 &
                              )

e = rsl_internal_microclock()
#endif

#if 0
et(1) = e-s
! time scalar advection without positive def
   pd = 0
s = rsl_internal_microclock()
   CALL  rk_scalar_tend_host (                                   &
                    scalar REF4D                                 &
                   ,scalar_old REF4D                             &
                   ,ru, rv, ww, alt, xkmhd                       &
                   , mut, mub                                    &
                   , mu_new , mu_old                             &
                   ,msfux, msfuy, msfvx, msfvy                   &
                   ,msftx, msfty, fnm, fnp                       &
                   ,rdn, rdnw                                    &
                   ,g ,rdx, rdy, dt                              &
                   ,khdif , kvdif                                &
                   ,rk_step , spec_zone                          &
                   ,sce - scs + 1, pd                            &
                   ,0,0 &  ! do not malloc or update
                   ,ids, ide, jds, jde, kds, kde                 &
                   ,ims, ime, jms, jme, kms, kme                 &
                   ,its, ite, jts, jte, kts, kte                 &
                              )
e = rsl_internal_microclock()
et(2) = e-s
#endif

#if 1
write(40,*)ite-its,jte-jts,' scalar before '
do j=jts,jte-1
do i=its,ite-1
write(40,*)scalar(i-ims+1,1,j-jms+1,2)
enddo
enddo
write(40,*)ite-its,jte-jts,' scalar_old before '
do j=jts,jte-1
do i=its,ite-1
write(40,*)scalar_old(i-ims+1,1,j-jms+1,2)
enddo
enddo
write(40,*)ite-its,jte-jts,' ru before '
do j=jts,jte-1
do i=its,ite-1
write(40,*)ru(i-ims+1,1,j-jms+1)
enddo
enddo
write(40,*)ite-its,jte-jts,' rv before '
do j=jts,jte-1
do i=its,ite-1
write(40,*)rv(i-ims+1,1,j-jms+1)
enddo
enddo
write(40,*)ite-its,jte-jts,' ww before '
do j=jts,jte-1
do i=its,ite-1
write(40,*)ww(i-ims+1,1,j-jms+1)
enddo
enddo
write(40,*)ite-its,jte-jts,' alt before '
do j=jts,jte-1
do i=its,ite-1
write(40,*)alt(i-ims+1,1,j-jms+1)
enddo
enddo
write(40,*)ite-its,jte-jts,' xkmhd before '
do j=jts,jte-1
do i=its,ite-1
write(40,*)xkmhd(i-ims+1,1,j-jms+1)
enddo
enddo
! time scalar advection with positive def
write(0,*)'  call to do scalar advection with pos def '
   pd = 1
s = rsl_internal_microclock()
   CALL  rk_scalar_tend_host (                                   &
                    scalar REF4D                                 &
                   ,scalar_old REF4D                             &
                   ,ru, rv, ww, alt, xkmhd                       &
                   , mut, mub                                    &
                   , mu_new , mu_old                             &
                   ,msfux, msfuy, msfvx, msfvy                   &
                   ,msftx, msfty, fnm, fnp                       &
                   ,rdn, rdnw                                    &
                   ,g ,rdx, rdy, dt                              &
                   ,khdif , kvdif                                &
                   ,1 , spec_zone                          &
                   ,sce - scs + 1, pd                            &
                   ,0,0 &  ! do not malloc or update
                   ,ids, ide, jds, jde, kds, kde                 &
                   ,ims, ime, jms, jme, kms, kme                 &
                   ,its, ite, jts, jte, kts, kte                 &
                              )
e = rsl_internal_microclock()
et(3) = e-s
write(40,*)ite-its,jte-jts,' scalar after '
do j=jts,jte-1
do i=its,ite-1
write(40,*)scalar(i-ims+1,1,j-jms+1,2)
enddo
enddo
write(40,*)ite-its,jte-jts,' scalar_old after '
do j=jts,jte-1
do i=its,ite-1
write(40,*)scalar_old(i-ims+1,1,j-jms+1,2)
enddo
enddo
#endif


#else

   ounit = 90

! time scalar advection with positive def
s = rsl_internal_microclock()
   DO im = scs, sce
       CALL zero_tend ( advect_tend(ims,kms,jms,im),     &
                      ids, ide, jds, jde, kds, kde, &
                      ims, ime, jms, jme, kms, kme, &
                      its, ite, jts, jte, kts, kte )
       CALL advect_scalar_pd_test       ( scalar(ims,kms,jms,im),        &
                                     scalar_old(ims,kms,jms,im),         &
                                     advect_tend(ims,kms,jms,im),        &
                                     ru, rv, ww, mut, mub, mu_old,       &
                                     msfux, msfuy, msfvx, msfvy,         &
                                     msftx, msfty, fnm, fnp,             &
                                     rdx, rdy, rdnw,dt,                  &
                                     ids, ide, jds, jde, kds, kde,       &
                                     ims, ime, jms, jme, kms, kme,       &
                                     its, ite, jts, jte, kts, kte     )

!! kvdif and khdif are zero so do not call these
       CALL horizontal_diffusion ( 'm', scalar(ims,kms,jms,im),            &
                                        scalar_tends(ims,kms,jms,im), mut, &
                                       ! config_flags,                      &
                                        msfux, msfuy, msfvx, msfvx_inv,    &
                                        msfvy, msftx, msfty,               &
                                        khdq , xkmhd, rdx, rdy,            &
                                        ids, ide, jds, jde, kds, kde,      &
                                        ims, ime, jms, jme, kms, kme,      &
                                        its, ite, jts, jte, kts, kte      )

#if 0
            CALL vertical_diffusion (  'm', scalar(ims,kms,jms,im),       &
                                            scalar_tends(ims,kms,jms,im), &
                                            !config_flags,                 &
                                             g, &
                                            alt, mut, rdn, rdnw, kvdq,    &
                                            ids, ide, jds, jde, kds, kde, &
                                            ims, ime, jms, jme, kms, kme, &
                                            its, ite, jts, jte, kts, kte )

#endif
            CALL rk_update_scalar( 1, 1,                      &
                                   scalar_old(ims,kms,jms,im),    &
                                   scalar(ims,kms,jms,im),        &
                                   scalar_tends(ims,kms,jms,im),  &
                                   advect_tend(ims,kms,jms,im),   &
                                   msftx, msfty,     &
                                   mu_old, mu_new, mub,       &
                                   1, dt, spec_zone,        &
                                   !config_flags,                  &
                                   ids, ide, jds, jde, kds, kde,  &
                                   ims, ime, jms, jme, kms, kme,  &
                                   its, ite, jts, jte, kts, kte  )


   ENDDO
e = rsl_internal_microclock()
et(3) = e-s
   
#endif

do im=1,3
write(0,*)'timings: ',et(im)
enddo

#if defined(OUTPUT_SNAPSHOTS) || defined(STANDALONE)
# if defined(STANDALONE)
#   if ( defined(RUN_ON_GPU) )
write(fname,'("snap_gpu_",i3.3)')thisstep
#   else
write(fname,'("snap_out_",i3.3)')thisstep
#   endif
# endif
open(46,file=fname,form='UNFORMATTED')
write(46) ids,ide,jds,jde,kds,kde,ims,ime,jms,jme,kms,kme,its,ite,jts,jte,kts,kte
write(46) scs,sce
!INOUT 4D

DO im = scs, sce
  do k=kts,kte
  dmean = 0.
  if (k .eq. k .and. im .eq. scs ) write(ounit,*)ite-its-2,jte-jts-2,'scalar ',im,' level ',k
  do j=jts+1,jte-2
  do i=its+1,ite-2
#ifdef PINNING
#  ifdef XPOSE_INPUT
    dmean = dmean + scalar(i-ims+1,j-jms+1,k-kms+1,im)
    if ( k .eq. k .and. im .eq. scs ) write(ounit,*)scalar(i-ims+1,j-jms+1,k-kms+1,im)
#  else
    dmean = dmean + scalar(i-ims+1,k-kms+1,j-jms+1,im)
    if ( k .eq. k .and. im .eq. scs ) write(ounit,*)scalar(i-ims+1,k-kms+1,j-jms+1,im)
#  endif
#else
    dmean = dmean + scalar(i,k,j,im)
    if ( k .eq. 1 .and. im .eq. scs ) write(ounit,*)scalar(i,k,j,im)
#endif
  enddo
  enddo
!  if ( im .eq. 1 ) write(0,*)im, k,' dmean ',dmean
   write(0,*)im, k,' dmean ',dmean  
  enddo
#ifdef PINNING
!  write(46) scalar(its:ite,jts:jte,kts:kte,im)
#else
  write(46) scalar(its:ite,kts:kte,jts:jte,im)
#endif
enddo
!INOUT 3D
close(46)
#endif

END SUBROUTINE rk_scalar_tend

#ifndef RUN_ON_GPU
# include "hdiff_vanilla.inc"
# include "vdiff_vanilla.inc"
# include "update_vanilla.inc"
#endif

END MODULE module_em

subroutine ikj2ijk(d,s,ims,ime,jms,jme,kms,kme)
implicit none
integer i,j,k,ims,ime,jms,jme,kms,kme
real, dimension(ims:ime,jms:jme,kms:kme) :: d
real, dimension(ims:ime,kms:kme,jms:jme) :: s
do j=jms,jme
do k=kms,kme
do i=ims,ime
  d(i,j,k) = s(i,k,j)
enddo
enddo
enddo
end subroutine ikj2ijk

subroutine ijk2ikj(d,s,ims,ime,jms,jme,kms,kme)
implicit none
integer i,j,k,ims,ime,jms,jme,kms,kme
real, dimension(ims:ime,kms:kme,jms:jme) :: d
real, dimension(ims:ime,jms:jme,kms:kme) :: s
do k=kms,kme
do j=jms,jme
do i=ims,ime
  d(i,k,j) = s(i,j,k)
enddo
enddo
enddo
end subroutine ijk2ikj



#ifdef STANDALONE

  PROGRAM rk_scalar_tend_driver
    USE module_em
    IMPLICIT NONE
    INTEGER thisstep,scs,sce,sim_num_scalars
    COMMON /rk_scalar_tend_block/ thisstep,sim_num_scalars
    CHARACTER*80 fname
    INTEGER ids,ide,jds,jde,kds,kde,ims,ime,jms,jme,kms,kme,its,ite,jts,jte,kts,kte

    print *,'Step number? '
    read(*,*) thisstep
    scs = 1 ; sce = 81

    write(fname,'("rk_scalar_tend_in_",3i3.3)')thisstep,scs,sce
    print*,'opening ',trim(fname)
    open(45,file=fname,form='UNFORMATTED')
    read(45) ids,ide,jds,jde,kds,kde,ims,ime,jms,jme,kms,kme,its,ite,jts,jte,kts,kte
    write(0,*)'ids,ide,jds,jde,kds,kde ',ids,ide,jds,jde,kds,kde
    write(0,*)'ims,ime,jms,jme,kms,kme ',ims,ime,jms,jme,kms,kme
    write(0,*)'its,ite,jts,jte,kts,kte ',its,ite,jts,jte,kts,kte
    close(45)
    thisstep = thisstep - 1
    CALL rk_scalar_tend( scs,sce,ids,ide,jds,jde,kds,kde,ims,ime,jms,jme,kms,kme,its,ite,jts,jte,kts,kte )

    stop
  END PROGRAM rk_scalar_tend_driver

#endif
