#define ASYNC
#define NBUFFERS 2
//#define NBUFFERS 3

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cublas.h>
#include "cutil.h"
#include <string.h>

// Map the upper-case entry-point names used in this file onto the lower-case
// external symbol that is actually emitted.  Three spellings are supported:
//   NOUNDERSCORE : name
//   F2CSTYLE     : name__   (two trailing underscores)
//   default      : name_    (one trailing underscore)
// When CRAY is defined no mapping is done and the upper-case names are used
// as written.  (Presumably this matches the calling Fortran compiler's
// name-mangling convention -- confirm against the build configuration.)
#ifndef CRAY
# ifdef NOUNDERSCORE
#      define RK_SCALAR_TEND_HOST rk_scalar_tend_host
#      define GET_WRF_GPU_LEVELS  get_wrf_gpu_levels
#      define GPU_INIT gpu_init
# else
#   ifdef F2CSTYLE
#      define RK_SCALAR_TEND_HOST rk_scalar_tend_host__
#      define GET_WRF_GPU_LEVELS  get_wrf_gpu_levels__
#      define GPU_INIT gpu_init__
#   else
#      define RK_SCALAR_TEND_HOST rk_scalar_tend_host_
#      define GET_WRF_GPU_LEVELS  get_wrf_gpu_levels_
#      define GPU_INIT gpu_init_
#   endif
# endif
#endif

// Linear-offset helpers for densely packed arrays whose first index varies
// fastest.  m, n and o are the extents of the 1st, 2nd and 3rd dimensions:
//   I2(i,j,m)         = i + j*m
//   I3(i,j,m,k,n)     = i + j*m + k*m*n
//   I4(i,j,m,k,n,l,o) = i + j*m + k*m*n + l*m*n*o
#define I2(i,j,m) ((i)+((j)*(m)))
#define I3(i,j,m,k,n) (I2(i,j,m)+((k)*(m)*(n)))
#define I4(i,j,m,k,n,l,o) (I3(i,j,m,k,n)+((l)*(m)*(n)*(o)))

// TODEV(A,s): declares a function-local `static float *A_d`, allocates s floats
// of device memory for it when the caller-scope variable `first` is non-zero,
// and then copies s floats from host array A to A_d on every expansion.
// Expands to several statements (including a declaration), so it can only be
// used at statement level in a scope where `first` is visible.  The return
// codes of cudaMalloc/cudaMemcpy are not checked.
# define TODEV(A,s) static float *A##_d;if (first) {cudaMalloc((void**)&A##_d,((s))*sizeof(float));};cudaMemcpy(A##_d,A,(s)*sizeof(float),cudaMemcpyHostToDevice);
// FROMDEV(A,s): copies s floats from A_d back to host array A (unchecked).
# define FROMDEV(A,s) cudaMemcpy(A,A##_d,(s)*sizeof(float),cudaMemcpyDeviceToHost);
// CLNUP(A): frees the device buffer A_d.
# define CLNUP(A) cudaFree(A##_d)

// Size-specific shorthands; they rely on d4, d3 and d2 (element counts) being
// defined in the scope where they are expanded.
# define TODEV4(A) TODEV(A,d4)
# define TODEV3(A) TODEV(A,d3)
# define TODEV2(A) TODEV(A,d2)
# define FROMDEV3(A) FROMDEV(A,d3)
# define FROMDEV2(A) FROMDEV(A,d2)

extern "C" int rsl_internal_microclock_() ;

// -- define textures --
//
// File-scope texture references.  The host routine in this file binds each of
// them to a cudaArray before the kernels run.  All hold single-channel floats
// read back unconverted (cudaReadModeElementType).

// One scalar / scalar_old pair per buffer (A, B and optionally C).  The
// buffer used for a given scalar member is chosen on the host by
// (member index % NBUFFERS).
texture <float, 3, cudaReadModeElementType> scalar_A_tex ;
texture <float, 3, cudaReadModeElementType> scalar_old_A_tex ;
texture <float, 3, cudaReadModeElementType> scalar_B_tex ;
texture <float, 3, cudaReadModeElementType> scalar_old_B_tex ;
#if (NBUFFERS == 3)
texture <float, 3, cudaReadModeElementType> scalar_C_tex ;
texture <float, 3, cudaReadModeElementType> scalar_old_C_tex ;
#endif

// 3D input fields shared by all scalar members.
texture <float, 3, cudaReadModeElementType> ru_tex ;
texture <float, 3, cudaReadModeElementType> rv_tex ;
texture <float, 3, cudaReadModeElementType> ww_tex ;
// 2D input fields shared by all scalar members.
texture <float, 2, cudaReadModeElementType> msfux_tex ;
texture <float, 2, cudaReadModeElementType> msfuy_tex ;
texture <float, 2, cudaReadModeElementType> msfvx_tex ;
texture <float, 2, cudaReadModeElementType> msfvy_tex ;
texture <float, 2, cudaReadModeElementType> msftx_tex ;
texture <float, 2, cudaReadModeElementType> msfty_tex ;
texture <float, 2, cudaReadModeElementType> mut_tex ;
texture <float, 2, cudaReadModeElementType> mub_tex ;
texture <float, 2, cudaReadModeElementType> mu_new_tex ;
texture <float, 2, cudaReadModeElementType> mu_old_tex ;

// added for diffusion
texture <float, 3, cudaReadModeElementType> alt_tex ;
texture <float, 3, cudaReadModeElementType> xkmhd_tex ;


//----------------------------------------------------------------------------
//                 D E V I C E   C O D E 
//----------------------------------------------------------------------------
// SPTSTART
extern "C" {
// DEBUG_I/J/K minus one (presumably converting 1-based debug indices to
// 0-based ones; DEBUG_I/J/K themselves are defined outside this file).
#define IDEBUG ((DEBUG_I)-1)
#define JDEBUG ((DEBUG_J)-1)
#define KDEBUG ((DEBUG_K)-1)

// In device-emulation builds 2D "texture" reads become plain array accesses.
#ifdef DEVICEEMU
# undef TexRef2D
# define TexRef2D(A,I,J) A[P2(I,J)]
#endif

// this is an M4 include
include(debug.m4)

#include "spt.h"

// This bit of trickery is needed for multi-buffering: there is no way to
// define a settable texture variable or a pointer to a texture, and
// rebinding a texture before each kernel invocation is expensive.
// Instead, every .inc file below is included once per buffer (A, B and,
// when NBUFFERS == 3, C) with the generic names scalar_tex / scalar_old_tex
// and the generic kernel name #defined to the buffer-specific ones, which
// produces one kernel instance per buffer.

// --- part1.inc : rk_scalar_tend_part1_{A,B,C} ---
#  define scalar_tex scalar_A_tex
#  define scalar_old_tex scalar_old_A_tex
#  define rk_scalar_tend_part1 rk_scalar_tend_part1_A
#include "part1.inc"
#  undef scalar_tex
#  undef scalar_old_tex
#  undef rk_scalar_tend_part1
#  define scalar_tex scalar_B_tex
#  define scalar_old_tex scalar_old_B_tex
#  define rk_scalar_tend_part1 rk_scalar_tend_part1_B
#include "part1.inc"
#  undef rk_scalar_tend_part1
#  undef scalar_tex
#  undef scalar_old_tex
#if (NBUFFERS == 3)
#  define scalar_tex scalar_C_tex
#  define scalar_old_tex scalar_old_C_tex
#  define rk_scalar_tend_part1 rk_scalar_tend_part1_C
#include "part1.inc"
#  undef rk_scalar_tend_part1
#  undef scalar_tex
#  undef scalar_old_tex
#endif

// --- part3.inc : rk_scalar_tend_part3_{A,B,C} ---
// (the leading #undef is a harmless no-op unless an included file defined it)
#  undef rk_scalar_tend_part3
#  define scalar_tex scalar_A_tex
#  define scalar_old_tex scalar_old_A_tex
#  define rk_scalar_tend_part3 rk_scalar_tend_part3_A
#include "part3.inc"
#  undef scalar_tex
#  undef scalar_old_tex
#  undef rk_scalar_tend_part3
#  define scalar_tex scalar_B_tex
#  define scalar_old_tex scalar_old_B_tex
#  define rk_scalar_tend_part3 rk_scalar_tend_part3_B
#include "part3.inc"
#  undef rk_scalar_tend_part3
#  undef scalar_tex
#  undef scalar_old_tex
#if (NBUFFERS == 3)
#  define scalar_tex scalar_C_tex
#  define scalar_old_tex scalar_old_C_tex
#  define rk_scalar_tend_part3 rk_scalar_tend_part3_C
#include "part3.inc"
#  undef rk_scalar_tend_part3
#  undef scalar_tex
#  undef scalar_old_tex
#endif

// --- diffusion.inc : diffusion_m_{A,B,C} (only scalar_tex is remapped) ---
#  define scalar_tex scalar_A_tex
#  define diffusion_m diffusion_m_A
#  include "diffusion.inc"
#  undef scalar_tex
#  undef diffusion_m
#  define scalar_tex scalar_B_tex
#  define diffusion_m diffusion_m_B
#  include "diffusion.inc"
#  undef scalar_tex
#  undef diffusion_m
#if (NBUFFERS == 3)
#  define scalar_tex scalar_C_tex
#  define diffusion_m diffusion_m_C
#  include "diffusion.inc"
#  undef scalar_tex
#  undef diffusion_m
#endif

// Defensive cleanup: make sure none of the generic names is still defined.
#  undef scalar_tex
#  undef scalar_old_tex
#  undef rk_scalar_tend_part1
#  undef rk_scalar_tend_part3

// --- update_scalar.inc : update_scalar_gpu_{A,B,C} ---
#  define scalar_tex scalar_A_tex
#  define update_scalar_gpu update_scalar_gpu_A
#  include "update_scalar.inc"
#  undef scalar_tex
#  undef update_scalar_gpu
#  define scalar_tex scalar_B_tex
#  define update_scalar_gpu update_scalar_gpu_B
#  include "update_scalar.inc"
#  undef scalar_tex
#  undef update_scalar_gpu
#if (NBUFFERS == 3)
#  define scalar_tex scalar_C_tex
#  define update_scalar_gpu update_scalar_gpu_C
#  include "update_scalar.inc"
#  undef scalar_tex
#  undef update_scalar_gpu
#endif

}
// SPTSTOP


//----------------------------------------------------------------------------
//                     H O S T   C O D E 
//----------------------------------------------------------------------------

extern "C" {

// Warm up the CUDA runtime by doing one tiny allocate / copy / free cycle, and
// report on stderr how long that took (in the units returned by
// rsl_internal_microclock_).
//
// Returns 0 on success.  If any of the CUDA calls fails, the CUDA error
// string is printed to stderr and 1 is returned.
int
GPU_INIT ( )
{
   float x = 0.0f ;        // initialized so the copy never reads an indeterminate value
   float *x_d = NULL ;
   int s, e ;
   cudaError_t err ;

   s=rsl_internal_microclock_() ;
   err = cudaMalloc((void **)&x_d,sizeof(float)) ;
   if ( err == cudaSuccess ) {
      err = cudaMemcpy(x_d,&x,sizeof(float),cudaMemcpyHostToDevice) ;
   }
   if ( x_d != NULL ) {
      // keep the first error if there was one; otherwise report a failing free
      cudaError_t free_err = cudaFree(x_d) ;
      if ( err == cudaSuccess ) err = free_err ;
   }
   e=rsl_internal_microclock_() ;
   fprintf(stderr,"gpu_init: %d\n",e-s) ;

   if ( err != cudaSuccess ) {
      fprintf(stderr,"gpu_init: CUDA error: %s\n",cudaGetErrorString(err)) ;
      return(1) ;
   }
   return(0) ;
}

// Copy a densely packed host volume of width x height x depth floats into the
// 3D cudaArray dst with a blocking cudaMemcpy3D.  The CUDA status is returned
// so that the caller can wrap the call in CUDA_SAFE_CALL.
static cudaError_t
rk_scalar_upload_3d ( float * src, cudaArray * dst, int width, int height, int depth )
{
      struct cudaMemcpy3DParms parms ;
      memset( &parms, 0, sizeof(struct cudaMemcpy3DParms) ) ;
      parms.srcPtr.ptr = src ;
      parms.srcPtr.pitch = width*sizeof(float) ;
      parms.srcPtr.xsize = width*sizeof(float) ;
      parms.srcPtr.ysize = height ;
      parms.srcPos.x = 0 ; parms.srcPos.y = 0 ; parms.srcPos.z = 0 ;
      parms.dstArray = dst ;
      parms.dstPos.x = 0 ; parms.dstPos.y = 0 ; parms.dstPos.z = 0 ;
      parms.extent.width = width ; parms.extent.height = height ; parms.extent.depth = depth ;
      parms.kind = cudaMemcpyHostToDevice ;
      return cudaMemcpy3D(&parms) ;
}

#ifndef PINNING
// Repack src, indexed as src[i + k*dime + j*dime*dkme], into dst in
// (i fastest, then j, then k) order.  Used to stage a 3D field in a pinned
// host buffer before it is uploaded.
// NOTE(review): when XPOSE_INPUT is not defined the upload that follows still
// describes the staged buffer as dime x dkme x djme; confirm that combining
// this repack with that extent is intended.
static void
rk_scalar_repack_3d ( float * dst, const float * src, int dime, int djme, int dkme )
{
      int i, j, k ;
      for ( k = 0 ; k < dkme ; k++ ) {
        for ( j = 0 ; j < djme ; j++ ) {
          for ( i = 0 ; i < dime ; i++ ) {
            *dst++ = src[I3(i,k,dime,j,dkme)] ;
          }
        }
      }
}
#endif

// Host driver for the GPU scalar-tendency computation.
//
// Every argument is passed by pointer.  The memory/domain/patch bounds
// (ids0..kpe0) are decremented by one on entry.
//
// On each call the routine
//   1. when *first0 != 0, allocates the device arrays, device buffers and
//      pinned host buffers (all held in function-local statics so that they
//      persist across calls);
//   2. uploads the 2D fields and the 1D fzm/fzp/rdz/rdzw profiles, and binds
//      the textures;
//   3. when *update_read_only_vars0 != 0, uploads ru, rv, ww, alt and xkmhd;
//   4. loops over the *num_scalars members, asynchronously uploading
//      scalar/scalar_old for each member into buffer (member % NBUFFERS) on
//      that buffer's stream and then running the code in "parts.inc" (not
//      visible here; presumably it launches the kernels and fills scalar_1_d /
//      scalar_2_d);
//   5. waits for all streams and, when *update_read_only_vars0 == 0, copies
//      scalar_2_d back into scalar (and scalar_1_d into scalar_old when
//      *rk_step == 1).
//
// Always returns 0.  CUDA errors are checked with CUDA_SAFE_CALL only during
// the setup phase; from the scalar loop onwards CUDA_SAFE_CALL is redefined
// to a plain call so that it does not force synchronization.
//
// NOTE(review): the stream variables are declared under #ifdef ASYNC but are
// used unconditionally, so this function only compiles with ASYNC defined
// (as it is at the top of this file).
int
RK_SCALAR_TEND_HOST (
                    float * scalar                                    // 4d in
                   ,float * scalar_old                                // 4d in
                   ,float * ru, float * rv, float * ww                // 3d in
                   ,float * alt, float * xkmhd                        // 3d in
                   ,float * mut, float * mub                          // 2d in
                   ,float * mu_new , float * mu_old                   // 2d in
                   ,float * msfux, float * msfuy, float * msfvx       // 2d in
                   ,float * msfvy, float * msftx, float * msfty       // 2d in
                   ,float * fzm, float * fzp                          // 1d in
                   ,float * rdz , float * rdzw                        // 1d in
                   ,float * g, float * rdx, float * rdy, float * dt   // 0d in
                   ,float * khdif, float * kvdif                      // 0d in
                   ,int * rk_step , int * spec_zone                   // 0d in
                   ,int * num_scalars, int * pd                       // 0d in
                   ,int * first0                                      // =1 malloc device mem, =0 do not
                   ,int * update_read_only_vars0                      // =1 update u, v, etc., =0 do not         
                   ,int *ids0, int *ide0,  int *jds0, int *jde0,  int *kds0, int *kde0
                   ,int *ims0, int *ime0,  int *jms0, int *jme0,  int *kms0, int *kme0
                   ,int *ips0, int *ipe0,  int *jps0, int *jpe0,  int *kps0, int *kpe0
          )
{
#if 0
fprintf(stderr,"g %e\n",*g);
fprintf(stderr,"rdx %e\n",*rdx);
fprintf(stderr,"rdy %e\n",*rdy);
fprintf(stderr,"khdif %e\n",*khdif);
fprintf(stderr,"dt %e\n",*dt);
fprintf(stderr,"rk_step %d\n",*rk_step) ;
fprintf(stderr,"spec_zone %d\n",*spec_zone) ;
fprintf(stderr,"num_scalars %d\n",*num_scalars) ;
fprintf(stderr,"pd %d\n",*pd) ;
fprintf(stderr,"first0 %d\n",*first0) ;
fprintf(stderr,"update_read_only_vars0 %d\n",*update_read_only_vars0) ;
#endif
 
      int first= *first0 ;
      int update_ro = *update_read_only_vars0 ;
      float * p, *pold ;
      cudaArray *p_d, *pold_d ;

      int im = -1 ;

      // Optional coarse timing: ZTIME_S starts an interval, ZTIME_E(ZONE,LAB)
      // closes it, accumulating a hit count and the elapsed time in slot ZONE.
//#define ZTIME_ON
#ifdef ZTIME_ON
#define SLOTS 20
#else
#define SLOTS 1
#endif
#define VALS  2
#ifdef ZTIME_ON
int ss1, ee1 ;
int z[VALS][SLOTS] ;
char label[SLOTS][256] ;
#define ZTIME_S       ss1 = rsl_internal_microclock_()
#define ZTIME_E(ZONE,LAB) ee1 = rsl_internal_microclock_() ; strcpy( label[ZONE], LAB ) ; z[0][ZONE] ++ ; z[1][ZONE] += ee1 - ss1 ; ss1 = ee1 ;

#else
#define ZTIME_S
#define ZTIME_E(ZONE,LAB)
#endif

//fprintf(stderr,"%s %d rk_step %d\n",__FILE__,__LINE__,*rk_step) ;

      // all bounds are shifted down by one relative to the values passed in
      int ids=*ids0-1; int ide=*ide0-1; int jds=*jds0-1; int jde=*jde0-1; int kds=*kds0-1; int kde=*kde0-1 ;
      int ims=*ims0-1; int ime=*ime0-1; int jms=*jms0-1; int jme=*jme0-1; int kms=*kms0-1; int kme=*kme0-1 ;
      int ips=*ips0-1; int ipe=*ipe0-1; int jps=*jps0-1; int jpe=*jpe0-1; int kps=*kps0-1; int kpe=*kpe0-1 ;

      // element counts of one 2D field, one 3D field and the full 4D scalar array
      int d2 = (ime-ims+1) * (jme-jms+1) ;
      int d3 = d2 * (kme-kms+1) ;
      int d4 = d3 * (*num_scalars) ;

//if ( (! *first0 ) && *num_scalars > 10 ) {
//fprintf(stderr,"ZZZ %d %d scalar %lu\n",ipe-ips,jpe-jps, scalar) ;
//for ( j = 0 ; j < jpe-jps ; j++ ) {
//for ( i = 0 ; i < ipe-jps ; i++ ) {
//fprintf(stderr,"ZZZ %e\n",scalar[(i-ims)+(j-jms)*(ime-ims+1)*(kme-kms+1)]) ;
//} }
//fprintf(stderr,"ZZZ %d %d scalar_old %lu\n",ipe-ips,jpe-jps, scalar_old) ;
//for ( j = 0 ; j < jpe-jps ; j++ ) {
//for ( i = 0 ; i < ipe-jps ; i++ ) {
//fprintf(stderr,"ZZZ %e\n",scalar_old[(i-ims)+(j-jms)*(ime-ims+1)*(kme-kms+1)]) ;
//} }
//}

//fprintf(stderr,"inside rk_scalar_tend_host\n") ;
//fprintf(stderr,"d4 %d\n",d4 ) ;
//fprintf(stderr,"d3 %d\n",d3 ) ;
//fprintf(stderr,"d2 %d\n",d2 ) ;
//for ( i=0 ; i<d3; i++ ){
//fprintf(stderr,"> %d %f %ld %ld\n",i,ru[i],ru,&(ru[i])) ;
//}
//exit(0);

      //int dips = 0 ; int dipe = (ipe-ips+1) ; int dime = (ime-ims+1) ;
      //int djps = 0 ; int djpe = (jpe-jps+1) ; int djme = (jme-jms+1) ;
      //int dkps = 0 ; int dkpe = (kpe-kps+1) ; int dkme = (kme-kms+1) ;
      int dime = (ime-ims+1) ;
      int djme = (jme-jms+1) ;
      int dkme = (kme-kms+1) ;

//fprintf(stderr,"ims %d ime %d dime %d\n",ims,ime,dime ) ;
//fprintf(stderr,"jms %d jme %d djme %d\n",jms,jme,djme ) ;

#ifdef ZTIME_ON
      // reset the timing table; labels are cleared too so that slots which are
      // never closed still print as empty strings
      { int zi, zj ;
        for ( zi = 0 ; zi < VALS ; zi++ ) for ( zj = 0 ; zj < SLOTS ; zj++ ) { z[zi][zj] = 0 ; }
        for ( zj = 0 ; zj < SLOTS ; zj++ ) { label[zj][0] = '\0' ; }
      }
#endif

      //fprintf(stderr,"before mallocs first %d update_ro %d pd %d \n",first, update_ro, *pd) ;

      ZTIME_S ;
      // the rest are input arrays... put them in texture memory
      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat) ;

      // WRF_TEX_2D(NAME): declare the static device storage NAME_d for a 2D
      // field, allocate it on the first call, and (re)load it from host array
      // NAME.  In the texture build the load and the bind to NAME_tex happen
      // only when update_ro is set; in the DEVICEEMU build the copy is
      // unconditional and no texture is involved.
#ifndef DEVICEEMU
# define WRF_TEX_2D(NAME) \
      static cudaArray * NAME##_d ; \
      if ( first ) {CUDA_SAFE_CALL(cudaMallocArray( &(NAME##_d) , &channelDesc , dime , djme ));} \
      if ( update_ro ) { CUDA_SAFE_CALL(cudaMemcpy2DToArray( NAME##_d, 0, 0, NAME, dime*sizeof(float), \
                     dime*sizeof(float), djme, cudaMemcpyHostToDevice )) ; \
      NAME##_tex.addressMode[0] = cudaAddressModeClamp ; \
      NAME##_tex.addressMode[1] = cudaAddressModeClamp ; \
      NAME##_tex.filterMode = cudaFilterModePoint ; \
      NAME##_tex.normalized = false ; \
      CUDA_SAFE_CALL( cudaBindTextureToArray( NAME##_tex, NAME##_d, channelDesc ) ) ; }
#else
# define WRF_TEX_2D(NAME) \
      static float * NAME##_d ; \
      if ( first ) {CUDA_SAFE_CALL(cudaMalloc( (void **)&(NAME##_d) , djme * dime * sizeof(float)));} ; \
      CUDA_SAFE_CALL(cudaMemcpy( NAME##_d, NAME, dime*djme*sizeof(float), cudaMemcpyHostToDevice )) ;
#endif

      WRF_TEX_2D(msftx) ;
      WRF_TEX_2D(msfty) ;
      WRF_TEX_2D(msfux) ;
      WRF_TEX_2D(msfuy) ;
      WRF_TEX_2D(msfvx) ;
      WRF_TEX_2D(msfvy) ;
      WRF_TEX_2D(mut) ;
      WRF_TEX_2D(mub) ;
      WRF_TEX_2D(mu_old) ;
      WRF_TEX_2D(mu_new) ;
      // 1D profiles go to plain device memory and are copied on every call
      TODEV(fzm, (kde-kds+1) ) ;
      TODEV(fzp, (kde-kds+1) ) ;
      TODEV(rdz, (kde-kds+1) ) ;
      TODEV(rdzw, (kde-kds+1) ) ;
      ZTIME_E(0, "2D_mallocs_and_binds" ) ;

      ZTIME_S ;


      // extent of every 3D cudaArray; the order of the 2nd and 3rd dimensions
      // depends on XPOSE_INPUT
      struct cudaExtent extent ;
#ifdef XPOSE_INPUT
      extent.width = dime ; extent.height = djme ; extent.depth = dkme ;
#else
      extent.width = dime ; extent.height = dkme ; extent.depth = djme ;
#endif

      static cudaArray * scalar_A_d     ; if (first){CUDA_SAFE_CALL(cudaMalloc3DArray( &(scalar_A_d)    , &channelDesc ,extent));} ;
      static cudaArray * scalar_old_A_d ; if (first){CUDA_SAFE_CALL(cudaMalloc3DArray( &(scalar_old_A_d), &channelDesc ,extent));} ;
      static cudaArray * scalar_B_d     ; if (first){CUDA_SAFE_CALL(cudaMalloc3DArray( &(scalar_B_d)    , &channelDesc ,extent));} ;
      static cudaArray * scalar_old_B_d ; if (first){CUDA_SAFE_CALL(cudaMalloc3DArray( &(scalar_old_B_d), &channelDesc ,extent));} ;

      CUDA_SAFE_CALL( cudaBindTextureToArray( scalar_A_tex  , scalar_A_d  ) ) ;
      CUDA_SAFE_CALL( cudaBindTextureToArray( scalar_B_tex  , scalar_B_d  ) ) ;
      CUDA_SAFE_CALL( cudaBindTextureToArray( scalar_old_A_tex  , scalar_old_A_d  ) ) ;
      CUDA_SAFE_CALL( cudaBindTextureToArray( scalar_old_B_tex  , scalar_old_B_d  ) ) ;

#if (NBUFFERS == 3)
      static cudaArray * scalar_old_C_d ; if (first){CUDA_SAFE_CALL(cudaMalloc3DArray( &(scalar_old_C_d), &channelDesc ,extent));} ;
      static cudaArray * scalar_C_d     ; if (first){CUDA_SAFE_CALL(cudaMalloc3DArray( &(scalar_C_d)    , &channelDesc ,extent));} ;
      CUDA_SAFE_CALL( cudaBindTextureToArray( scalar_C_tex  , scalar_C_d  ) ) ;
      CUDA_SAFE_CALL( cudaBindTextureToArray( scalar_old_C_tex  , scalar_old_C_d  ) ) ;
#endif

      static cudaArray * ru_d  ; if (first){CUDA_SAFE_CALL(cudaMalloc3DArray( &(ru_d) , &channelDesc , extent ));} ;
      static cudaArray * rv_d  ; if (first){CUDA_SAFE_CALL(cudaMalloc3DArray( &(rv_d) , &channelDesc , extent ));} ;
      static cudaArray * ww_d  ; if (first){CUDA_SAFE_CALL(cudaMalloc3DArray( &(ww_d) , &channelDesc , extent ));} ;
      static cudaArray * alt_d  ; if (first){CUDA_SAFE_CALL(cudaMalloc3DArray( &(alt_d) , &channelDesc , extent ));} ;
      static cudaArray * xkmhd_d  ; if (first){CUDA_SAFE_CALL(cudaMalloc3DArray( &(xkmhd_d) , &channelDesc , extent ));} ;

      CUDA_SAFE_CALL( cudaBindTextureToArray( ru_tex , ru_d ) ) ;
      CUDA_SAFE_CALL( cudaBindTextureToArray( rv_tex , rv_d ) ) ;
      CUDA_SAFE_CALL( cudaBindTextureToArray( ww_tex , ww_d ) ) ;
      CUDA_SAFE_CALL( cudaBindTextureToArray( alt_tex , alt_d ) ) ;
      CUDA_SAFE_CALL( cudaBindTextureToArray( xkmhd_tex , xkmhd_d ) ) ;

      // With PINNING the caller's arrays are used directly as upload sources;
      // otherwise page-locked staging buffers are allocated here.
#ifdef PINNING
      static float * ru_pinned    ;
      static float * rv_pinned    ;
      static float * ww_pinned    ;
      static float * alt_pinned    ;
      static float * xkmhd_pinned    ;
#else
      static float * scalar_A     ; if (first){CUDA_SAFE_CALL( cudaMallocHost( (void **)&scalar_A     , d3*sizeof(float) ) );} ;
      static float * scalar_old_A ; if (first){CUDA_SAFE_CALL( cudaMallocHost( (void **)&scalar_old_A , d3*sizeof(float) ) );} ;
      static float * scalar_B     ; if (first){CUDA_SAFE_CALL( cudaMallocHost( (void **)&scalar_B     , d3*sizeof(float) ) );} ;
      static float * scalar_old_B ; if (first){CUDA_SAFE_CALL( cudaMallocHost( (void **)&scalar_old_B , d3*sizeof(float) ) );} ;
#if (NBUFFERS == 3)
      static float * scalar_C     ; if (first){CUDA_SAFE_CALL( cudaMallocHost( (void **)&scalar_C     , d3*sizeof(float) ) );} ;
      static float * scalar_old_C ; if (first){CUDA_SAFE_CALL( cudaMallocHost( (void **)&scalar_old_C , d3*sizeof(float) ) );} ;
# endif
      static float * ru_pinned    ; if (first){CUDA_SAFE_CALL( cudaMallocHost( (void **)&ru_pinned    , d3*sizeof(float) ) );} ;
      static float * rv_pinned    ; if (first){CUDA_SAFE_CALL( cudaMallocHost( (void **)&rv_pinned    , d3*sizeof(float) ) );} ;
      static float * ww_pinned    ; if (first){CUDA_SAFE_CALL( cudaMallocHost( (void **)&ww_pinned    , d3*sizeof(float) ) );} ;
      static float * alt_pinned    ; if (first){CUDA_SAFE_CALL( cudaMallocHost( (void **)&alt_pinned    , d3*sizeof(float) ) );} ;
      static float * xkmhd_pinned    ; if (first){CUDA_SAFE_CALL( cudaMallocHost( (void **)&xkmhd_pinned    , d3*sizeof(float) ) );} ;
#endif

      // full 4D device buffers that are copied back to the host at the end
      static float * scalar_1_d ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **)&scalar_1_d , d4*sizeof(float) ) );} ;
      static float * scalar_2_d ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **)&scalar_2_d , d4*sizeof(float) ) );} ;

      // per-buffer 3D work arrays
      static float * fqx_A_d         ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqx_A_d  , d3*sizeof(float) ));}; 
      static float * fqy_A_d         ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqy_A_d  , d3*sizeof(float) ));}; 
      static float * fqz_A_d         ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqz_A_d  , d3*sizeof(float) ));}; 
      static float * fqxl_A_d        ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqxl_A_d , d3*sizeof(float) ));}; 
      static float * fqyl_A_d        ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqyl_A_d , d3*sizeof(float) ));}; 
      static float * fqzl_A_d        ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqzl_A_d , d3*sizeof(float) ));}; 

      static float * fqx_B_d         ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqx_B_d  , d3*sizeof(float) ));}; 
      static float * fqy_B_d         ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqy_B_d  , d3*sizeof(float) ));}; 
      static float * fqz_B_d         ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqz_B_d  , d3*sizeof(float) ));}; 
      static float * fqxl_B_d        ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqxl_B_d , d3*sizeof(float) ));}; 
      static float * fqyl_B_d        ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqyl_B_d , d3*sizeof(float) ));}; 
      static float * fqzl_B_d        ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqzl_B_d , d3*sizeof(float) ));}; 

#if (NBUFFERS == 3)
      static float * fqx_C_d         ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqx_C_d  , d3*sizeof(float) ));}; 
      static float * fqy_C_d         ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqy_C_d  , d3*sizeof(float) ));}; 
      static float * fqz_C_d         ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqz_C_d  , d3*sizeof(float) ));}; 
      static float * fqxl_C_d        ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqxl_C_d , d3*sizeof(float) ));}; 
      static float * fqyl_C_d        ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqyl_C_d , d3*sizeof(float) ));}; 
      static float * fqzl_C_d        ; if (first){CUDA_SAFE_CALL( cudaMalloc( (void **) &fqzl_C_d , d3*sizeof(float) ));}; 
#endif

      static float * advect_tend_d ; if(first){CUDA_SAFE_CALL( cudaMalloc( (void **) &advect_tend_d , d4*sizeof(float) ));};
      static float * scalar_tends_d ; if(first){CUDA_SAFE_CALL( cudaMalloc( (void **) &scalar_tends_d , d4*sizeof(float) ));};
static float * debuggal_d ; if(first){CUDA_SAFE_CALL(cudaMalloc( (void **) &debuggal_d , 10*d2*sizeof(float) ));};
static float * debuggal   ; if(first){CUDA_SAFE_CALL(cudaMallocHost( (void **)&debuggal , 10*d2*sizeof(float) ) );} ;

      // Allocation is finished.  Clearing `first` here means the block below
      // runs on every call, including the one that performed the allocations.
      first = 0 ;

      ZTIME_E(1, "mallocs" ) ;
   if ( ! first ) { 
      ZTIME_S ;

      // zero-initialized; every field that matters is set inside the scalar loop
      struct cudaMemcpy3DParms parms3D = {0} ;

      // second and third dimension of the 3D transfers (see `extent` above)
#ifdef XPOSE_INPUT
# define dscnd djme
# define dthrd dkme
#else
# define dscnd dkme
# define dthrd djme
#endif

      // RK_SEND_3D(NAME): stage host field NAME in NAME_pinned (aliasing it
      // under PINNING, repacking it otherwise) and upload it to NAME_d.
#ifdef PINNING
# define RK_STAGE_3D(NAME) NAME##_pinned = NAME ;
#else
# define RK_STAGE_3D(NAME) rk_scalar_repack_3d( NAME##_pinned, NAME, dime, djme, dkme ) ;
#endif
# define RK_SEND_3D(NAME) \
      RK_STAGE_3D(NAME) \
      CUDA_SAFE_CALL( rk_scalar_upload_3d( NAME##_pinned, NAME##_d, dime, dscnd, dthrd ) ) ;

    if ( update_ro ) {
      RK_SEND_3D(ru)
      RK_SEND_3D(rv)
      RK_SEND_3D(ww)
      RK_SEND_3D(alt)
      RK_SEND_3D(xkmhd)
    } // update_ro

#undef RK_SEND_3D
#undef RK_STAGE_3D

      ZTIME_E(2, "malloc_ru_rv") ;
      ZTIME_S ;

      // for async: one stream per buffer, created and destroyed on every call
#ifdef ASYNC
      cudaStream_t AA , BB, CC, *AA_ptr, *BB_ptr, *CC_ptr ;
      cudaStream_t stream ;
      AA_ptr = &AA ; CUDA_SAFE_CALL( cudaStreamCreate( AA_ptr )) ;
      BB_ptr = &BB ; CUDA_SAFE_CALL( cudaStreamCreate( BB_ptr )) ;
      CC_ptr = &CC ; CUDA_SAFE_CALL( cudaStreamCreate( CC_ptr )) ;
#endif

      // thread-block shape; may be overridden at compile time
#ifndef XXX
#  define XXX 4
#endif
#ifndef YYY
#  define YYY 4
#endif

      // launch configuration: ceil-divide the memory dimensions by the block shape
      int remx, remy ;  // 1 when a partial block is needed in that direction
      remx = (ime-ims+1) % XXX != 0 ? 1 : 0 ;
      remy = (jme-jms+1) % YYY != 0 ? 1 : 0 ;
      dim3 dimBlock( XXX , YYY ) ;
      //fprintf(stderr,"ime ims remx jme jms remy %d %d %d %d %d %d\n",ime,ims,remx,jme,jms,remy) ;
      dim3 dimGrid ( (ime-ims+1) / XXX + remx , (jme-jms+1) / YYY + remy ) ;
      //fprintf(stderr,"Call to gpu: block dims %d %d\n",dimBlock.x,dimBlock.y) ;
      //fprintf(stderr,"Call to gpu: grid  dims %d %d\n",dimGrid.x,dimGrid.y) ;

// this is needed because the CUDA_SAFE_CALL macro forces synchronous behavior
// (from here to the end of the file CUDA_SAFE_CALL no longer checks errors)
#undef CUDA_SAFE_CALL
#define CUDA_SAFE_CALL(A) A
#if 1

      ZTIME_E(3,"final_setup") ;

      for ( im = 0 ; im < *num_scalars ; im++ ) {

        // host slices of member im
        p    = &(scalar[I4(0,0,dime,0,dscnd,im,dthrd)]) ;
        pold = &(scalar_old[I4(0,0,dime,0,dscnd,im,dthrd)]) ;

        // pick the buffer and stream for this member
        if        ( im % NBUFFERS == 0 ) {
          p_d    = scalar_A_d ;
          pold_d = scalar_old_A_d ;
          stream = AA ;
        } else if ( im % NBUFFERS == 1 ) {
          p_d    = scalar_B_d ;
          pold_d = scalar_old_B_d ;
          stream = BB ;
        } 
#if (NBUFFERS == 3)
          // was "== 1", which could never be reached after the branch above,
          // leaving every third member without a buffer or stream
          else if ( im % NBUFFERS == 2 )  {
          p_d    = scalar_C_d ;
          pold_d = scalar_old_C_d ;
          stream = CC ;
        }
#endif

        parms3D.srcPtr.pitch = dime*sizeof(float) ;
        parms3D.srcPtr.xsize = dime*sizeof(float) ;
        parms3D.srcPtr.ysize = dscnd ;
        parms3D.srcPos.x = 0 ; parms3D.srcPos.y = 0 ; parms3D.srcPos.z = 0 ;
        parms3D.dstPos.x = 0 ; parms3D.dstPos.y = 0 ; parms3D.dstPos.z = 0 ;
        parms3D.extent.width = dime ; parms3D.extent.height = dscnd ; parms3D.extent.depth = dthrd ;
        parms3D.kind = cudaMemcpyHostToDevice ;

        parms3D.srcPtr.ptr = p ;
        parms3D.dstArray = p_d ;
        CUDA_SAFE_CALL(cudaMemcpy3DAsync(&parms3D, stream )) ;

        parms3D.srcPtr.ptr = pold ;
        parms3D.dstArray = pold_d ;
        CUDA_SAFE_CALL(cudaMemcpy3DAsync(&parms3D, stream )) ;

// Again, CPP trickery is needed because the texture names are compile-time
// constants and cannot be changed at run time through pointers.  Doing it
// this way at least avoids having multiple copies of the source routines.
        if        ( im % NBUFFERS == 0 ) {
#         define MEMBER 1
#         include "parts.inc"
#         undef MEMBER
        } else if ( im % NBUFFERS == 1 ) {
#         define MEMBER 2
#         include "parts.inc"
#         undef MEMBER
        } 
#if (NBUFFERS == 3)
          else                 {
#         define MEMBER 3
#         include "parts.inc"
#         undef MEMBER
        }
#endif

    }

    // wait for all outstanding work before reading results back
    CUDA_SAFE_CALL( cudaStreamSynchronize( AA ) ) ;
    CUDA_SAFE_CALL( cudaStreamSynchronize( BB ) ) ;
    CUDA_SAFE_CALL( cudaStreamSynchronize( CC ) ) ;

    ZTIME_S ;
    CUDA_SAFE_CALL( cudaThreadSynchronize() ) ;
    ZTIME_E(15, "cuda_thread_sync" ) ;
    ZTIME_S ;
#if 1
    // results are copied back only on calls that did not refresh the read-only inputs
    if ( ! update_ro ) {
      if ( *rk_step == 1 ) {
//fprintf(stderr,"copying scalar_old\n") ;
        CUDA_SAFE_CALL( cudaMemcpy ( scalar_old , scalar_1_d, d4*sizeof(float), cudaMemcpyDeviceToHost ) ) ; 
      }
//fprintf(stderr,"copying scalar\n") ;
      CUDA_SAFE_CALL( cudaMemcpy ( scalar , scalar_2_d, d4*sizeof(float), cudaMemcpyDeviceToHost ) ) ; 
    }
#endif
    ZTIME_E(16, "copy_results_back_to_host" ) ;

# ifdef ZTIME_ON
    { int zj ;
      for ( zj = 0 ; zj<= 16 ; zj++ ) { 
        fprintf(stderr,"ztime %4d %5d %s %d\n",zj+1,z[0][zj],label[zj], z[1][zj]) ;
      }
    }
# endif
#endif
    CUDA_SAFE_CALL( cudaStreamDestroy( AA )) ;
    CUDA_SAFE_CALL( cudaStreamDestroy( BB )) ;
    CUDA_SAFE_CALL( cudaStreamDestroy( CC )) ;
  } // not first
    return(0) ;
}

// Report the number of levels this build was compiled for.
//   retval : out, receives MKX
// Always returns 0.
int
GET_WRF_GPU_LEVELS ( int * retval )
{
    /* MKX is hard coded value set in the makefile */
    const int compiled_levels = MKX ;

    retval[0] = compiled_levels ;
    return 0 ;
}

} // extern C

