#define ASYNC

#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <cublas.h>
#include "cutil.h"
#include <string.h>


#define EXTERN_RO extern
#define EXTERN_RF extern
#include "chem_host.h"
#define DEBUGGAL_SIZE (8*NVAR+LU_NONZERO)
#include "P_defines.h"
#undef EXTERN_RF
#undef EXTERN_RO

// Select the external (linker) names of the two host entry points defined in
// this file. Three spellings are provided, chosen at compile time:
//   NOUNDERSCORE : plain lower-case name
//   F2CSTYLE     : name with two trailing underscores
//   otherwise    : name with one trailing underscore
// NOTE(review): presumably this matches the symbol decoration of the Fortran
// compiler that calls these routines -- confirm against the build system.
// When CRAY is defined neither macro is defined here, so the functions keep
// the literal identifiers CHEM_DRIVER_ON_HOST and GPU_INIT.
#ifndef CRAY
# ifdef NOUNDERSCORE
#      define CHEM_DRIVER_ON_HOST chem_driver_on_host
#      define GPU_INIT gpu_init
# else
#   ifdef F2CSTYLE
#      define CHEM_DRIVER_ON_HOST chem_driver_on_host__
#      define GPU_INIT gpu_init__
#   else
#      define CHEM_DRIVER_ON_HOST chem_driver_on_host_
#      define GPU_INIT gpu_init_
#   endif
# endif
#endif

// Flattened-index helpers with the first index varying fastest:
//   I2(i,j,m)         = i + j*m
//   I3(i,j,m,k,n)     = i + j*m + k*m*n
//   I4(i,j,m,k,n,l,o) = i + j*m + k*m*n + l*m*n*o
// where m, n, o are the extents of the 1st, 2nd and 3rd dimensions.
#define I2(i,j,m) ((i)+((j)*(m)))
#define I3(i,j,m,k,n) (I2(i,j,m)+((k)*(m)*(n)))
#define I4(i,j,m,k,n,l,o) (I3(i,j,m,k,n)+((l)*(m)*(n)*(o)))

// TODEV(A,s): declares a function-static device pointer A_d, cudaMalloc's
// s floats for it only when the variable `first` (which must exist in the
// expanding scope) is non-zero, and then copies s floats from host array A to
// A_d on every expansion. Because it declares a variable it can be expanded
// at most once per name per scope. No CUDA return code is checked.
# define TODEV(A,s) static float *A##_d;if (first) {cudaMalloc((void**)&A##_d,((s))*sizeof(float));};cudaMemcpy(A##_d,A,(s)*sizeof(float),cudaMemcpyHostToDevice);
//# define TODEV(A,s) if (first) {cudaMalloc((void**)&(X_h[X_curs]),((s))*sizeof(float));};cudaMemcpy(X_h[X_curs++],A,(s)*sizeof(float),cudaMemcpyHostToDevice);
// FROMDEV(A,s): copies s floats from the device pointer X_h[X_curs] back to
// host array A and advances X_curs.
// NOTE(review): X_h and X_curs are not defined anywhere in the visible source,
// and the active TODEV above does not populate them -- confirm this macro is
// still usable before relying on it.
# define FROMDEV(A,s) cudaMemcpy(A,X_h[X_curs++],(s)*sizeof(float),cudaMemcpyDeviceToHost);
//# define FROMDEV(A,s) cudaMemcpy(A,X_d[X_curs++],(s)*sizeof(float),cudaMemcpyDeviceToHost);
//# define CLNUP(A) cudaFree(A##_d)

// Convenience wrappers that take the element count from variables named
// d4, d3 or d2 in the expanding scope.
# define TODEV4(A) TODEV(A,d4)
# define TODEV3(A) TODEV(A,d3)
# define TODEV2(A) TODEV(A,d2)
# define FROMDEV3(A) FROMDEV(A,d3)
# define FROMDEV2(A) FROMDEV(A,d2)

// Element type of the rate-constant array (RCONST0).
// With RCONST_IN_TEXTURE the type is forced to float; otherwise, unless the
// build already supplied RCONST_TYPE, it defaults to CHEM_FP_TYPE.
#if defined(RCONST_IN_TEXTURE)
# define RCONST_TYPE float
#endif

#if !defined(RCONST_TYPE) && !defined(RCONST_IN_TEXTURE)
# define RCONST_TYPE CHEM_FP_TYPE
#endif

# include "spt.h"
//#undef CUDA_SAFE_CALL
//#define CUDA_SAFE_CALL(A) A

extern "C" int rsl_internal_microclock_() ;


//----------------------------------------------------------------------------
//                 D E V I C E   C O D E 
//----------------------------------------------------------------------------
// SPTSTART

#include "radm2sorg_constant_mem.h"

#include "solver_gpu.cu"

extern "C" {

#if 1
// Warm up the CUDA runtime: allocate one float on the device, copy one float
// to it, free it again, and report the elapsed time (as returned by
// rsl_internal_microclock_) on stderr.
//
// Returns 0 on success, 1 if any of the CUDA calls failed (the CUDA error
// string is printed to stderr in that case).
int
GPU_INIT ( )
{
   float x = 0.0f ;        // initialised so the copy below never reads an indeterminate value
   float *x_d = NULL ;
   int s, e ;
   cudaError_t err ;

   s=rsl_internal_microclock_() ;
   err = cudaMalloc((void **)&x_d,sizeof(float)) ;
   if ( err == cudaSuccess ) {
      cudaError_t free_err ;
      err = cudaMemcpy(x_d,&x,sizeof(float),cudaMemcpyHostToDevice) ;
      // Always release the allocation, but report the first failure seen.
      free_err = cudaFree(x_d) ;
      if ( err == cudaSuccess ) err = free_err ;
   }
   e=rsl_internal_microclock_() ;
   fprintf(stderr,"gpu_init: %d\n",e-s) ;
   if ( err != cudaSuccess ) {
      fprintf(stderr,"gpu_init: CUDA error: %s\n",cudaGetErrorString(err)) ;
      return(1) ;
   }
   return(0) ;
}
#endif

// Rate expression of the form  a + b / (1 + b/c)  with
//   a = 7.2e-15 * exp( 785/TEMP)
//   b = 1.9e-33 * exp( 725/TEMP) * C_M
//   c = 4.1e-16 * exp(1440/TEMP)
// The intermediates are held in double because every sub-expression contains
// a double literal, which is the precision the single-expression form uses.
CHEM_FP_TYPE
#ifndef REORDER_EXEC
__device__ __noinline__ 
#endif
k46 (CHEM_FP_TYPE TEMP, CHEM_FP_TYPE C_M) {
COUNT_OPS(6,2,6,0,4,0,0,0) ;
   const double term_a = 7.2e-15 * exp(785.e0/(TEMP)) ;
   const double term_b = 1.9e-33 * exp(725.e0/(TEMP))*(C_M) ;
   const double term_c = 4.1e-16 * exp(1440.e0/(TEMP)) ;
   return(term_a + term_b/(1+term_b/term_c)) ;
}

// Fall-off rate expression:
//   k0   = k0_300K   * (300/temp)^n * cair
//   kinf = kinf_300K * (300/temp)^m
//   r    = k0 / kinf
//   result = k0/(1+r) * 0.6^( 1 / (1 + log10(r)^2) )
CHEM_FP_TYPE
__device__ __noinline__ TROE (CHEM_FP_TYPE k0_300K,CHEM_FP_TYPE n,CHEM_FP_TYPE kinf_300K,CHEM_FP_TYPE m,CHEM_FP_TYPE temp,CHEM_FP_TYPE cair)
{
COUNT_OPS(5,1,3,0,2,3,0,0) ;
  const CHEM_FP_TYPE t_scale   = 300.e0/(temp) ;
  const CHEM_FP_TYPE k_low     = (k0_300K)   * pow(t_scale,(n)) * (cair) ;
  const CHEM_FP_TYPE k_high    = (kinf_300K) * pow(t_scale,(m)) ;
  const CHEM_FP_TYPE ratio     = k_low/(k_high) ;
  const CHEM_FP_TYPE log_ratio = log10(ratio) ;
  return(k_low/(1.e0+(ratio))*pow(0.6e0,(1.e0/(1.e0+log_ratio*log_ratio)))) ;
}

// Returns  A * exp(-B/temp) * TROE(k0_300K, n, kinf_300K, m, temp, cair).
CHEM_FP_TYPE
__device__ __noinline__ TROEE( CHEM_FP_TYPE A, CHEM_FP_TYPE B, CHEM_FP_TYPE k0_300K,
                               CHEM_FP_TYPE n,CHEM_FP_TYPE kinf_300K,CHEM_FP_TYPE m,CHEM_FP_TYPE temp,CHEM_FP_TYPE cair) {
COUNT_OPS(2,1,1,0,0,0,0,0) ;
   const CHEM_FP_TYPE prefactor = (A)*exp(-(B)/(temp)) ;
   const CHEM_FP_TYPE falloff   = TROE((k0_300K),(n),(kinf_300K),(m),(temp),(cair)) ;
   return(prefactor*falloff) ;
}

// Returns  A0 * exp(-B0/TEMP).
CHEM_FP_TYPE
#ifndef REORDER_EXEC
__device__ __noinline__ 
#endif
ARR2( CHEM_FP_TYPE A0 , CHEM_FP_TYPE B0 , CHEM_FP_TYPE TEMP )
{
COUNT_OPS(0,0,1,0,1,0,0,0) ;
  const CHEM_FP_TYPE exponent = -B0/TEMP ;
  return(A0 * exp(exponent)) ;
}

// Returns  temp^2 * c * exp(-d/temp).
CHEM_FP_TYPE
#ifndef REORDER_EXEC
__device__ __noinline__ 
#endif
THERMAL_T2(CHEM_FP_TYPE c,CHEM_FP_TYPE d,CHEM_FP_TYPE temp) {
COUNT_OPS(2,1,1,0,1,1,0,0) ;
        const CHEM_FP_TYPE temp_sq = pow((CHEM_FP_TYPE)(temp),(CHEM_FP_TYPE)2.e0) ;
        const CHEM_FP_TYPE expo    = exp(-(d)/(temp)) ;
        return(temp_sq*(c)*expo) ;
}

// Per-cell setup kernel run before the integrator.
//
// Each thread with tid < chunk_sz handles one cell and:
//   * sets AbsTol to 1 and RelTol to 1e-3 for all NSPEC species,
//   * fills the fixed-species entries indf_M and indf_H2O from rho and qv,
//   * divides the NJV jv values by 60 and multiplies the NVAR var values
//     (clamped at 0) by the ppmV -> molecules/cm3 factor `conv`,
//   * derives rc_n2o5 from a relative-humidity estimate,
//   * evaluates the textually included radm2sorg_Update_RCONST.h with
//     RCONST(I) mapped onto RCONST0[MY_I(I-1)].
//
// Parameters:
//   jvvar       jv values followed by var values; with the default layout the
//               var section starts NJV*chunk_sz elements after the jv section,
//               with OLD_LAYOUT each cell owns NJV+NVAR+4 consecutive elements
//               (the last four being rho_phy, t_phy, p_phy, qv)
//   RCONST0_    rate-constant storage written via the RCONST macro
//   fix_, AbsTol_, RelTol_   per-cell fixed-species and tolerance arrays
//   B0_scratch_ scratch area, SCRATCH_SIZE elements per thread
//   rho_phy, t_phy, p_phy, qv   per-cell inputs (default layout only)
//   rc_n2o5_0, dtstepc   not referenced in the visible body
//               NOTE(review): may be used by the included header -- confirm
//   chunk_sz    number of valid cells; threads beyond it do nothing
//
// NOTE(review): ti, bi, bx, PERMUTE_TID, MY_I, MAX, indf_M, indf_H2O and the
// ips/ipe/jps/jpe names used for d2 are defined outside the visible source.
// The local names below (jv, var, fix, temp, c_m, conv, rc_n2o5, ...) are
// presumably referenced by radm2sorg_Update_RCONST.h, so they must not be
// renamed or removed without checking that header.
void 
__global__ radm2sorg_setup_gpu (
    CHEM_FP_TYPE * jvvar  
  , RCONST_TYPE * RCONST0_
  , CHEM_FP_TYPE * fix_, CHEM_FP_TYPE * AbsTol_, CHEM_FP_TYPE * RelTol_
  , CHEM_FP_TYPE * B0_scratch_
#ifndef OLD_LAYOUT
  , CHEM_FP_TYPE * rho_phy , CHEM_FP_TYPE * t_phy , CHEM_FP_TYPE * p_phy , CHEM_FP_TYPE * qv
#endif
  , float rc_n2o5_0   , float dtstepc , int chunk_sz
)
{
   int i, k, ierr ;

   // Flat thread index; PERMUTE_TID may remap it (macro defined elsewhere).
   int tid = ti + bi * bx ; 

   PERMUTE_TID ;

//fprintf(stderr,"  setup ti %d  bi %d bx %d tid %d chunk_sz %d\n",ti,bi, bx, tid, chunk_sz) ;

 // Guard: the grid may contain more threads than there are cells.
 if ( tid < chunk_sz ) {

   // Per-thread base pointers into the shared arrays.
#ifndef OLD_LAYOUT
   CHEM_FP_TYPE *jv         = (jvvar      + tid     ) ;  //NJV
   CHEM_FP_TYPE *var        = (jv         + NJV*chunk_sz ) ;  // NVAR
   RCONST_TYPE *RCONST0    = (RCONST0_   + tid     ) ;  // NREACT
   CHEM_FP_TYPE *AbsTol     = (AbsTol_    + tid     ) ;  // NSPEC
   CHEM_FP_TYPE *RelTol     = (RelTol_    + tid     ) ;  // NSPEC
   CHEM_FP_TYPE *fix        = (fix_       + tid     ) ;   // NFIX
#else
   CHEM_FP_TYPE *jv         = (jvvar     +      tid     *(NJV+NVAR+4) ) ;
   CHEM_FP_TYPE *var        = (jv                          +NJV     ) ;
   RCONST_TYPE *RCONST0    = (RCONST0_    + tid *NREACT  ) ;
   CHEM_FP_TYPE *rho_phy    = (var                         +NVAR    ) ;
   CHEM_FP_TYPE *t_phy      = (rho_phy                     +1       ) ;
   CHEM_FP_TYPE *p_phy      = (t_phy                       +1       ) ;
   CHEM_FP_TYPE *qv         = (p_phy                       +1       ) ;
   CHEM_FP_TYPE *fix        = (fix_        + tid *NFIX    ) ;
   CHEM_FP_TYPE *AbsTol     = (AbsTol_     + tid *NSPEC   ) ;
   CHEM_FP_TYPE *RelTol     = (RelTol_     + tid *NSPEC   ) ;
#endif
   CHEM_FP_TYPE *B0_scratch = (B0_scratch_ + tid *SCRATCH_SIZE     ) ;

   CHEM_FP_TYPE c_mitemp ;

   float navgrdo = 6.022e23 ;
   float mwh = 1.0079 ;
   float mwo = 15.9994 ;
   float mwair = 28.97 ;
   float mwh2o = 2*mwh + mwo ;
   // Factors applied to rho below to obtain the molec/cm^3 values stored in fix[].
   float dens2con_a = 1.e-3 * (1./mwair) * navgrdo ;   // uses mwair
   float dens2con_w = 1.e-3 * (1./mwh2o) * navgrdo ;   // uses mwh2o

   // Uniform tolerances assigned to every species.
   CHEM_FP_TYPE atols = 1. ;
   CHEM_FP_TYPE rtols = 1.e-3 ;

   // Not used in the visible body (possibly used by the included header).
   int d2 = (ipe-ips+1)*(jpe-jps+1) ;

   for ( i = 0 ; i < NSPEC ; i++ ) {
     AbsTol[MY_I(i)] = atols ;
     RelTol[MY_I(i)] = rtols ;
   }

   CHEM_FP_TYPE c_m, temp, conv, oconv ;
   CHEM_FP_TYPE rho, q, pres ;

   // Load this cell's inputs.
#ifndef OLD_LAYOUT
   rho  = rho_phy[tid] ;
   temp = t_phy[tid] ;
   q    = qv[tid] ;
   pres = p_phy[tid] ;
#else
   rho  = *rho_phy ;
   temp = *t_phy ;
   q    = *qv ;
   pres = *p_phy ;
#endif

      // 3rd body concentration (molec/cm^3)
   fix[MY_I(indf_M)]  = dens2con_a * rho ; 
   c_m = fix[MY_I(indf_M)] ;

      // water concentration (molec/cm^3)
   fix[MY_I(indf_H2O)] = dens2con_w * q * rho ;

      // conversion from ppmV to molecules/cm3 and back
   conv=1.e-6*dens2con_a* rho ;
   // Emulation-only debugging override of rho and conv for thread 0.
#ifdef DEVICEEMU 
if ( tid == 0 ) fprintf(stderr," W A R N I N G   :  HARDWIRED VALUE OF RHO_PHY FOR DEBUGGING\n") ;
if ( tid == 0 ) rho = 0.116835856438E+01 ;
if ( tid == 0 ) conv = 0.242866972696E+14 ;
#endif
   oconv = 1.e0/conv ;


#ifdef DEVICEEMU
if ( tid == 0 ) fprintf(stderr,"navgrdo = %20.12e\n",navgrdo) ;
if ( tid == 0 ) fprintf(stderr,"mwair = %20.12e\n",mwair) ;
if ( tid == 0 ) fprintf(stderr,"dens2con_a = %20.12e\n",dens2con_a) ;
if ( tid == 0 ) fprintf(stderr,"rho_phy = %20.12e\n",*rho_phy) ;
if ( tid == 0 ) fprintf(stderr,"conv = %20.12e\n",conv) ;
#endif

   // Rescale inputs in place: jv divided by 60, var clamped at 0 and
   // multiplied by conv.
   for ( i = 0 ; i < NJV  ; i++ ) jv[MY_I(i)] = jv[MY_I(i)] / 60. ;
   for ( i = 0 ; i < NVAR ; i++ ) var[MY_I(i)] = conv * MAX(var[MY_I(i)],0.) ;

//fprintf(stderr,"p_phy %e\n",pres) ;
//fprintf(stderr,"t_phy %e\n",temp ) ;
//fprintf(stderr,"qv    %e\n",q) ;
   // rh = q/qvs clamped to [0,1], with es and qvs computed from temp and pres;
   // rc_n2o5 is then a function of rh only.
   CHEM_FP_TYPE es,qvs,rh,rc_n2o5 ;
   es  = 1000.*0.6112*exp(17.67*(temp-273.15)/(temp- 29.65)) ;
   qvs = es / ( pres - es ) ;
   rh =  q / qvs ;
   rh = min ( max ( rh, 0.), 1.) ;
//fprintf(stderr,"rh %e\n",rh) ;
   rc_n2o5 =  1.0 / ( 3.6E4 * exp( -pow( ( rh / 0.28 ) , 2.8 ) ) + 300.0 ) ;

# ifdef DEVICEEMU
if ( tid == 0 ) fprintf(stderr,"rc_n2o5 %e\n",rc_n2o5) ;
# endif

   // The included header is written with 1-based RCONST(I); map it onto this
   // thread's slice of RCONST0.
#define RCONST(I) RCONST0[MY_I((I)-1)]
#include "radm2sorg_Update_RCONST.h"
#undef RCONST

 } // guard
}

// Post-integration kernel: converts the NVAR var values of each cell back
// from molecules/cm3 to ppmV by multiplying with 1/conv, clamping the result
// at zero. Threads with tid >= chunk_sz do nothing.
//
//   jvvar     same jv/var buffer that was handed to the setup kernel
//   rho_phy   per-cell density (default layout only; with OLD_LAYOUT it is
//             read from the cell's own slot behind the var values)
//   chunk_sz  number of valid cells
void
#ifndef REORDER_EXEC
__global__ 
#endif
radm2sorg_restore_gpu (
    CHEM_FP_TYPE * jvvar
#ifndef OLD_LAYOUT
  , CHEM_FP_TYPE * rho_phy
#endif
  , int chunk_sz
)
{
   // Declared ahead of PERMUTE_TID, whose expansion is not visible here.
   int i, k, ierr ;

   int tid = ti + bi * bx ;

   PERMUTE_TID ;

 if ( tid < chunk_sz ) {

   // Locate this cell's var section and its density.
#ifndef OLD_LAYOUT
   CHEM_FP_TYPE *jv         = jvvar + tid ;
   CHEM_FP_TYPE *var        = jv + NJV*chunk_sz ;
   CHEM_FP_TYPE rho         = rho_phy[tid] ;
#else
   CHEM_FP_TYPE *jv         = jvvar + tid *(NJV+NVAR+4) ;
   CHEM_FP_TYPE *var        = jv + NJV ;
   CHEM_FP_TYPE *rho_phy    = var + NVAR ;
   CHEM_FP_TYPE rho         = *rho_phy ;
#endif

   // Same single-precision constants as used when the values were scaled up.
   const float navgrdo    = 6.022e23 ;
   const float mwair      = 28.97 ;
   const float dens2con_a = 1.e-3 * (1./mwair) * navgrdo ;

   // conversion from ppmV to molecules/cm3 (conv) and back (oconv)
   const CHEM_FP_TYPE conv  = 1.e-6*dens2con_a* rho ;
   const CHEM_FP_TYPE oconv = 1.e0/conv ;

   for ( int ispec = 0 ; ispec < NVAR ; ++ispec ) {
      CHEM_FP_TYPE *slot = &var[MY_I(ispec)] ;
      const CHEM_FP_TYPE scaled = oconv * (*slot) ;
      *slot = MAX(0.e0,scaled) ;
   }

 } // guard

}

} // extern C

//----------------------------------------------------------------------------
//                     H O S T   C O D E 
//----------------------------------------------------------------------------

extern "C" {

// BOUNDS(V,S): records the address range of buffer V as [V_lb, V_ub), where
// V_ub = V + S*chunk_sz elements. V_lb, V_ub and chunk_sz must already be
// declared in the expanding scope (V_lb / V_ub are not declared in the visible
// source -- presumably in one of the project headers).
#define BOUNDS(V,S) V##_lb =  V ; V##_ub =  V##_lb + (S) * chunk_sz  ;


// Host driver for the GPU chemistry solver (callable from Fortran: every
// argument is passed by reference).
//
// Outline:
//   1. copy the (0-based) domain/memory/patch bounds, the sparsity tables, the
//      integrator controls and the Rosenbrock coefficients to device symbols;
//   2. on the first call allocate all host staging and device work buffers
//      (they are kept in function-static pointers and reused afterwards);
//   3. loop over the patch in chunks of *chunk_sz0 cells: pack the chunk via
//      pack_jvvar.inc, upload, run radm2sorg_setup_gpu,
//      radm2sorg_ros_Integrator and radm2sorg_restore_gpu, download and unpack.
//
// Parameters:
//   chem ... ph_hno4_2   model arrays; the visible code only touches a subset
//                        directly (through TODEV), the rest are presumably
//                        accessed by pack_jvvar.inc -- confirm there
//   ids0 ... kpe0        1-based domain (d), memory (m) and patch (p) bounds
//   chunk_sz0            number of cells processed per kernel launch
//   XXX0                 threads per block
//
// Returns 0 on success, 1 on invalid arguments, host allocation failure or
// when a later call needs larger buffers than the first call allocated.
//
// Fixes relative to the earlier revision:
//   * chunk_sz is read before it is copied to the device symbol chunk_sz_d;
//   * all buffers guarded by `first` are static, so the pointers survive
//     between calls instead of being indeterminate on every call but the first;
//   * IERR / debuggal are allocated once instead of leaking on each call;
//   * the function returns a value on every path;
//   * a zero/negative block size or chunk size is rejected (avoids % by zero);
//   * idx3 offsets k by kms like i and j (identical result when kms == 0);
//   * fabs() replaces abs() on floating-point values;
//   * cudaThreadSynchronize() results after kernel launches are checked.
int 
CHEM_DRIVER_ON_HOST (  float * chem, float * rc_n2o5, float *dtstepc
                     , float * rho_phy, float * qv, float * t_phy, float * p_phy
                     , float * addt, float * addx, float * addc, float * etep, float * oltp
                     , float * olip, float * cslp, float * limp, float * hc5p, float * hc8p
                     , float * tolp, float * xylp, float * apip, float * isop, float * hc3p
                     , float * ethp, float * o3p, float * tco3, float * mo2, float * o1d 
                     , float * olnn, float * olnd, float * rpho, float * xo2, float * ketp
                     , float * xno2, float * ol2p, float * oln, float * macp               
                     , float * ph_o31d, float * ph_o33p, float * ph_no2, float * ph_no3o2, float * ph_no3o
                     , float * ph_hno2, float * ph_hno3, float * ph_hno4, float * ph_h2o2, float * ph_ch2or
                     , float * ph_ch2om, float * ph_ch3cho, float * ph_ch3coch3, float * ph_ch3coc2h5, float * ph_hcocho
                     , float * ph_ch3cocho, float * ph_hcochest, float * ph_ch3o2h, float * ph_ch3coo2h, float * ph_ch3ono2
                     , float * ph_hcochob, float * ph_macr, float * ph_n2o5, float * ph_o2, float * ph_pan
                     , float * ph_acet, float * ph_mglo, float * ph_hno4_2
                     , int * ids0, int * ide0, int *  jds0, int * jde0, int *  kds0, int * kde0
                     , int * ims0, int * ime0, int *  jms0, int * jme0, int *  kms0, int * kme0
                     , int * ips0, int * ipe0, int *  jps0, int * jpe0, int *  kps0, int * kpe0 
                     , int * chunk_sz0 , int * XXX0 
          )
{

fprintf(stderr,"*dtstepc %e\n",*dtstepc) ;


      int i, j, k ;
      static int first = 1 ;
      // Convert the 1-based (Fortran) bounds to 0-based.
      int ids=*ids0-1; int ide=*ide0-1; int jds=*jds0-1; int jde=*jde0-1; int kds=*kds0-1; int kde=*kde0-1 ;
      int ims=*ims0-1; int ime=*ime0-1; int jms=*jms0-1; int jme=*jme0-1; int kms=*kms0-1; int kme=*kme0-1 ;
      int ips=*ips0-1; int ipe=*ipe0-1; int jps=*jps0-1; int jpe=*jpe0-1; int kps=*kps0-1; int kpe=*kpe0-1 ;

//            argument types are: (int, int, unsigned long, int, cudaMemcpyKind)

      int s0, e0 ;
      int s4, e4 ;
      s0 = rsl_internal_microclock_() ;
      s4 = s0 ;
      int s = sizeof(int) ;

      // Element counts of the memory-dimensioned (d*) and patch (p*) arrays.
      int d2 = (ime-ims+1)*(jme-jms+1) ;
      int d3 = d2*(kme-kms+1) ;
      int d4 = d3 * (82) ;

      int p2 = (ipe-ips+1)*(jpe-jps+1) ;
      int p3 = p2*(kpe-kps+1) ;
      int p4 = p3 * (82) ;

      int ichunk, icell ;
      int chunk_sz ;
      int xxx ;

      // Read the launch parameters up front: chunk_sz is copied to the device
      // below and xxx is used as a divisor further down.
      chunk_sz = *chunk_sz0 ;
      xxx = * XXX0 ;
      if ( chunk_sz <= 0 || xxx <= 0 ) {
        fprintf(stderr,"chem_driver_on_host: invalid chunk size %d or block size %d\n",chunk_sz,xxx) ;
        return(1) ;
      }

      // All buffers are sized on the first call only; refuse a later call that
      // would overrun them.
      static int alloc_chunk_sz = 0 ;
      static int alloc_d3 = 0 ;
      if ( first ) {
        alloc_chunk_sz = chunk_sz ;
        alloc_d3 = d3 ;
      } else if ( chunk_sz > alloc_chunk_sz || d3 > alloc_d3 ) {
        fprintf(stderr,"chem_driver_on_host: chunk size %d / array size %d exceed the first-call allocation (%d / %d)\n",
                chunk_sz,d3,alloc_chunk_sz,alloc_d3) ;
        return(1) ;
      }

#if 1
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("ids_d",&ids,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("ide_d",&ide,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("jds_d",&jds,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("jde_d",&jde,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("kds_d",&kds,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("kde_d",&kde,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("ims_d",&ims,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("ime_d",&ime,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("jms_d",&jms,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("jme_d",&jme,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("kms_d",&kms,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("kme_d",&kme,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("ips_d",&ips,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("ipe_d",&ipe,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("jps_d",&jps,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("jpe_d",&jpe,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("kps_d",&kps,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("kpe_d",&kpe,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("chunk_sz_d",&chunk_sz,sizeof(int),0,cudaMemcpyHostToDevice) ) ;
#endif

      // Sparse-matrix index tables (host copies come from the included header).
#include "radm2sorg_SP.h"
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("LU_IROW",&LU_IROW_h,659*sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("LU_ICOL",&LU_ICOL_h,659*sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("LU_CROW",&LU_CROW_h,60*sizeof(int),0,cudaMemcpyHostToDevice) ) ;
      CUDA_SAFE_CALL( cudaMemcpyToSymbol("LU_DIAG",&LU_DIAG_h,60*sizeof(int),0,cudaMemcpyHostToDevice) ) ;


fprintf(stderr,"%s %d\n",__FILE__,__LINE__) ;
fprintf(stderr,"ids ide jds jde kds kde %d %d %d %d %d %d\n",ids,ide,jds,jde,kds,kde) ;
fprintf(stderr,"ims ime jms jme kms kme %d %d %d %d %d %d\n",ims,ime,jms,jme,kms,kme) ;
fprintf(stderr,"ips ipe jps jpe kps kpe %d %d %d %d %d %d\n",ips,ipe,jps,jpe,kps,kpe) ;
fprintf(stderr,"d4 %d d3 %d \n",d4,d3) ;
      // Upload the 3-D arrays; each TODEV declares a static <name>_d pointer.
      TODEV(addt,d3) ;
      TODEV(addx,d3) ;
      TODEV(addc,d3) ;
      TODEV(etep,d3) ;
      TODEV(cslp,d3) ;
      TODEV(limp,d3) ;
      TODEV(apip,d3) ;
      TODEV(isop,d3) ;
      TODEV(olnn,d3) ;
      TODEV(olnd,d3) ;
      TODEV(rpho,d3) ;
      TODEV(xno2,d3) ;
      TODEV(macp,d3) ;

      // Launch geometry: ceil(chunk_sz / xxx) blocks of xxx threads.
      int remx = min(1,chunk_sz % xxx) ;
      dim3 dimBlock( xxx  ) ;
      dim3 dimGrid ( chunk_sz / xxx + remx ) ;
#ifndef NEW_APPROACH
      dim3 dimBlock2( xxx  ) ;
      dim3 dimGrid2 ( chunk_sz / xxx + remx ) ;
#else
      dim3 dimBlock2( 64  ) ;
      dim3 dimGrid2 ( 120 ) ;    // number of MPs
#endif

      My_Nb = chunk_sz / xxx + remx ;
#ifdef REORDER_EXEC
      bx = xxx ;
#endif

      fprintf(stderr,"Call to gpu: chunk_size = %d \n",chunk_sz ) ;
      fprintf(stderr,"Call to gpu: block dims %d %d\n",dimBlock.x,dimBlock.y) ;
      fprintf(stderr,"Call to gpu: grid  dims %d %d\n",dimGrid.x,dimGrid.y) ;

      // ---- one-time allocations (pointers are static so they persist) ----
#ifndef OLD_LAYOUT

      static CHEM_FP_TYPE *jvvar_h   ; if (first) { jvvar_h = (CHEM_FP_TYPE*)malloc( (NJV+NVAR)*chunk_sz*sizeof(CHEM_FP_TYPE)) ; }
      if ( jvvar_h == NULL ) { fprintf(stderr,"chem_driver_on_host: host allocation failed\n") ; return(1) ; }
      {int iii ; for (iii=0;iii<(NJV+NVAR)*chunk_sz;iii++){jvvar_h[iii]=0.;}}
      static CHEM_FP_TYPE *jvvar_d   ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(jvvar_d),(NJV+NVAR)*chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
BOUNDS(jvvar_d,(NJV+NVAR)) ;


      static CHEM_FP_TYPE *rho_phy_h ; if (first) { rho_phy_h = (CHEM_FP_TYPE*)malloc( chunk_sz*sizeof(CHEM_FP_TYPE)) ; }
      static CHEM_FP_TYPE *t_phy_h   ; if (first) { t_phy_h = (CHEM_FP_TYPE*)malloc( chunk_sz*sizeof(CHEM_FP_TYPE)) ; }
      static CHEM_FP_TYPE *p_phy_h   ; if (first) { p_phy_h = (CHEM_FP_TYPE*)malloc( chunk_sz*sizeof(CHEM_FP_TYPE)) ; }
      static CHEM_FP_TYPE *qv_h      ; if (first) { qv_h = (CHEM_FP_TYPE*)malloc( chunk_sz*sizeof(CHEM_FP_TYPE)) ; }
      if ( rho_phy_h == NULL || t_phy_h == NULL || p_phy_h == NULL || qv_h == NULL ) {
        fprintf(stderr,"chem_driver_on_host: host allocation failed\n") ; return(1) ;
      }

      static CHEM_FP_TYPE *rho_phy_d ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(rho_phy_d),chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *t_phy_d   ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(t_phy_d),chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *p_phy_d   ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(p_phy_d),chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *qv_d      ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(qv_d),chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
#else
//      CHEM_FP_TYPE *jvvar_h   ; if (first) { CUDA_SAFE_CALL(cudaMallocHost((void **)&(jvvar_h),(NJV+NVAR+4)*chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *jvvar_h   ; if (first) { jvvar_h = (CHEM_FP_TYPE*)malloc( (NJV+NVAR+4)*chunk_sz*sizeof(CHEM_FP_TYPE)) ; }
      if ( jvvar_h == NULL ) { fprintf(stderr,"chem_driver_on_host: host allocation failed\n") ; return(1) ; }
      {int iii ; for (iii=0;iii<(NJV+NVAR+4)*chunk_sz;iii++){jvvar_h[iii]=0.;}}
      static CHEM_FP_TYPE *jvvar_d   ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(jvvar_d),(NJV+NVAR+4)*chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
#endif

      static RCONST_TYPE *RCONST0 ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(RCONST0), NREACT*chunk_sz*sizeof(RCONST_TYPE))) ; }
#ifdef RCONST_IN_TEXTURE
      CUDA_SAFE_CALL( cudaBindTexture(0,RCONST0_tex,RCONST0,NREACT*chunk_sz*sizeof(RCONST_TYPE)) ) ;
#endif
      static CHEM_FP_TYPE *fix        ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(fix    ), NFIX  *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *AbsTol     ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(AbsTol ), NSPEC *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *RelTol     ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(RelTol ), NSPEC *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *B0_scratch ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(B0_scratch), SCRATCH_SIZE *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *Ynew       ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(Ynew   ), NVAR *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *Fcn0       ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(Fcn0   ), NVAR *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *Fcn        ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(Fcn    ), NVAR *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *K          ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(K      ), ROS_S*NVAR *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *dFdT       ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(dFdT   ), NVAR *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *Jac0       ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(Jac0   ), LU_NONZERO *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *Ghimj      ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(Ghimj  ), LU_NONZERO *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE *Yerr       ; if (first) { CUDA_SAFE_CALL(cudaMalloc((void **)&(Yerr   ), NVAR *chunk_sz*sizeof(CHEM_FP_TYPE))) ; }

      static int   *Pivot_d    ;  if (first) { CUDA_SAFE_CALL(cudaMalloc( (void **)&(Pivot_d), NVAR * chunk_sz * sizeof(int))) ; }
      static int   *IERR_d     ;  if (first) { CUDA_SAFE_CALL(cudaMalloc( (void **)&(IERR_d), chunk_sz * sizeof(int))) ; }
      // Host mirrors are allocated once and reused (previously leaked per call).
      static int   *IERR       ;  if (first) { IERR = (int *)malloc(chunk_sz*sizeof(int)) ; }
      static CHEM_FP_TYPE   *debuggal_d     ;  if (first) { CUDA_SAFE_CALL(cudaMalloc( (void **)&(debuggal_d), DEBUGGAL_SIZE*sizeof(CHEM_FP_TYPE))) ; }
      static CHEM_FP_TYPE   *debuggal       ;  if (first) { debuggal = (CHEM_FP_TYPE *)malloc(DEBUGGAL_SIZE*chunk_sz*sizeof(CHEM_FP_TYPE)) ; }
      if ( IERR == NULL || debuggal == NULL ) {
        fprintf(stderr,"chem_driver_on_host: host allocation failed\n") ; return(1) ;
      }

      cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat) ;
      static cudaArray * Stoich_Left_d ;
      if ( first ) {CUDA_SAFE_CALL(cudaMallocArray( &(Stoich_Left_d) , &channelDesc , NSPEC , NREACT ));}

      static cudaArray * Stoich_d ;
      if ( first ) {CUDA_SAFE_CALL(cudaMallocArray( &(Stoich_d) , &channelDesc , NSPEC , NREACT ));}

#ifdef REROLL_SECOND_PART
      static int * LUstructJ_bitmask ; if (first) LUstructJ_bitmask = (int *) malloc( (NVAR*NVAR/INTBITS + 1) *sizeof(int)) ;
      if ( LUstructJ_bitmask == NULL ) { fprintf(stderr,"chem_driver_on_host: host allocation failed\n") ; return(1) ; }
      for (i=0;i<(NVAR*NVAR/INTBITS+1);i++) { LUstructJ_bitmask[i] = 0 ; }
#endif

#ifdef REROLL_FCN
      static int * used_in_fun_h ; if ( first ) used_in_fun_h = (int *) malloc((NREACT/INTBITS+1)*sizeof(int)) ;
      if ( used_in_fun_h == NULL ) { fprintf(stderr,"chem_driver_on_host: host allocation failed\n") ; return(1) ; }
      for (i=0;i<(NREACT/INTBITS+1);i++) { used_in_fun_h[i] = 0 ; }
#endif

      first = 0 ;

      // ---- integrator controls ----
      CHEM_FP_TYPE Tstart_h = 0. ;
      CHEM_FP_TYPE Tend_h   = *dtstepc ;
      CHEM_FP_TYPE Hmin_h = ZERO ;
      // fabs, not abs: the operands are floating point.
      CHEM_FP_TYPE Hmax_h = fabs(Tend_h-Tstart_h) ;
      CHEM_FP_TYPE Hstart_h = min(2.4,fabs(Tend_h-Tstart_h))  ;
      CHEM_FP_TYPE FacMin_h = 0.2 ;
      CHEM_FP_TYPE FacMax_h = 6.0 ;
      CHEM_FP_TYPE FacRej_h = 0.1 ;
      CHEM_FP_TYPE FacSafe_h = 0.9 ;
      CHEM_FP_TYPE Roundoff_h = 2.220446049250313E-016 ;
      int Max_no_steps_h = 100000 ;
      int Autonomous_h = 1 ;
      int VectorTol_h = 1 ;

      CUDA_SAFE_CALL(cudaMemcpyToSymbol("Tstart",&Tstart_h,sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("Tend",&Tend_h,sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("Hmin",&Hmin_h,sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("Hmax",&Hmax_h,sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("Hstart",&Hstart_h,sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;

      CUDA_SAFE_CALL(cudaMemcpyToSymbol("FacMin",&FacMin_h,sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("FacMax",&FacMax_h,sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("FacRej",&FacRej_h,sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("FacSafe",&FacSafe_h,sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("Roundoff",&Roundoff_h,sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
//      CUDA_SAFE_CALL(cudaMemcpyToSymbol("Max_no_steps",&Max_no_steps_h,sizeof(int),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("Autonomous",&Autonomous_h,sizeof(int),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("VectorTol",&VectorTol_h,sizeof(int),0,cudaMemcpyHostToDevice)) ;

// Inline of SUBROUTINE radm2sorg_Ros3

//!~~~> The coefficient matrices A and C are strictly lower triangular.
//!   The lower triangular (subdiagonal) elements are stored in row-wise order:
//!   A(2,1) = ros_A(1), A(3,1)=ros_A(2), A(3,2)=ros_A(3), etc.
//!   The general mapping formula is:
//!       A(i,j) = ros_A( (i-1)*(i-2)/2 + j )
//!       C(i,j) = ros_C( (i-1)*(i-2)/2 + j )
      int ros_S_h = ROS_S ;
      CHEM_FP_TYPE ros_A_h[3] ;
      ros_A_h[0] = 1.0e0 ;
      ros_A_h[1] = 1.0e0 ;
      ros_A_h[2] = 0.0e0 ;
      CHEM_FP_TYPE ros_C_h[3] ;
      ros_C_h[0] = -0.10156171083877702091975600115545e01 ;
      ros_C_h[1] =  0.40759956452537699824805835358067e01 ;
      ros_C_h[2] =  0.92076794298330791242156818474003e01 ;
//!~~~> Does the stage i require a new function evaluation (ros_NewF(i)=TRUE)
//!   or does it re-use the function evaluation from stage i-1 (ros_NewF(i)=FALSE)
      int ros_NewF_h[3] ;
      ros_NewF_h[0] = 1 ;
      ros_NewF_h[1] = 1 ; 
      ros_NewF_h[2] = 0 ;
//!~~~> M_i = Coefficients for new step solution
      CHEM_FP_TYPE ros_M_h[3] ;
      ros_M_h[0] =  0.1e01 ;
      ros_M_h[1] =  0.61697947043828245592553615689730e01 ;
      ros_M_h[2] = -0.42772256543218573326238373806514e00 ;
//! E_i = Coefficients for error estimator
      CHEM_FP_TYPE ros_E_h[3] ;
      ros_E_h[0] =  0.5e0 ;
      ros_E_h[1] = -0.29079558716805469821718236208017e01 ;
      ros_E_h[2] =  0.22354069897811569627360909276199e00 ;
//!~~~> ros_ELO = estimator of local order - the minimum between the
//!    main and the embedded scheme orders plus 1
      CHEM_FP_TYPE ros_ELO_h ;
      ros_ELO_h = 3.0e0 ;
//!
//!~~~> Y_stage_i ~ Y( T + H*Alpha_i )
      CHEM_FP_TYPE ros_Alpha_h[3] ;
      ros_Alpha_h[0]= 0.0e0 ;
      ros_Alpha_h[1]= 0.43586652150845899941601945119356e0 ;
      ros_Alpha_h[2]= 0.43586652150845899941601945119356e0 ;
//!~~~> Gamma_i = \sum_j  gamma_{i,j}
      CHEM_FP_TYPE ros_Gamma_h[3] ;
      ros_Gamma_h[0]= 0.43586652150845899941601945119356e0 ;
      ros_Gamma_h[1]= 0.24291996454816804366592249683314e0 ;
      ros_Gamma_h[2]= 0.21851380027664058511513169485832e01 ;

      CUDA_SAFE_CALL(cudaMemcpyToSymbol("ros_S",&ros_S_h,sizeof(int),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("ros_NewF",ros_NewF_h,3*sizeof(int),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("ros_ELO",&ros_ELO_h,sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("ros_A",ros_A_h,3*sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("ros_C",ros_C_h,3*sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("ros_M",ros_M_h,3*sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("ros_E",ros_E_h,3*sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("ros_Alpha",ros_Alpha_h,3*sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("ros_Gamma",ros_Gamma_h,3*sizeof(CHEM_FP_TYPE),0,cudaMemcpyHostToDevice)) ;

#ifdef REROLL_SECOND_PART
// these are all zeros and ones -- compress down to a bitmask so it fits in constant mem
# include "LUstructJ.h"
      for ( i = 0 ; i < NVAR*NVAR ; i++ ) {
        LUstructJ_bitmask[i/INTBITS] |= ( LUstructJ_h[i] << (i%INTBITS) ) ;
      }
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("LUstructJ",LUstructJ_bitmask,
                    (NVAR*NVAR/INTBITS+1)*sizeof(int),0,cudaMemcpyHostToDevice)) ;
#endif

#include "structB.h"
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("structB",structB_h,9204*sizeof(int),0,cudaMemcpyHostToDevice)) ;

      // Stoichiometry tables are uploaded into 2-D cudaArrays and bound to textures.
#include "Stoich.h"
      { CUDA_SAFE_CALL(cudaMemcpy2DToArray( Stoich_d, 0, 0, Stoich_h, NSPEC*sizeof(float),
                     NSPEC*sizeof(float), NREACT, cudaMemcpyHostToDevice )) ;
        Stoich_tex.addressMode[0] = cudaAddressModeClamp ;
        Stoich_tex.addressMode[1] = cudaAddressModeClamp ;
        Stoich_tex.filterMode = cudaFilterModePoint ;
        Stoich_tex.normalized = false ;
        CUDA_SAFE_CALL( cudaBindTextureToArray( Stoich_tex, Stoich_d, channelDesc ) ) ; }

#include "Stoich_Left.h"
      { CUDA_SAFE_CALL(cudaMemcpy2DToArray( Stoich_Left_d, 0, 0, Stoich_Left_h, NSPEC*sizeof(float),
                     NSPEC*sizeof(float), NREACT, cudaMemcpyHostToDevice )) ;
        Stoich_Left_tex.addressMode[0] = cudaAddressModeClamp ;
        Stoich_Left_tex.addressMode[1] = cudaAddressModeClamp ;
        Stoich_Left_tex.filterMode = cudaFilterModePoint ;
        Stoich_Left_tex.normalized = false ;
        CUDA_SAFE_CALL( cudaBindTextureToArray( Stoich_Left_tex, Stoich_Left_d, channelDesc ) ) ; }

#ifdef REROLL_FCN
// precompute this and store as a bitmask in constant mem
      for ( j = 0 ; j < NREACT ; j++ ) {
        for ( i = 0 ; i < NVAR ; i++ ) {
          if ( Stoich_h[i+j*NSPEC] != 0. ) { used_in_fun_h[j/INTBITS] |= (1 << j % INTBITS) ; break ; }
        }
      }
      CUDA_SAFE_CALL(cudaMemcpyToSymbol("used_in_fun",used_in_fun_h,
                    (NREACT/INTBITS+1)*sizeof(int),0,cudaMemcpyHostToDevice)) ;
#endif

      e0 = rsl_internal_microclock_() ;
      fprintf(stderr,"setup : %d microseconds\n",e0-s0) ;

//  END SUBROUTINE  radm2sorg_Ros3

      // ---- chunk loop: pack, upload, solve, download, unpack ----
      int n ;
      int nchunk ;
      CHEM_FP_TYPE *p ;
#ifndef NEW_APPROACH
      for ( nchunk = 0 ; ; nchunk++ ) {
#else
      for ( nchunk = 0 ; nchunk == 0  ; nchunk++ ) {
#endif

        ichunk = nchunk * chunk_sz ;
        // Cells in this chunk; the last chunk may be short.
        n = min( chunk_sz , (ipe-ips+1)*(jpe-jps+1)*(kpe-kps+1)-ichunk ) ;

        if ( n <= 0 ) break ;  /* EXIT FROM LOOP */

        int s1,e1 ;
        s1 = rsl_internal_microclock_() ;
        p = jvvar_h ;
        for ( icell = ichunk ; icell < ichunk + n ; icell++ ) {
          // Decompose the flat patch cell number into (i,j,k), i fastest.
          k = kps + icell / ((ipe-ips+1)*(jpe-jps+1)) ;
          j = jps + icell % ((ipe-ips+1)*(jpe-jps+1)) / (ipe-ips+1) ;
          i = ips + icell % ((ipe-ips+1)) ;
          int idx2 = (i-ims) + (j-jms) * ( ime-ims+1 ) ;  // index into source IJ arrays
          int idx3 = (i-ims) + (k-kms) * ( ime-ims+1 ) + (j-jms) * ( ime-ims+1 ) *( kme-kms+1 ) ;  // index into IKJ source arrays
#define TO_AND_FRO(A,B)  A = (CHEM_FP_TYPE) (B)
#define JV_TOO
#include "pack_jvvar.inc"
#ifndef OLD_LAYOUT
          p ++ ;
#else
          p += (NJV+NVAR+4) ;
#endif
        }
        e1 = rsl_internal_microclock_() ;
        fprintf(stderr,"Packing chunk %d : %d microseconds\n",nchunk+1 , e1-s1) ;

#ifndef OLD_LAYOUT
        CUDA_SAFE_CALL(cudaMemcpy(jvvar_d,jvvar_h,((NJV+NVAR) * n * sizeof(CHEM_FP_TYPE)),cudaMemcpyHostToDevice));
        CUDA_SAFE_CALL(cudaMemcpy(rho_phy_d,rho_phy_h, n * sizeof(CHEM_FP_TYPE),cudaMemcpyHostToDevice));
        CUDA_SAFE_CALL(cudaMemcpy(t_phy_d  ,t_phy_h  , n * sizeof(CHEM_FP_TYPE),cudaMemcpyHostToDevice));
        CUDA_SAFE_CALL(cudaMemcpy(p_phy_d  ,p_phy_h  , n * sizeof(CHEM_FP_TYPE),cudaMemcpyHostToDevice));
        CUDA_SAFE_CALL(cudaMemcpy(qv_d     ,qv_h     , n * sizeof(CHEM_FP_TYPE),cudaMemcpyHostToDevice));
#else
        CUDA_SAFE_CALL(cudaMemcpy(jvvar_d,jvvar_h,((NJV+NVAR+4) * n * sizeof(CHEM_FP_TYPE)),cudaMemcpyHostToDevice));
#endif

        int s2,e2 ;
        s2 = rsl_internal_microclock_() ;

     { int e, d ; 
           for (e=0; e<DEBUGGAL_SIZE ; e++) { 
               debuggal[e] = 0. ;
           }

        CUDA_SAFE_CALL(cudaMemcpy(debuggal_d,debuggal,  DEBUGGAL_SIZE * sizeof(CHEM_FP_TYPE),cudaMemcpyHostToDevice));

        fprintf(stderr,"calling radm2sorg_setup_gpu\n") ;
        radm2sorg_setup_gpu <<< dimGrid, dimBlock >>> (
                                     jvvar_d, RCONST0, fix, AbsTol, RelTol, B0_scratch
#ifndef OLD_LAYOUT
                                    ,rho_phy_d ,  t_phy_d ,  p_phy_d ,  qv_d
#endif
                                    ,*rc_n2o5, *dtstepc, n
             ) ;


        // Checked sync: surfaces launch / execution errors of the kernel above.
        CUDA_SAFE_CALL(cudaThreadSynchronize()) ;
        fprintf(stderr,"calling radm2sorg_ros_Integrator, n = %d\n",n) ;

        radm2sorg_ros_Integrator <<< dimGrid2, dimBlock2 >>> (
                                     jvvar_d
#ifndef RCONST_IN_TEXTURE
                                   , RCONST0
#endif
                                   , fix, AbsTol, RelTol, B0_scratch
                                   , Ynew, Fcn0, Fcn, dFdT, K, Jac0, Ghimj, Yerr
                                   , Pivot_d, IERR_d, n
,debuggal_d
        ) ;

        CUDA_SAFE_CALL(cudaThreadSynchronize()) ;
        CUDA_SAFE_CALL(cudaMemcpy(IERR,IERR_d, chunk_sz * sizeof(int),cudaMemcpyDeviceToHost));
        CUDA_SAFE_CALL(cudaMemcpy(debuggal,debuggal_d,  DEBUGGAL_SIZE * sizeof(CHEM_FP_TYPE),cudaMemcpyDeviceToHost));
           // Only the first thread's status is inspected (full scan left disabled).
           //for (e=0; e<chunk_sz ; e++) { 
           for (e=0; e==0 ; e++) { 
             if ( IERR[e] != 0 ) {
               fprintf(stderr,"radm2sorg_ros_Integrator returns error %d at thread %d\n",IERR[e],e);
             }
           }

#if 0 
fprintf(stderr,"-------------------------------------\n") ;
           for (e=0; e<240 ; e++) { 
             fprintf(stderr,"radm2sorg_ros_Integrator debuggal[%d] %e\n",e,debuggal[e]);
           }
fprintf(stderr,"-------------------------------------\n") ;
#endif

        CUDA_SAFE_CALL(cudaThreadSynchronize()) ;
        fprintf(stderr,"calling radm2sorg_restore_gpu\n") ;
        radm2sorg_restore_gpu <<< dimGrid, dimBlock >>> (
                                     jvvar_d
#ifndef OLD_LAYOUT
                                    ,rho_phy_d
#endif
                                    ,n
             ) ;
     }

        CUDA_SAFE_CALL(cudaThreadSynchronize()) ;
        e2 = rsl_internal_microclock_() ;
        fprintf(stderr,"Call to chem (not including data xfer): %d microseconds\n",e2-s2) ;

#ifndef OLD_LAYOUT
        CUDA_SAFE_CALL(cudaMemcpy(jvvar_h,jvvar_d,((NJV+NVAR) * n * sizeof(CHEM_FP_TYPE)),cudaMemcpyDeviceToHost));
#else
        CUDA_SAFE_CALL(cudaMemcpy(jvvar_h,jvvar_d,((NJV+NVAR+4) * n * sizeof(CHEM_FP_TYPE)),cudaMemcpyDeviceToHost));
#endif

        int s3,e3 ;
        s3 = rsl_internal_microclock_() ;
        p = jvvar_h ;
        for ( icell = ichunk ; icell < ichunk + n ; icell++ ) {
          k = icell / ((ipe-ips+1)*(jpe-jps+1)) + kps ;
          j = icell % ((ipe-ips+1)*(jpe-jps+1)) / (ipe-ips+1) + jps ;
          i = icell % ((ipe-ips+1)) + ips ;
          //fprintf(stderr,"i %3d j %3d k %3d icell %3d ichunk %3d\n",i,j,k,icell,ichunk) ;
          int idx2 = (i-ims) + (j-jms) * ( ime-ims+1 ) ;  // index into source IJ arrays
          int idx3 = (i-ims) + (k-kms) * ( ime-ims+1 ) + (j-jms) * ( ime-ims+1 ) *( kme-kms+1 ) ;  // index into IKJ source arrays
          // Same include as the pack step, with the assignment direction reversed.
#define TO_AND_FRO(A,B) B = (float) (A) 
#undef JV_TOO
#include "pack_jvvar.inc"
#ifndef OLD_LAYOUT
          p ++ ;
#else
          p += (NJV+NVAR+4) ;
#endif
        }
        e3 = rsl_internal_microclock_() ;
        fprintf(stderr,"Unpacking chunk %d : %d microseconds\n",nchunk+1 , e3-s3) ;

      }
      e4 = rsl_internal_microclock_() ;
//      fprintf(stderr,"Call to chem (all inclusive): %d microseconds\n",e4-s4) ;

#ifdef ENABLE_OPCOUNT
      fprintf(stderr,"COUNTERS %u,%u,%u,%u,%u,%u,%u,%u\n",MUL_,ADD_,DIV_,SQT_,EXP_,POW_,LD_,ST_) ;
#endif

      return(0) ;
}


} // extern C

