#ifndef NEW_APPROACH

#define ids ids_d
#define ide ide_d
#define jds jds_d
#define jde jde_d
#define kds kds_d
#define kde kde_d
#define ims ims_d
#define ime ime_d
#define jms jms_d
#define jme jme_d
#define kms kms_d
#define kme kme_d
#define ips ips_d
#define ipe ipe_d
#define jps jps_d
#define jpe jpe_d
#define kps kps_d
#define kpe kpe_d

extern "C" int rsl_internal_microclock_() ;

#include "radm2sorg_constant_mem.h"

extern "C" {

#include "radm2sorg_Jac_SP.code"

#include "radm2sorg_Fun.code"

#include "radm2sorg_ros_PrepareMatrix.code"

#include "radm2sorg_KppSolve.code"

#undef DEVICEEMU
void
__global__ radm2sorg_ros_Integrator( CHEM_FP_TYPE * jvvar  
#ifndef RCONST_IN_TEXTURE
  ,RCONST_TYPE * RCONST0_
#endif
  , CHEM_FP_TYPE * fix_, CHEM_FP_TYPE * AbsTol_, CHEM_FP_TYPE * RelTol_
  ,CHEM_FP_TYPE * B0_scratch_, CHEM_FP_TYPE * Ynew_, CHEM_FP_TYPE * Fcn0_, CHEM_FP_TYPE * Fcn_
  ,CHEM_FP_TYPE * dFdT_, CHEM_FP_TYPE * K_ , CHEM_FP_TYPE * Jac0_, CHEM_FP_TYPE * Ghimj_, CHEM_FP_TYPE * Yerr_
  , int * Pivot0 , int * IERR , int chunk_sz 
  ,CHEM_FP_TYPE *debuggal
)
{
  CHEM_FP_TYPE H, Hnew, HC, HG, Fac ;
  CHEM_FP_TYPE Err ;
  int Direction, ioffset, k, j,  istage ;
  int RejectLastH, RejectMoreH, Singular ;
  CHEM_FP_TYPE T = Tstart ;
  int tid ;

  tid = ti + bi * bx ;

  PERMUTE_TID ;

 if ( tid < chunk_sz ) {

//fprintf(stderr,"TID %d\n",tid) ;


#ifndef OLD_LAYOUT
  CHEM_FP_TYPE *jv         = (jvvar      + tid     ) ;  //NJV
  CHEM_FP_TYPE *Y          = (jv         + NJV*chunk_sz ) ;  // NVAR
# ifndef RCONST_IN_TEXTURE
  RCONST_TYPE *RCONST0    = (RCONST0_   + tid     ) ;  // NREACT
# endif
  CHEM_FP_TYPE *AbsTol     = (AbsTol_    + tid     ) ;  // NSPEC
  CHEM_FP_TYPE *RelTol     = (RelTol_    + tid     ) ;  // NSPEC
  CHEM_FP_TYPE *Ynew       = (Ynew_      + tid     ) ;   // NVAR
  CHEM_FP_TYPE *Fcn0       = (Fcn0_      + tid     ) ;   // NVAR
  CHEM_FP_TYPE *Fcn        = (Fcn_       + tid     ) ;   // NVAR
  CHEM_FP_TYPE *dFdT       = (dFdT_      + tid     ) ;   // NVAR
  CHEM_FP_TYPE *K          = (K_         + tid     ) ;   // 3*NVAR
  CHEM_FP_TYPE *Yerr       = (Yerr_      + tid     ) ;   // NVAR
  CHEM_FP_TYPE *fix        = (fix_       + tid     ) ;   // NFIX
  CHEM_FP_TYPE *Jac0       = (Jac0_      + tid     ) ;   // LU_NONZERO  ;
  CHEM_FP_TYPE *Ghimj      = (Ghimj_     + tid     ) ;   // LU_NONZERO  ;
  CHEM_FP_TYPE *B0_scratch = (B0_scratch_+ tid     ) ;   // SCRATCH_SIZE ;
  int *Pivot         = ( Pivot0    + tid     ) ;   // NVAR
#else
  CHEM_FP_TYPE *jv         = (jvvar   + tid *(NJV+NVAR+4) ) ; 
  CHEM_FP_TYPE *Y          = (jv      +NJV                ) ;  
# ifndef RCONST_IN_TEXTURE
  RCONST_TYPE *RCONST0    = (RCONST0_   + tid *NREACT  )  ;
# endif
  CHEM_FP_TYPE *AbsTol     = (AbsTol_    + tid *NSPEC   )  ;
  CHEM_FP_TYPE *RelTol     = (RelTol_    + tid *NSPEC   )  ;
  CHEM_FP_TYPE *Ynew       = (Ynew_      + tid *NVAR    ) ;
  CHEM_FP_TYPE *Fcn0       = (Fcn0_      + tid *NVAR    ) ;
  CHEM_FP_TYPE *Fcn        = (Fcn_       + tid *NVAR    ) ;
  CHEM_FP_TYPE *dFdT       = (dFdT_      + tid *NVAR    ) ;
  CHEM_FP_TYPE *K          = (K_         + tid *NVAR*3  ) ;
  CHEM_FP_TYPE *Yerr       = (Yerr_      + tid *NVAR    ) ;
  CHEM_FP_TYPE *fix        = (fix_       + tid *NFIX    )  ;
  CHEM_FP_TYPE *Jac0       = (Jac0_      + tid *LU_NONZERO ) ;
  CHEM_FP_TYPE *Ghimj      = (Ghimj_     + tid *LU_NONZERO ) ;
  CHEM_FP_TYPE *B0_scratch = (B0_scratch_+ tid *SCRATCH_SIZE     )  ;
  int *Pivot = ( Pivot0 + tid * NVAR ) ;
#endif

  int ierr ;

  ierr = 0 ;
  int iter1 = 0 ;
  int iter2= 0 ;

  // CHEM_FP_TYPE Hexit = ZERO ;

  H = min(Hstart,Hmax) ;
  if ( Tend >= Tstart ) { Direction = 1 ; }
  else                  { Direction = 0 ; }
  RejectLastH = 0 ;
  RejectMoreH = 0 ;

//!~~~> Time loop begins below

  while ( (( Direction > 0 ) && ((T-Tend)+Roundoff <= ZERO)) ||
          (( Direction < 0 ) && ((Tend-T)+Roundoff <= ZERO)) )
  {
//fprintf(stderr,"  solver ti %d ig %d bi %d bx %d tid %d chunk_sz %d\n",ti,ig,bi, bx, tid, chunk_sz) ;
    iter1++ ;

    if ( iter1 > 500 ) {
      *IERR = -6 ; // Too many steps 
      goto outwithyou ;
    }

    if ((T+0.1e0*H == T)||(H<=Roundoff)) {
//      IERR[ti+bi*bx] = -7 ; // Step size too small
      ierr = -7 ;
      goto outwithyou ;
    }
//!~~~>  Limit H if necessary to avoid going beyond Tend

     //Hexit = H ;
    H = min(H,abs(Tend-T)) ;
//!~~~>   Compute the function at current time

   radm2sorg_Fun(Y,fix
#ifndef RCONST_IN_TEXTURE
                 , RCONST0
#else
                 , tid
#endif
                 , Fcn0, B0_scratch, chunk_sz);

//!~~~>  Compute the function derivative with respect to T
// JM: note Autonomous is 1 for this problem, so this is not executed
   if (! Autonomous) {
      CHEM_FP_TYPE Delta = sqrt(Roundoff)*max(DeltaMin,abs(T)) ;
      radm2sorg_Fun(Y,fix
#ifndef RCONST_IN_TEXTURE
                    ,RCONST0
#else
                    ,tid
#endif
                    ,dFdT, B0_scratch, chunk_sz);
      WAXPY(NVAR,(-ONE),Fcn0,dFdT) ;
      WSCAL(NVAR,(ONE/Delta),dFdT) ;
   }

//!~~~>   Compute the Jacobian at current time

   radm2sorg_Jac_SP(Y,fix
#ifndef RCONST_IN_TEXTURE
                   ,RCONST0
#else
                   ,tid
#endif
                   , Jac0
                   , B0_scratch, chunk_sz ) ;

//!~~~>  Repeat step calculation until current step accepted
   while (1) {
     iter2++ ;
     if ( iter2 > 500 ) {
       *IERR = -12 ; // Too many steps 
       goto outwithyou ;
     }
     radm2sorg_ros_PrepareMatrix(&H,Direction,ros_Gamma[0], 
                                 Jac0,Ghimj,
                                 B0_scratch,
                                 Pivot,&Singular, chunk_sz ) ;
     if ( Singular ) { // More than 5 consecutive failed decompositions
//       IERR[ti+bi*bx] = -8 ;
      ierr = -8 ;
       goto outwithyou ;
     }

     for ( istage = 0 ; istage < 3 ; istage++ ) {
       // Current istage offset. Current istage vector is K(ioffset+1:ioffset+NVAR)
       ioffset = NVAR*(istage) ;
       // For the 1st istage the function has been computed previously
       if ( istage == 0 ) {
         WCOPY(NVAR,Fcn0,Fcn) ;
       } else if ( ros_NewF[istage] ) {
         WCOPY(NVAR,Y,Ynew) ;
	 for ( j = 0 ; j <= istage-1 ; j++ ) {
	   WAXPY(NVAR,ros_A[istage*(istage-1)/2+j],(&(K[MY_I(NVAR*j)])),Ynew) ;
	 }
	 //Tau = T + ros_Alpha[istage]*Direction*H ;
         radm2sorg_Fun(Ynew,fix
#ifndef RCONST_IN_TEXTURE
                      ,RCONST0
#else
                      ,tid
#endif
                      ,Fcn, B0_scratch, chunk_sz);
       }
//fprintf(stderr,"ioffset %d MY_I(ioffset) %d\n",ioffset,MY_I(ioffset)) ;
//fprintf(stderr,"addr of K %08d addr of K[MY_I(ioffset)] %08d, diff in words %d\n",K,&(K[MY_I(ioffset)]),(&(K[MY_I(ioffset)])-K)/8) ;
       WCOPY(NVAR,Fcn,(&(K[MY_I(ioffset)]))) ;
       for ( j = 0 ; j <= istage-1 ; j++ ) {
         HC = ros_C[(istage)*(istage-1)/2+j]/(Direction*H) ;
	 WAXPY(NVAR,HC,(&(K[MY_I(NVAR*(j))])),(&(K[MY_I(ioffset)]))) ;
       }
       if ((Autonomous == 0) && (ros_Gamma[istage]!=ZERO)) {
         // does not execute becaause Autonomous is 1
         HG = Direction*H*ros_Gamma[istage] ;
	 WAXPY(NVAR,HG,dFdT,(&(K[MY_I(ioffset)]))) ;
       }
       radm2sorg_KppSolve(Ghimj,(&(K[MY_I(ioffset)])), chunk_sz) ;
     }

//!~~~>  Compute the new solution
     WCOPY(NVAR,Y,Ynew) ;
     for ( j = 0 ; j < ros_S ; j++ ) {
       WAXPY(NVAR,ros_M[j],(&(K[MY_I(NVAR*j)])),Ynew) ;
     }
//!~~~>  Compute the error estimation
     WSCAL(NVAR,ZERO,Yerr) ;
     for ( j = 0 ; j < ros_S ; j++ ) {
       WAXPY(NVAR,ros_E[j],(&(K[MY_I(NVAR*j)])),Yerr) ;
     }
//    Err = radm2sorg_ros_ErrorNorm ( Y, Ynew, Yerr, AbsTol, RelTol, VectorTol )
     { CHEM_FP_TYPE Scale, Ymax ;
       int i ;
       Err = ZERO ;
       for ( i = 0 ; i < NVAR ; i++ ) {
         Ymax = max(abs(Y[MY_I(i)]),abs(Ynew[MY_I(i)])) ;
//	 if ( VectorTol ) {
	   Scale = AbsTol[MY_I(i)]+RelTol[MY_I(i)]*Ymax ;
//	 } else {
//	   Scale = AbsTol[1]+RelTol[1]*Ymax ;
//	 }
	 Err += (Yerr[MY_I(i)]/Scale)*(Yerr[MY_I(i)]/Scale) ;
       } 
       Err = sqrt(Err/NVAR) ;
     }
//!~~~> New step size is bounded by FacMin <= Hnew/H <= FacMax
     Fac  = min(FacMax,max(FacMin,FacSafe/pow((CHEM_FP_TYPE)Err,(CHEM_FP_TYPE)(ONE/ros_ELO)))) ;
     Hnew = H*Fac ;

//!~~~>  Check the error magnitude and adjust step size
     // Nstp = Nstp+1 ;

     if ( (Err <= ONE)||(H <= Hmin) ) {  //!~~~> Accept step
       WCOPY(NVAR,Ynew,Y) ;
       T = T+Direction*H ;
       Hnew = max(Hmin,min(Hnew,Hmax)) ;
       if ( RejectLastH ) {  //No step size increase after a rejected step
         Hnew = min(Hnew,H) ;
       }
       RejectLastH = 0 ;
       RejectMoreH = 0 ;
       H = Hnew ;
       break ; //EXIT THE LOOP: WHILE STEP NOT ACCEPTED
     } else { //!~~~> Reject step
       if ( RejectMoreH != 0 ) {
         Hnew = H*FacRej ;
       }
       RejectMoreH = RejectLastH ; 
       RejectLastH = 1 ;
       H = Hnew ;
     }
   }
 }
outwithyou:
// debuggal[tid*DEBUGGAL_SIZE ] = iter1 ;
// debuggal[tid*DEBUGGAL_SIZE + 1 ] = iter2 ;
 IERR[tid] = ierr ;

 } // guard
}

} // extern C


#else
///////////////////////////////////////////////////////////

#define ids ids_d
#define ide ide_d
#define jds jds_d
#define jde jde_d
#define kds kds_d
#define kde kde_d
#define ims ims_d
#define ime ime_d
#define jms jms_d
#define jme jme_d
#define kms kms_d
#define kme kme_d
#define ips ips_d
#define ipe ipe_d
#define jps jps_d
#define jpe jpe_d
#define kps kps_d
#define kpe kpe_d

extern "C" int rsl_internal_microclock_() ;

#include "radm2sorg_constant_mem.h"

#define SCRATCH_SIZE 560

#undef LOCSM
#define LOCSM(a,s) __shared__ CHEM_FP_TYPE a[s] ;
//#define LOCSM(a,s)  CHEM_FP_TYPE a[s] ;
#if 0
LOCSM(partial,NSPEC) ;
LOCSM(shared_AbsTol,NSPEC) ;
LOCSM(shared_RelTol,NSPEC) ;
LOCSM(shared_Y,NVAR) ;
LOCSM(shared_K,NVAR*3) ;
LOCSM(shared_Ynew,NVAR) ;
LOCSM(shared_Yerr,NVAR) ;
LOCSM(shared_Fcn,NVAR) ;
LOCSM(shared_Fcn0,NVAR) ;

#else
LOCSM(partial,64) ;
LOCSM(shared_AbsTol,64) ;
LOCSM(shared_RelTol,64) ;
LOCSM(shared_Y,64) ;
LOCSM(shared_K,64*3) ;
LOCSM(shared_Ynew,64) ;
LOCSM(shared_Yerr,64) ;
LOCSM(shared_Fcn,64) ;
LOCSM(shared_Fcn0,64) ;

#endif
//LOCSM(shared_dFdT,NSPEC) ;
LOCSM(shared_Jac,LU_NONZERO) ;
LOCSM(shared_Ghimj,LU_NONZERO) ;
LOCSM(shared_Rconst,NREACT) ;
LOCSM(shared_Fix,NFIX) ;
LOCSM(shared_B0_scratch,SCRATCH_SIZE) ;

#define STOICH(J,I) tex2D(Stoich_tex,J,I)
#define STOICH_LEFT(J,I) tex2D(Stoich_Left_tex,J,I)
#define STRUCTB(I,J) structB[(J)+(I)*(NVAR)]

extern "C" {

#include "radm2sorg_Jac_SP.code"

#include "radm2sorg_Fun.code"

#include "radm2sorg_ros_PrepareMatrix.code"

#include "radm2sorg_KppSolve.code"

#undef DEVICEEMU
void
__global__ radm2sorg_ros_Integrator( CHEM_FP_TYPE * jvvar  
  ,RCONST_TYPE * RCONST0_
  , CHEM_FP_TYPE * fix_, CHEM_FP_TYPE * AbsTol_, CHEM_FP_TYPE * RelTol_
  ,CHEM_FP_TYPE * B0_scratch_, CHEM_FP_TYPE * Ynew_, CHEM_FP_TYPE * Fcn0_, CHEM_FP_TYPE * Fcn_
  ,CHEM_FP_TYPE * dFdT_, CHEM_FP_TYPE * K_ , CHEM_FP_TYPE * Jac0_, CHEM_FP_TYPE * Ghimj_, CHEM_FP_TYPE * Yerr_
  , int * Pivot0 , int * IERR , int chunk_sz 
  ,CHEM_FP_TYPE *debuggal
)
{
  CHEM_FP_TYPE H, Hnew, HC, HG, Fac ;
  CHEM_FP_TYPE Err ;
  int Direction, ioffset, j,  istage, icell, tid ;
  int RejectLastH, RejectMoreH ;
  CHEM_FP_TYPE T ; 
  int i, ii ;


// chunk_sz is the number of grid cells given to the device.
// I have 1/30 of that
// I start at the first cell in my 1/30th and iterate through
// until I have no more cells

for ( icell = bi * chunk_sz / gridDim.x ; (icell < chunk_sz) && (icell < (bi+1)*chunk_sz/gridDim.x) ; icell ++ ) {

  T = Tstart ;

  int ierr = 0 ;
  int iter1 = 0 ;
  int iter2= 0 ;

  // CHEM_FP_TYPE Hexit = ZERO ;

  tid = icell ;

  CHEM_FP_TYPE *jv         = (jvvar   + tid *(NJV+NVAR+4) ) ;
  if ( ti < NVAR  ) shared_Y[ti]             = *(jv         + ti + NJV          ) ;
  if ( ti < NSPEC ) shared_AbsTol[ti]        = *(AbsTol_    + ti + tid *NSPEC   )  ;
  if ( ti < NSPEC ) shared_RelTol[ti]        = *(RelTol_    + ti + tid *NSPEC   )  ;
  if ( ti == 0 ) {
    for ( ii = 0 ; ii < NREACT ; ii++ ) {
      shared_Rconst[ii]      = *(RCONST0_   + ii + tid * NREACT   ) ;  // NREACT
    }
    for ( ii = 0 ; ii < NFIX ; ii++ ) {
      shared_Fix[ii]         = *(fix_       + ii + tid * NFIX     ) ;   // NFIX
    }

    for ( ii = 0 ; ii < NVAR ; ii++ ) {
      shared_Ynew[ii]         = 0. ;
      shared_Fcn[ii]         = 0. ;
      shared_Fcn0[ii]         = 0. ;
    }
    for ( ii = 0 ; ii < 3*NVAR ; ii++ ) {
      shared_K[ii]         = 0. ;
    }
    for ( ii = 0 ; ii < LU_NONZERO ; ii++ ) {
      shared_Jac[ii]         = 0. ;
      shared_Ghimj[ii]         = 0. ;
    }

  }

  H = min(Hstart,Hmax) ;
  if ( Tend >= Tstart ) { Direction = 1 ; }
  else                  { Direction = 0 ; }
  RejectLastH = 0 ;
  RejectMoreH = 0 ;

//!~~~> Time loop begins below

  

  while ( (( Direction > 0 ) && ((T-Tend)+Roundoff <= ZERO)) ||
          (( Direction < 0 ) && ((Tend-T)+Roundoff <= ZERO)) )
  {
    iter1++ ;

    if ( iter1 > 500 ) {
      *IERR = -6 ; // Too many steps 
      goto outwithyou ;
    }

    if ((T+0.1e0*H == T)||(H<=Roundoff) ) {
//      IERR[ti+bi*bx] = -7 ; // Step size too small
      *IERR = -7 ;
      goto outwithyou ;
    }
//!~~~>  Limit H if necessary to avoid going beyond Tend

     //Hexit = H ;
    H = min(H,abs(Tend-T)) ;
//!~~~>   Compute the function at current time

   radm2sorg_Fun(    shared_Y
                   , shared_Fix
                   , shared_Rconst
                   , shared_Fcn0
                   , shared_B0_scratch );

//DEBUG if ( ti < NVAR ) fprintf(stderr,"%e AAAA %d Y    stage %d ti %d \n",shared_Y[ti],ti,j,ti) ;
//DEBUG if ( ti < NVAR ) fprintf(stderr,"%e BBBB %d Fcn0 stage %d ti %d \n",shared_Fcn0[ti],ti,j,ti) ;

   radm2sorg_Jac_SP( shared_Y
                   , shared_Fix
                   , shared_Rconst
                   , shared_Jac
                   , shared_B0_scratch ) ;

//!~~~>  Repeat step calculation until current step accepted
   while (1) {
     iter2++ ;
     if ( iter2 > 500 ) {
       *IERR = -12 ; // Too many steps 
       goto outwithyou ;
     }

     radm2sorg_ros_PrepareMatrix( &H
                               , ros_Gamma[0]
                               , shared_Jac
                               , shared_Ghimj
                               , shared_B0_scratch
,debuggal
,0 // (ti == 0 && iter2 == 1)
                                ) ;

     if ( partial[ti] != 0. ) { // More than 5 consecutive failed decompositions
       ierr = -8 ;
       goto outwithyou ;
     }

     for ( istage = 0 ; istage < 3 ; istage++ ) {  // *******************************************
       // Current istage offset. Current istage vector is K(ioffset+1:ioffset+NVAR)
       ioffset = NVAR*(istage) ;
       // For the 1st istage the function has been computed previously
       if ( istage == 0 ) {
         WCOPY(NVAR,shared_Fcn0,shared_Fcn) ;
         __syncthreads() ;

       } else if ( ros_NewF[istage] ) {
         WCOPY(NVAR,shared_Y,shared_Ynew) ;

	 for ( j = 0 ; j <= istage-1 ; j++ ) {
	   WAXPY(NVAR,ros_A[istage*(istage-1)/2+j],shared_K,NVAR*j,shared_Ynew,0) ;
	 }

//DEBUG __syncthreads() ;
//DEBUGif ( ti == 0 ){ 
//DEBUG  for ( ii = 0 ; ii < NREACT ; ii++ ) { fprintf(stderr,"%e RCONST%1d %d\n",shared_Rconst[ii],istage+1,ii) ; }
//DEBUG  for ( ii = 0 ; ii < NFIX   ; ii++ ) { fprintf(stderr,"%e FIX%1d %d\n",shared_Fix[ii],istage+1,ii) ; }
//DEBUG}
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e C%1dC0 %d Ynew stage %d ti %d \n",shared_Ynew[ti],istage+1,ti,j,ti) ;

         radm2sorg_Fun( shared_Ynew
                      , shared_Fix
                      , shared_Rconst
                      , shared_Fcn
                      , shared_B0_scratch
                      );

//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e C%1dCC %d Ynew stage %d ti %d \n",shared_Ynew[ti],istage+1,ti,j,ti) ;
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e D%1dDD %d Fcn stage %d ti %d \n",shared_Fcn[ti],istage+1,ti,j,ti) ;
       }

       WCOPY2(NVAR,shared_Fcn,0,shared_K,ioffset) ;
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e F%1dF1 %d K  j %d ti %d \n",shared_K[ioffset+ti],istage+1,ti,j,ti) ;
       for ( j = 0 ; j <= istage-1 ; j++ ) {
         HC = ros_C[(istage)*(istage-1)/2+j]/(Direction*H) ;
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e F%1d%1d0 %d K  j %d ti %d \n",shared_K[ioffset+ti],istage+1,j+1,ti,j,ti) ;
         WAXPY(NVAR,HC,shared_K,NVAR*j,shared_K,ioffset) ;
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e E%1d%1dE %d K  j %d ti %d \n",shared_K[NVAR*j+ti],istage+1,j+1,ti,j,ti) ;
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e F%1d%1dF %d K  j %d ti %d \n",shared_K[ioffset+ti],istage+1,j+1,ti,j,ti) ;
//DEBUG       __syncthreads() ;
       }

//DEBUG       __syncthreads() ;
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e F%1dF2 %d K  j %d ti %d \n",shared_K[ioffset+ti],istage+1,ti,j,ti) ;

// INLINE OF radm2sorg_KppSolve(shared_Ghimj,(&(shared_K[ioffset]))) ;
      __syncthreads() ;
       if ( ti == 0 ) {
         CHEM_FP_TYPE sum ;
         int i, j, jj ;
         for ( i = 0 ; i < NVAR ; i++ ) {
           for ( j = LU_CROW[i]-1 ; j <= LU_DIAG[i]-2 ; j++ ) {  // Arrays contain Fortran indices
             jj = LU_ICOL[j] - 1 ;
             shared_K[ioffset+i] -= (shared_Ghimj[j] * shared_K[ioffset + jj]) ;
           }
         }
         for ( i = NVAR-1 ; i >= 0 ; i-- ) {
           sum = shared_K[ioffset + i] ;
           for ( j = LU_DIAG[i] ; j <= LU_CROW[i+1]-2 ; j++ ) {
             jj = LU_ICOL[j] - 1 ;
             sum -= (shared_Ghimj[j] * shared_K[ioffset + jj]) ;
           }
           ii = LU_DIAG[i] - 1 ;
           shared_K[ioffset+i] = sum / shared_Ghimj[ ii ] ;
         }
       }
       __syncthreads() ;
//DEBUG       __syncthreads() ;
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e F%1dF3 %d K  j %d ti %d \n",shared_K[ioffset+ti],istage+1,ti,j,ti) ;


     } //  stage loop *******************************************

//!~~~>  Compute the new solution
     WCOPY(NVAR,shared_Y,shared_Ynew) ;
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e GGG1 %d Y  j %d ti %d \n",shared_Y[ti],ti,j,ti) ;
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e GGG2 %d Ynew  j %d ti %d \n",shared_Ynew[ti],ti,j,ti) ;
//DEBUG__syncthreads() ;
     for ( j = 0 ; j < ros_S ; j++ ) {
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e G%1dG3 %d K  j %d ti %d \n",shared_K[j*NVAR+ti],j+1,ti,j,ti) ;
//DEBUG__syncthreads() ;
       WAXPY(NVAR,ros_M[j],shared_K,NVAR*j,shared_Ynew,0) ;
//DEBUGif ( ti < NVAR ) fprintf(stderr,"%e G%1dG4 %d Ynew  j %d ti %d \n",shared_Ynew[ti],j+1,ti,j,ti) ;
//DEBUG__syncthreads() ;
     }
//!~~~>  Compute the error estimation
     WSCAL(NVAR,ZERO,shared_Yerr) ;
     for ( j = 0 ; j < ros_S ; j++ ) {
       WAXPY(NVAR,ros_E[j],shared_K,NVAR*j,shared_Yerr,0) ;
     }

// INLINE of   Err = radm2sorg_ros_ErrorNorm ( Y, Ynew, Yerr, AbsTol, RelTol, VectorTol )

     { CHEM_FP_TYPE Scale, Ymax ;
       int i, ii ;
       Err = ZERO ;
       if ( ti < NVAR ) {
         i = ti ;
         partial[i] = 0 ;
         Ymax = max(abs(shared_Y[i]),abs(shared_Ynew[i])) ;
         Scale = shared_AbsTol[i]+shared_RelTol[i]*Ymax ;
         partial[i] = (shared_Yerr[i]/Scale)*(shared_Yerr[i]/Scale) ;
       }
       __syncthreads() ;
       if ( ti == 0 ) {
         for (ii = 0 ; ii < NVAR ; ii++ ) {
           Err += partial[ii] ;
         }
       }
       __syncthreads() ;
       if ( ti == 0 ) {
         for (ii = 0 ; ii < NVAR ; ii++ ) {
           partial[ii] = Err ;
         }
       }

       __syncthreads() ;

       if ( ti < NVAR ) {
         Err = partial[ti] ;
         Err = sqrt(Err/NVAR) ;
       } else {
         Err = partial[NVAR-1] ;
         Err = sqrt(Err/NVAR) ;
       }
     }
     __syncthreads() ;

//!~~~> New step size is bounded by FacMin <= Hnew/H <= FacMax
     Fac  = min(FacMax,  max(FacMin,  FacSafe/pow( (CHEM_FP_TYPE)Err,  (CHEM_FP_TYPE)(ONE/ros_ELO) )) ) ;
     Hnew = H*Fac ;

//!~~~>  Check the error magnitude and adjust step size
     // Nstp = Nstp+1 ;

     if ( (Err <= ONE)||(H <= Hmin) ) {  //!~~~> Accept step
       WCOPY(NVAR,shared_Ynew,shared_Y) ;
       T = T+Direction*H ;
       Hnew = max(Hmin,min(Hnew,Hmax)) ;
       if ( RejectLastH ) {  //No step size increase after a rejected step
         Hnew = min(Hnew,H) ;
//DEBUGif (ti == 0 )fprintf(stderr,"++ RejectLastH iter1 %d iter2 %e H %e Hnew %e\n",iter1,iter2,H,Hnew) ;
       }
       RejectLastH = 0 ;
       RejectMoreH = 0 ;
       H = Hnew ;
       break ; //EXIT THE LOOP: WHILE STEP NOT ACCEPTED
     } else { //!~~~> Reject step
       if ( RejectMoreH != 0 ) {
         Hnew = H*FacRej ;
       }
       RejectMoreH = RejectLastH ; 
       RejectLastH = 1 ;
       H = Hnew ;
     }
//DEBUG  __syncthreads() ;
//DEBUGif (ti == 0 )fprintf(stderr,"** iter1 %d iter2 %d T %e H %e Fac %e Err %e\n",iter1,iter2,T,H,Fac,Err) ;
   }
       __syncthreads() ;
//DEBUGif (ti == 0 )fprintf(stderr,"iter1 %d iter2 %d T %e H %e Fac %e Err %e\n",iter1,iter2,T,H,Fac,Err) ;
 }
  *IERR = 0 ;

outwithyou:

//DEBUG if (ti < NVAR )fprintf(stderr,"Done: icell %d i1 %d i2 %d T %e Err %e Y[%d] %e\n",icell,iter1,iter2,T,Err,ti,shared_Y[ti]) ;
__syncthreads() ;
if ( ti == 0 ) { ii = debuggal[bi] ; debuggal[bi] = ii + iter2 ; ii = debuggal[bi+120] ; debuggal[bi+120] = ii + 1 ; }
  if ( ti < NVAR ) *(jv         + ti + NJV ) = shared_Y[ti] ;
//if ( ti == 0 ) fprintf(stderr,"cell %d bi %d iter1 %d iter2 %d ierr %d %e\n",icell,bi,iter1, iter2,*IERR, *(jv + ti + NJV) ) ;

}

 } // guard

} // extern C

///////////////////////////////////////////////////////////

#endif
