# Shell used to run every recipe line in this file.
SHELL = sh
##################### intel compiler #############
#ECPP     = icc
#CC      = icc
#FC      = ifort
#CFLAGS  = -w -O3 -ip  -c
#FCFLAGS = -align all -FR -convert big_endian #-fno-alias -fno-fnalias
#OPT     =  -O0 -g
#OPT     =  -O3 -fpe0
#LD      = $(FC)
#LDOPT   = 
#XXX = 32
#CHUNK_SIZE = 1521
#CPP = /lib/cpp
##################### gcc/gfortran ###############
# GNU toolchain (the active configuration).
# ECPP is not referenced by any rule visible in this file.
ECPP     = g++
CC       = gcc
# No trailing blank after the compiler name: make keeps trailing whitespace
# as part of a variable's value.
FC       = gfortran
# gfortran options: big-endian unformatted I/O, 4-byte record markers,
# free-form source.
FCFLAGS  = -fconvert=big-endian -frecord-marker=4 -ffree-form
# Optimization flags for the Fortran compiles.  Only one OPT line may be
# active; uncomment the debug variant (and comment the other) when needed.
#OPT     = -O0 -g
OPT      = -O3
# The Fortran compiler driver is also used for linking.
LD       = $(FC)
LDOPT    =
# Default GPU architecture; assigned again further down in this file next to
# CHEM_FP_TYPE.
ARCH     = sm_10
# Forwarded to the C/CUDA sources as -DXXX=$(XXX) through CCCCC.
XXX      = 32
# C preprocessor run over the .F and .cu sources.
CPP      = /lib/cpp
##################### pgi on mac compiler #############
#ECPP     = pgcc
#CC      = pgcc
#FC      = pgf90
#CFLAGS  = -w -O3  -c
#FCFLAGS = -byteswapio -w 
#OPT     =  -O3
#LD      = $(FC)
#LDOPT   =
#XXX = 32
#CHUNK_SIZE = 1521
#CPP = cpp
##################################################
# Leave these two definitions as they are; adjust the settings in the
# user-editable section further down instead.
# RWORDSIZE is handed to cpp as -DRWORDSIZE=$(RWORDSIZE) by the Fortran rules.
# NOTE(review): FLOAT is not referenced by any rule visible in this file.
RWORDSIZE = 4
FLOAT = float
##################### cuda location ##############
# Directory holding the CUDA libraries.  Pick the line matching the site:
#   eces-shell
#CUDALIBPATH =  ~/emu/cuda/lib
#   ncsa
CUDALIBPATH = /usr/local/cuda/lib
# cuBLAS shared libraries (emulation and device variants) under that directory.
LIBCUBLASEMU = $(CUDALIBPATH)/libcublasemu.so
LIBCUBLAS = $(CUDALIBPATH)/libcublas.so


########################  THIS SECTION YOU CAN CHANGE ##################
#
# Hard coded number of levels  (35 for conus, 28 for jan00).
# Forwarded to the C and CUDA compiles as -DMKX=$(MKX).
MKX = 35

# uncomment this to use FLOAT4 data type (optimization)
#FLOAT4 = -DFLOAT_4=4 

# these must always be defined, but they have no effect unless
# other settings are set
#DEBUG_I = 71
#DEBUG_J = 3
#DEBUG_K = 1
#DEBUGOUTPUT = -DDEBUG_I=$(DEBUG_I) -DDEBUG_J=$(DEBUG_J) -DDEBUG_K=$(DEBUG_K) $(FLOAT4)

# uncomment to run on emulator instead of the device
#DEVICEEMU       = -DDEVICEEMU
#DEVICEEMU_NVCC  = -deviceemu $(DEVICEEMU)
#LIBCUBLAS = $(LIBCUBLASEMU)

# uncomment to output detailed debug data output
# must have DEVICEEMU settings uncommented above
#DEBUGOUTPUT = -DDEBUGOUTPUT -DDEBUG_I=$(DEBUG_I) -DDEBUG_J=$(DEBUG_J) -DDEBUG_K=$(DEBUG_K) $(FLOAT4)

# uncomment to allow settings to force closer agreement
#DEBUGDEBUG = -DDEBUGDEBUG

## GPU floating-point precision: keep exactly one of the two pairs active.
## single precision on GPU:
#ARCH = sm_10
#CHEM_FP_TYPE = float
## double (default) precision on GPU (must be gtx280 or later):
ARCH = sm_13
CHEM_FP_TYPE = double

# Register limit meant for nvcc's --maxrregcount option (see the commented-out
# NVOPT definition further down).  Values tried so far:
#   73 - allow 224 thread blocks but will compile?  (Yes, but does not run)
#   85 - allow 192 thread blocks
# The notes are kept off the assignment line on purpose: blanks in front of a
# trailing '#' would otherwise become part of the value.
#MAXRREGCOUNT = 73
MAXRREGCOUNT = 85

# Preprocessor flags shared by the cpp pass over y.cu and by $(NVCC).
# The backslash ending the second line joins the commented-out option list
# that follows, which make then discards as a comment; move an option in
# front of the '#' to enable it.
# NOTE(review): PROMOTE, YYY and CHUNK_SIZE are not assigned by any active
# line visible in this file, so -DYYY= and -DCHUNK_SIZE= expand to empty
# definitions unless the values come from the command line or environment -
# confirm this is intended.
CCCCC   = -I. -DRUN_ON_GPU_CHEM $(DEVICEEMU) $(PROMOTE) $(DEBUGDEBUG) $(DEBUGOUTPUT) \
          -DMKX=$(MKX) -DXXX=$(XXX) -DYYY=$(YYY) -DCHUNK_SIZE=$(CHUNK_SIZE) -DCHEM_FP_TYPE=$(CHEM_FP_TYPE) \
          #-DREROLL_FCN #-DREROLL_SECOND_PART #-DRCONST_IN_TEXTURE #-DOLD_LAYOUT #-DSTOICH_LEFT_VAR_MAJOR

#CCCCC   = -I. -DNEW_APPROACH -DRUN_ON_GPU_CHEM $(DEVICEEMU) $(PROMOTE) $(DEBUGDEBUG) $(DEBUGOUTPUT) \
#          -DMKX=$(MKX) -DXXX=$(XXX) -DYYY=$(YYY) -DCHUNK_SIZE=$(CHUNK_SIZE) -DSM_SIZE=$(SM_SIZE) -DCHEM_FP_TYPE=$(CHEM_FP_TYPE) \
#          -DREROLL_FCN -DREROLL_SECOND_PART -DOLD_LAYOUT #-DRCONST_IN_TEXTURE #-DSTOICH_LEFT_VAR_MAJOR

########################  END OF SECTION YOU CAN CHANGE ################

# Extra include options for the Fortran compiles; empty by default.
# Previously used value, kept for reference:
#   -I../WRFV3/main -I../WRFV3/frame -I../WRFV3/dyn_em -I../WRFV3/share
INCLUDE_MODULES =
# NOTE(review): VANILLA_LIBS is not referenced by any rule visible in this file.
VANILLA_LIBS =

#NVCC    = nvcc -DCUDA
#NVCC    = nvcc -DCUDA -ccbin /usr/bin
#NVOPT   = --keep -I. $(DEVICEEMU_NVCC) $(PROMOTE) $(DEBUGDEBUG) $(DEBUGOUTPUT) -DMKX=$(MKX) -arch $(ARCH) --host-compilation 'C++' -DXXX=$(XXX) -DYYY=$(YYY) -DCHUNK_SIZE=$(CHUNK_SIZE) --ptxas-options --opt-level=0 --verbose #-DZTIME_ON 
# Locations of the CUDA headers and of the NVIDIA CUDA SDK.  The defaults
# reproduce the paths that used to be hard-coded in NVCC; override them on the
# command line when building elsewhere, e.g.
#   make CUDA_INCDIR=/usr/local/cuda/include/ CUDA_SDKDIR=/opt/NVIDIA_CUDA_SDK
CUDA_INCDIR = /home/michalak/cuda/include/
CUDA_SDKDIR = /home/michalak/NVIDIA_CUDA_SDK

# nvcc command line used by the chem_host.cu.o rule: compile only (-c) with C
# host compilation, keep the intermediate files (--keep; presumably the source
# of the y.ptx file the y.ptxinfo rule expects), and append the shared
# preprocessor flags from CCCCC.
# The SDK library directory used to be listed twice (with and without a
# trailing slash); one -L is sufficient.
# NOTE(review): -arch is fixed at sm_11 here and does not follow $(ARCH) -
# confirm this is intended when CHEM_FP_TYPE is double.
NVCC = nvcc -DCUDA --host-compilation C -c -arch sm_11 \
       --keep --compiler-options -fno-strict-aliasing -I. -I$(CUDA_INCDIR) -I$(CUDA_SDKDIR)/common/inc/ \
       -L$(CUDA_SDKDIR)/lib/ -lcutil -DUNIX  $(CCCCC)

#NVOPT   = $(CCCCC) $(DEVICEEMU_NVCC) --keep -arch $(ARCH) --host-compilation 'C++'  --verbose \
#          --ptxas-options --maxrregcount=$(MAXRREGCOUNT)

# Clear the suffix list so that no old-style suffix rules apply.
.SUFFIXES :

# 'all' names a goal, not a file: declare it phony so that a stray file called
# 'all' cannot make the build look up to date.
.PHONY : all

# Default goal.  Alternatives kept for reference:
#all : vanilla chocolate
#all : vanilla
all : chocolate

# GPU object: expand chem_host.cu with m4 into y.cu, write a preprocessed
# listing of y.cu to chem_host.E, compile y.cu with nvcc, then rename the
# resulting y.o to the target.  $< is chem_host.cu, the first prerequisite.
chem_host.cu.o : chem_host.cu \
                 solver_gpu.cu \
                 radm2sorg_Fun.code \
                 radm2sorg_KppSolve.code \
                 radm2sorg_Jac_SP.code \
                 radm2sorg_ros_PrepareMatrix.code
	m4 $< > y.cu
	$(CPP) -I. -DCUDA -C -P $(CCCCC) y.cu > chem_host.E
	$(NVCC) $(NVOPT) y.cu
	/bin/mv y.o $@

# y.ptx has no recipe of its own; it is presumably left behind by the nvcc
# --keep compile in the chem_host.cu.o recipe.  The order-only prerequisite
# makes sure that recipe has run before y.ptx is looked at (needed for
# 'make -j' and for 'make y.ptxinfo' on a clean tree) without forcing any
# extra rebuilds based on time stamps.
y.ptx : | chem_host.cu.o

# Run the helper script whenever y.ptx is newer than y.ptxinfo.
y.ptxinfo : y.ptx
	./gen_ptxinfo.sh

# GPU ("chocolate") build of the chemistry driver: preprocess chem_driver.F
# with -DRUN_ON_GPU_CHEM, compile the resulting chem_driver.f90 and link it
# with the CUDA host object and microclock.o into chem_driver_chocolate.
# GPGPUSIM and NVIDIA_CUDA_SDK_LOCATION are not set in this file; presumably
# they are expected from the environment or the command line.
# Neither name is a file the recipe creates, so both are declared phony;
# otherwise a stray file called 'choc' or 'chocolate' could suppress the build.
.PHONY : choc chocolate
choc chocolate : chem_driver.F chem_host.cu.o microclock.o y.ptxinfo
	$(CPP) -C -P $(DEBUGDEBUG) $(DEVICEEMU) $(DEBUGOUTPUT) -DRWORDSIZE=$(RWORDSIZE) -DRUN_ON_GPU_CHEM \
	       $< > chem_driver.f90
	$(FC) -c $(OPT) $(FCFLAGS) $(INCLUDE_MODULES) chem_driver.f90
	$(LD) -o chem_driver_chocolate $(LDOPT) chem_driver.o chem_host.cu.o microclock.o \
	      -L$(GPGPUSIM)/libcuda -lcuda \
	      -L$(NVIDIA_CUDA_SDK_LOCATION)/lib/ -lcutil \
	      -L$(GPGPUSIM)/src -lgpgpusim \
	      -L$(GPGPUSIM)/src/intersim -lintersim \
	      -L$(GPGPUSIM)/src/cuda-sim -lgpgpu_ptx_sim \
	      -lm -lz -lGL \
	      -lstdc++

# Compile microclock.c; MKX is forwarded as a preprocessor define.
# $< expands to the single prerequisite, microclock.c.
microclock.o : microclock.c
	$(CC) -c $(CFLAGS) -DMKX=$(MKX) $<

# Fortran module objects linked into the "vanilla" executable.  Every one of
# them is built the same way: run the .F source through cpp to produce a .f90
# file, then compile that file with $(FC).
MODULE_OBJS = module_data_radm2.o \
              module_data_sorgam.o \
              module_kpp_radm2sorg_Update_Rconst.o \
              module_kpp_radm2sorg_Precision.o \
              module_kpp_radm2sorg_Parameters.o \
              module_wkppc_constants.o \
              module_kpp_radm2sorg_JacobianSP.o \
              module_kpp_radm2sorg_Jacobian.o \
              module_kpp_radm2sorg_Integr.o \
              module_kpp_radm2sorg_interface.o

# Extra cpp options for the module sources; empty unless overridden per target.
MODULE_CPPFLAGS =

# Only this source is preprocessed with the current directory on the include
# path.
module_kpp_radm2sorg_Update_Rconst.o : MODULE_CPPFLAGS = -I.

# Static pattern rule shared by all module objects: $< is the matching .F
# source and $* is the file name without its suffix.
$(MODULE_OBJS) : %.o : %.F
	$(CPP) -C -P $(MODULE_CPPFLAGS) -DRWORDSIZE=$(RWORDSIZE) $< > $*.f90
	$(FC) -c $(OPT) $(FCFLAGS) $(INCLUDE_MODULES) $*.f90

# Build-order dependencies between the module objects (no recipes here; the
# recipe comes from the static pattern rule above).
module_data_sorgam.o : module_data_radm2.o

module_kpp_radm2sorg_Parameters.o : module_kpp_radm2sorg_Precision.o

module_kpp_radm2sorg_Jacobian.o : module_kpp_radm2sorg_JacobianSP.o \
                                  module_kpp_radm2sorg_Parameters.o

module_kpp_radm2sorg_Integr.o : module_kpp_radm2sorg_Jacobian.o

module_kpp_radm2sorg_interface.o : module_kpp_radm2sorg_Precision.o \
                                   module_kpp_radm2sorg_Parameters.o \
                                   module_data_radm2.o module_data_sorgam.o \
                                   module_kpp_radm2sorg_Update_Rconst.o \
                                   module_wkppc_constants.o \
                                   module_kpp_radm2sorg_Integr.o

# CPU ("vanilla") build of the chemistry driver: preprocess chem_driver.F
# (without -DRUN_ON_GPU_CHEM), compile the resulting chem_driver.f90 and link
# it with the Fortran module objects and microclock.o into chem_driver_vanilla.
# Objects that appear only on the link line are built as prerequisites of the
# interface and integrator objects.
# Neither name is a file the recipe creates, so both are declared phony;
# otherwise a stray file called 'van' or 'vanilla' could suppress the build.
.PHONY : van vanilla
van vanilla : chem_driver.F module_kpp_radm2sorg_interface.o module_kpp_radm2sorg_Integr.o microclock.o
	$(CPP) -C -P $(DEBUGDEBUG) $(DEVICEEMU) $(DEBUGOUTPUT) -DRWORDSIZE=$(RWORDSIZE) \
	       $< > chem_driver.f90
	$(FC) -c $(OPT) $(FCFLAGS) $(INCLUDE_MODULES) chem_driver.f90
	$(LD) -o chem_driver_vanilla $(LDOPT) chem_driver.o \
	      module_kpp_radm2sorg_interface.o \
	      module_kpp_radm2sorg_Integr.o \
	      module_kpp_radm2sorg_Precision.o \
	      module_kpp_radm2sorg_Parameters.o \
	      module_data_radm2.o module_data_sorgam.o \
	      module_kpp_radm2sorg_Update_Rconst.o \
	      module_kpp_radm2sorg_Jacobian.o \
	      module_kpp_radm2sorg_JacobianSP.o \
	      module_wkppc_constants.o \
	      microclock.o


# Remove objects, generated sources, nvcc intermediates, both executables and
# the ptx files under run/.  'clena' is accepted as an alias for 'clean'.
# Both names are declared phony so that a file called 'clean' or 'clena'
# cannot stop the recipe from running.
# The leading backslash on rm is kept from the original; presumably it is
# there to bypass a shell alias for rm.
.PHONY : clean clena
clean clena :
	\rm -f *.o *.cu.c x.* y.* *.mod *.f90 x.ptx *.cpp chem_driver_chocolate chem_driver_vanilla chem_host.E
	\rm -f run/*.ptx run/*.ptxinfo

