# Build file for the swrad drivers.
#
# Targets:
#   all            - builds `vanilla` and `compare_snaps` (default goal)
#   vanilla        - links swrad_driver_vanilla from the Fortran/C objects
#   chocolate      - recompiles module_ra_sw.F with -DRUN_ON_GPU and links
#                    swrad_driver_chocolate together with the CUDA objects
#   compare_snaps  - builds the compare_snaps executable
#   clean (clena)  - removes generated objects, intermediates and drivers
#
# NOTE(review): `vanilla` and `chocolate` both produce module_ra_sw.o (and
# module_ra_sw.f90) from the same source with different preprocessor flags.
# Run `make clean` when switching between the two, otherwise the object left
# behind by the previous build is considered up to date and gets reused.

SHELL = csh

##################### intel compiler #############
CC = icc
FC = ifort
CFLAGS = -w -O3 -ip -c
FCFLAGS = -align all -FR -convert big_endian -fno-alias -fno-fnalias
# debug alternative for OPT (was previously assigned and then immediately
# overridden by the -O3 assignment below, so it never took effect):
#OPT = -O0 -g -C -traceback
OPT = -O3
LD = $(FC)
LDOPT = -g
XXX = 32
YYY = 8

##################### gcc/gfortran ###############
#CC = gcc
#FC = gfortran
#FCFLAGS = -fconvert=big-endian -frecord-marker=4 -ffree-form
#OPT = -O3
#LD = $(FC)
#LDOPT =

##################################################

# do not change this definition, change the one further down
FLOAT = float
RWORDSIZE = 4

##################### cuda location ##############
# eces-shell
#CUDALIBPATH = ~/emu/cuda/lib
# ncsa
CUDALIBPATH = /usr/local/cuda/lib

LIBCUBLAS = $(CUDALIBPATH)/libcublas.so
LIBCUBLASEMU = $(CUDALIBPATH)/libcublasemu.so

######################## THIS SECTION YOU CAN CHANGE ##################
#
# Hard coded number of levels (35 for conus, 28 for jan00)
MKX = 28

# uncomment this to use FLOAT4 data type (optimization)
#FLOAT4 = -DFLOAT_4=4

# these must always be defined but they do not do anything unless
# other settings are set
DEBUG_I = 59
DEBUG_J = 45
DEBUG_K = 1
DEBUGOUTPUT = -DDEBUG_I=$(DEBUG_I) -DDEBUG_J=$(DEBUG_J) -DDEBUG_K=$(DEBUG_K) $(FLOAT4)

# uncomment to run on emulator instead of the device
#DEVICEEMU = -DDEVICEEMU
#DEVICEEMU_NVCC = -deviceemu $(DEVICEEMU)
#LIBCUBLAS = $(LIBCUBLASEMU)

# uncomment to output detailed debug data output
# must have DEVICEEMU settings uncommented above
#DEBUGOUTPUT = -DDEBUGOUTPUT -DDEBUG_I=$(DEBUG_I) -DDEBUG_J=$(DEBUG_J) -DDEBUG_K=$(DEBUG_K) $(FLOAT4)

# uncomment to allow settings to force closer agreement
#DEBUGDEBUG = -DDEBUGDEBUG

# uncomment to promote to 8 byte floats
# note, if you do this without DEVICE EMU above, compiler will complain it does not have enough shared mem
#PROMOTE = -DPROMOTE
#FLOAT = double
#FCFLAGS = -fconvert=big-endian -frecord-marker=4 -fdefault-real-8
#RWORDSIZE = 8

######################## END OF SECTION YOU CAN CHANGE ################

#NVCC = nvcc -DCUDA
# optional extra nvcc flag: --ptxas-options=-v
# (kept on its own line so no trailing whitespace ends up in NVCC)
NVCC = nvcc -DCUDA -ccbin /usr/bin
# alternative nvcc phase (was previously assigned and then immediately
# overridden by the -cuda assignment below, so it never took effect):
#PHASE = -ptx
PHASE = -cuda
NVOPT = $(DEVICEEMU_NVCC) $(PROMOTE) $(DEBUGDEBUG) $(DEBUGOUTPUT) -DXXX=$(XXX) -DYYY=$(YYY) -DMKX=$(MKX) --use_fast_math

# disable the built-in suffix rules; every rule below is explicit
.SUFFIXES :

# command-style targets that do not name a file they create
.PHONY : all vanilla chocolate clean clena

#all : vanilla chocolate compare_snaps
all : vanilla compare_snaps

# Expand swrad.cu with m4, substitute the configured FLOAT type, run nvcc in
# the selected PHASE, then compile the generated C++ with the host compiler.
# y.cu / y.cu.cpp are scratch files for this rule only.
swrad.cu.o : swrad.cu
	m4 $< | sed "s/float/$(FLOAT)/g" > y.cu
	$(NVCC) $(PHASE) $(NVOPT) y.cu
	/bin/mv y.cu.cpp swrad.cu.cpp
	$(CC) $(CFLAGS) -c swrad.cu.cpp

# Same pipeline as swrad.cu.o with an extra spt.pl filter stage.
# x.cu / x.cu.cpp are scratch files for this rule only.
swrad_gpu.cu.o : swrad_gpu.cu spt.h
	m4 $< | spt.pl | sed "s/float/$(FLOAT)/g" > x.cu
	$(NVCC) $(PHASE) $(NVOPT) x.cu
	/bin/mv x.cu.cpp swrad_gpu.cu.cpp
	$(CC) $(CFLAGS) -c swrad_gpu.cu.cpp

microclock.o : microclock.c
	$(CC) -c $(CFLAGS) -DMKX=$(MKX) $<

# Preprocess the .F source with cpp into a .f90 file, then compile it.
module_ra_sw.o : module_ra_sw.F
	/lib/cpp -C -P $(DEBUGDEBUG) $(DEVICEEMU) $(DEBUGOUTPUT) -DRWORDSIZE=$(RWORDSIZE) $< > module_ra_sw.f90
	$(FC) -c $(OPT) $(FCFLAGS) module_ra_sw.f90

# module_ra_sw.o is a prerequisite so it is compiled before the driver.
driver_swrad.o : driver_swrad.F module_ra_sw.o
	/lib/cpp -C -P $(DEBUGDEBUG) $(DEVICEEMU) $(DEBUGOUTPUT) -DRWORDSIZE=$(RWORDSIZE) $< > driver_swrad.f90
	$(FC) -c $(OPT) $(FCFLAGS) driver_swrad.f90

# NOTE(review): this build links against $(LIBCUBLASEMU) rather than
# $(LIBCUBLAS) -- confirm that this is intentional.
vanilla : module_ra_sw.o microclock.o driver_swrad.o
	$(LD) -o swrad_driver_vanilla $(LDOPT) driver_swrad.o module_ra_sw.o microclock.o $(LIBCUBLASEMU)

# Regenerates module_ra_sw.f90 / module_ra_sw.o with -DRUN_ON_GPU before
# linking, replacing whatever the module_ra_sw.o rule produced.
chocolate : module_ra_sw.F swrad.cu.o swrad_gpu.cu.o driver_swrad.o microclock.o
	/lib/cpp -C -P $(DEBUGDEBUG) $(DEVICEEMU) $(DEBUGOUTPUT) -DRUN_ON_GPU -DRWORDSIZE=$(RWORDSIZE) $< > module_ra_sw.f90
	$(FC) -c $(OPT) $(FCFLAGS) module_ra_sw.f90
	$(LD) -o swrad_driver_chocolate $(LDOPT) driver_swrad.o module_ra_sw.o swrad.cu.o swrad_gpu.cu.o microclock.o $(LIBCUBLAS)

# The source is copied to a .f90 name for compilation and the copy is removed
# afterwards.
compare_snaps : compare_snaps.F
	\cp $< compare_snaps.f90
	$(FC) -o compare_snaps $(FCFLAGS) compare_snaps.f90
	\rm compare_snaps.f90

# `clena` is kept as an alias for the common typo.
# *.cu.cpp covers the intermediates written by the two .cu rules above
# (swrad.cu.cpp, swrad_gpu.cu.cpp), which the old pattern *.cu.c did not match.
clean clena :
	\rm -f *.o *.cu.c *.cu.cpp x.cu y.cu swrad_driver_* *.mod *.f90 x.ptx