#			    -*-makefile-*-
# template for the Intel fortran compiler version 15ff with Intel MPI on the PIK HLRS2015 cluster
# typical use with mkmf
# mkmf -t template.ifc -c"-Duse_libMPI -Duse_netCDF" path_names /usr/local/include

# -fpp : run the Fortran preprocessor before compilation
# -fno-alias : If you do not want aliasing to be assumed in the program
# -stack_temps : allocate space for temporary arrays on the runtime stack
# -safe-cray-ptr : Cray* pointers do not alias other variables.
# -ftz : Flushes denormal results to zero.
#        This  option  flushes  denormal  results  to zero when the
#        application is in  the  gradual  underflow  mode.  It  may
#        improve  performance if the denormal values are not critical
#        to your application's behavior.
#
#        If this option produces undesirable results of the numerical
#        behavior  of  your  program, you can turn the FTZ/DAZ
#        mode off by using -no-ftz or /Qftz- in  the  command  line
#        while still benefiting from the O3 optimizations.
#
#        NOTE: Options -ftz and /Qftz are performance options. Setting
#        these options does not guarantee that  all  denormals
#        in  a  program are flushed to zero. They only cause denormals
#        generated at run time to be flushed to zero.
# -shared-intel: Causes Intel-provided libraries to be linked in dynamically.
# -assume byterecl : the  units for the OPEN statement RECL  specifier
#                    (record length) value are in bytes for  unformatted  data,
#                    not longwords
# -Wp,-w : pass option -w to the preprocessor
#          Prevents warnings from being output.
# -sox : save the compilation options and version number in the executable
# Base Fortran flags that every build uses, split into groups for readability.
# The order of the options is significant only in that it is kept unchanged.
# preprocessing, aliasing assumptions, stack temporaries, Cray pointers
FFLAGS_BASE  = -fpp -fno-alias -stack_temps -safe_cray_ptr
# denormal flushing, dynamically linked Intel libraries, RECL counted in bytes
FFLAGS_BASE += -ftz -shared-intel -assume byterecl
# debug symbols; default INTEGER kind 4 bytes, default REAL kind 8 bytes
FFLAGS_BASE += -g -i4 -r8
# silence preprocessor warnings; record options and compiler version in the executable
FFLAGS_BASE += -Wp,-w -sox

# -fltconsistency : Enables improved floating-point consistency.
#        This  option  enables  improved floating-point consistency
#        and may slightly reduce execution speed. It limits
#        floating-point  optimizations and maintains declared precision.
#        It also disables inlining of math library functions.
#
#        - Even if vectorization is enabled by the -x options, the
#          compiler does not  vectorize  reduction  loops  (loops
#          computing  the  dot  product)  and loops with mixed
#          precision types. Similarly, the compiler  does  not
#          enable  certain  loop transformations. For example,
#          the compiler does not transform reduction loops  to
#          perform partial summation or loop interchange.
#
#        This  option  causes  performance  degradation relative to
#        using default floating-point optimization flags.
#        The  recommended method to control the semantics of
#        floating-point calculations is to use option  -fp-model
#FFLAGS_REPRO = -fltconsistency
#
# -fp-model precise  Disables optimizations that are not  value-safe [..]
#     These  semantics  ensure  the reproducibility of floating-point computations
#     for serial code, including code vectorized or auto-parallelized by the compiler
#        Run-to-run reproducibility for floating-point reductions in OpenMP* code may be obtained for
#        a fixed number of threads through the KMP_DETERMINISTIC_REDUCTION environment variable.  For
#        more information about this environment variable, see Supported Environment Variables
#FFLAGS_REPRO = -fp-model precise
#
## Su-Bong Lee <sky-shine@pusan.ac.kr> wrote on 13.3.2012:
## I also did experiment using "-fp_model precise" option with "-O2",
## Then the outputs from two identical restart run were the same.
## Marshall Ward <marshall.ward@anu.edu.au> answered:
## We are seeing similar issues on our machine in Australia using the intel 
## compilers (with -O2).
## 
## On single CPU submissions of the bowl1 experiment, we saw two different 
## solutions, on the order of floating point error. The answer we get is 
## quasi-random, with no clear explanation.
## 
## As Swathi recommends, using '-fp-model precise' gives a third solution 
## different from the other two, which we can reproduce consistently. Using 
## -fltconsistency did not address the issue for us either. This link 
## suggests Intel may soon deprecate -fltconsistency: http://goo.gl/IPBkH

## -fp-model consistent generate code that will give consistent,
##    reproducible floating-point results for different optimization
##    levels or between different processors of the same architecture.
##    See the article titled: Consistency of Floating-Point Results using the Intel(R) Compiler
## http://software.intel.com/en-us/articles/consistency-of-floating-point-results-using-the-intel-compiler/
#FFLAGS_REPRO = -fp-model consistent
##
## [..] Dynamic variations in heap alignment can lead to variations in
##   floating-point results in a similar manner. Such variations in alignment
##   typically arise from memory allocations that depend on the external
##   environment.  They can be prevented from causing variations in floating-point
##   results by building with -fp-model precise, or by explicit alignment of data
##   arrays. Starting from the version 15 compiler, such run-to-run variations can
##   also be prevented by compiling with -qno-opt-dynamic-align, which is expected
##   to have much less impact on performance than -fp-model precise.
## SIGH. With ifort 15 this option seems to be buggy. Use -fp-model precise instead.
##       With ifort 17.0.1 it works better
# Active reproducibility setting. The same value is also reused for the
# C and C++ flags (CFLAGS_REPRO, CXXFLAGS_REPRO) and for LDFLAGS in this template.
FFLAGS_REPRO = -qno-opt-dynamic-align
# Alternative that additionally requests -fp-model precise:
#FFLAGS_REPRO = -fp-model precise -qno-opt-dynamic-align

# Run-to-run reproducibility for floating-point reductions in OpenMP
# code may be obtained for a fixed number of threads through the
# KMP_DETERMINISTIC_REDUCTION environment variable.
# KMP_DETERMINISTIC_REDUCTION
#  Enables  (true)  or  disables  (false)  the  use  of a specific ordering of the
#  reduction operations for implementing  the  reduction  clause  for  an  OpenMP*
#  parallel  region. This has the effect that, for a given number of threads, in a
#  given parallel region, for a given data set and reduction operation, a floating
#  point  reduction done for an OpenMP* reduction clause has a consistent floating
#  point result from run to run, since round-off errors are identical.


# -check : Checks for certain conditions at run time.
# -check all : is the same as specifying check with no keyword
# -check pointers :
#              Note: with recent compiler versions, this produces
#              errors about use of unallocated U_SURF on atmos PEs in
#              flux_exchange.F90:flux_ocean_to_ice(). However,
#              the invoked mpp_redistribute() knows how to handle
#              such unallocated arrays.
# -check bounds : compile-time and run-time checking for array subscripts
# -check arg_temp_created: Determines whether checking occurs for actual
#         arguments before routine calls.
# -WB : Turns a compile-time bounds check into a warning.
# -inline_debug_info : use -debug inline-debug-info
# -fpe0 : Floating-point invalid, divide-by-zero, and overflow exceptions
#        are enabled. If any such exceptions occur, execution is aborted.
#        This option sets the -ftz ; therefore underflow results will be set
#        to zero unless you explicitly specify -no-ftz
# -fpe3 : (default) All floating-point exceptions are disabled.
#        Floating-point underflow is gradual
# -ftrapuv : Initializes  stack  local variables to an unusual value to aid error detection.
# -fp-stack-check : generate  extra  code  after  every function call
#        By default, there is no checking. So  when  the  FP  stack
#        overflows, a NaN value is put into FP calculations and the
#        program's  results  differ.  Unfortunately,  the  overflow
#        point  can  be  far away from the point of the actual bug.
#        This option places code that causes  an  access  violation
#        exception immediately after an incorrect call occurs, thus
#        making it easier to locate these issues.

# Debugging-related Fortran flags. Only -traceback is active; the remaining
# lines are optional switches that can be enabled during development.
# -traceback should not cause runtime overhead (?), thus we want it for production also
# (no trailing blank after the value: make would keep it as part of the variable)
FFLAGS_DEBUG = -traceback
#FFLAGS_DEBUG += -warn -warn noerrors
#FFLAGS_DEBUG += -warn nointerfaces
# options that possibly create runtime overhead, thus should be used during development
# but not for production.
#FFLAGS_DEBUG += -check bounds
#FFLAGS_DEBUG += -check all
#FFLAGS_DEBUG += -check all\,noarg_temp_created
#FFLAGS_DEBUG += -check all\,nopointers
#FFLAGS_DEBUG += -WB
#FFLAGS_DEBUG += -fpe0 -ftrapuv
#FFLAGS_DEBUG += -init=snan -init=arrays
#FFLAGS_DEBUG += -debug variable_locations -debug-parameters -debug inline-debug-info

# Optimization flags for Fortran sources. The inactive alternatives are kept
# for reference; the last, uncommented assignment is the one in effect.
#FFLAGS_OPT = -O2 -no-vec # default from GFDL
#FFLAGS_OPT = -O3 -xHost -ipo
#FFLAGS_OPT = -O3 -xHost -qopt-report -qopt-report-annotate
#FFLAGS_OPT = -O3 -xHost -parallel -guide
FFLAGS_OPT = -O3 -xHost

# Complete Fortran flag set: base, reproducibility, optimization, debug (in this order).
FFLAGS = $(FFLAGS_BASE) \
         $(FFLAGS_REPRO) \
         $(FFLAGS_OPT) \
         $(FFLAGS_DEBUG)

# -Wremarks is needed for ifort >= 12 to turn on remarks
# -diag-disable : turn off some annoying remarks
# remark #1: last line of file ends without a newline
# remark #981: operands are evaluated in unspecified order
# remark #1418: external function definition with no prior declaration
# remark #1419: external declaration in primary source file
# remark #1572: floating-point equality and inequality comparisons are unreliable
# remark #1782 : #pragma once is obsolete
# warning #3180: unrecognized OpenMP #pragma
# warning #5194: Source line truncated
# warning #6717: This name has not been given an explicit type

# C++ compiler flags, built from four groups that are combined in CXXFLAGS below.
# Explanatory remarks are kept on their own lines: make keeps the blanks in
# front of a trailing '#' comment as part of the variable value.
CXXFLAGS_BASE = -Wall -Wremarks -g -debug inline-debug-info -debug extended
# diagnostics 5194 and 6717 are currently not part of the disabled list
CXXFLAGS_BASE += -diag-disable 1,981,1782,1572,1418,1419,3180
CXXFLAGS_BASE += -ftz -fno-exceptions
CXXFLAGS_BASE += -no-inline-factor
#CXXFLAGS_BASE += -DONLY_TRANSPORT_ON_REDUCED_GRID
# make it consistent with F90 compilation
CXXFLAGS_REPRO = $(FFLAGS_REPRO)
#CXXFLAGS_OPT = -O2 -no-vec # default from GFDL
CXXFLAGS_OPT = -O3 -xHOST
# -DNDEBUG removes assertions. Greatly improves speed, but must be used only after extensive testing
CXXFLAGS_OPT += -DNDEBUG
# -traceback should not cause runtime overhead (?), thus we want it for production also
CXXFLAGS_DEBUG = -traceback
# options that create runtime overhead, not desirable for production runs
#CXXFLAGS_DEBUG += -check-uninit -ftrapuv -fstack-security-check
# Complete C++ flag set: base, reproducibility, optimization, debug (in this order).
CXXFLAGS = $(CXXFLAGS_BASE) $(CXXFLAGS_REPRO) $(CXXFLAGS_OPT) $(CXXFLAGS_DEBUG)

# The PIK machine-learning interface to libtorch must be compiled with
# gcc-7.3.0, hence this separate GNU C++ compiler with its own flag set.
GXX = /p/system/packages/compiler/gnu/7.3.0/bin/g++
GXXFLAGS  = -g
GXXFLAGS += -Wall -O

# Preprocessor flags; NETCDFINCLUDE is not defined in this part of the template
# and has to be provided elsewhere.
CPPFLAGS = $(NETCDFINCLUDE)

# Commands for the Fortran compiler, the linker, and the C and C++ compilers.
FC  = mpiifort
LD  = mpiifort
CC  = mpiicc
CXX = mpiicpc
# Linker flags.
# when -g is given to the linking step, Intel will use the debugging version of the MPI library.
# Thus, for production performance, link without -g. LDFLAGS therefore does not
# contain -g; uncomment the "+= -g" line below to link the debugging MPI library.
# -sox Tells the compiler to save  the  compilation  options  and
#      version number in the Linux* OS executable
LDFLAGS = -sox $(FFLAGS_REPRO) $(NETCDFLIBPATH) -traceback
#LDFLAGS += -g
# to enable debugging, we must prevent IPO
#LDFLAGS += -no-ipo
#LDFLAGS += -lchkp -lchkpwrap
#LDFLAGS += -lmcheck
#LDFLAGS = -L/home/petri/netcdf-4.2.1.1-intel15/lib -lnetcdf_c++ -lnetcdff -lnetcdf -L/home/petri/hdf5-1.8.9-intel15/lib -lhdf5_hl -lhdf5 -lz -L$(UDUNITSROOT)/lib -ludunits2 -lstdc++

# C compiler flags, built from the same four groups as the Fortran and C++ flags.
# base: compiler identification macro, debug symbols, warnings and remarks
CFLAGS_BASE  = -D__IFC
CFLAGS_BASE += -g -Wall -Wremarks
# reproducibility: same setting as for the Fortran sources
CFLAGS_REPRO = $(FFLAGS_REPRO)
# optimization, with assertions compiled out via -DNDEBUG
#CFLAGS_OPT = -O2 -no-vec
CFLAGS_OPT = -O3 -xHOST -DNDEBUG
# debug: only -traceback is active
CFLAGS_DEBUG = -traceback
# options that create runtime overhead, not desirable for production runs
#CFLAGS_DEBUG += -check-uninit -ftrapuv
#CFLAGS_DEBUG += -check=stack\,uninit -check-pointers=rw -check-pointers-dangling=all -fstack-security-check
# Complete C flag set: base, reproducibility, debug, optimization (in this order).
CFLAGS = $(CFLAGS_BASE) $(CFLAGS_REPRO) $(CFLAGS_DEBUG) $(CFLAGS_OPT)

# Flags for building LPJ: the shared reproducibility setting and -traceback,
# followed by one optimization variant (alternatives are commented out).
LPJ_OPTFLAGS  = $(CFLAGS_REPRO)
LPJ_OPTFLAGS += -traceback
LPJ_OPTFLAGS += -O3 -no-ipo -xHOST -no-prec-div
#LPJ_OPTFLAGS += -O2 -no-ipo
#LPJ_OPTFLAGS += -g -no-ipo

# Archiver command used for building libraries.
# the Intel compiler requires that libraries are built using xiar if -ipo is used as compiler / linker option.
# LPJ wants that option, thus we need xiar here.
# NOTE(review): LPJ_OPTFLAGS in this template passes -no-ipo, so the statement
# above may be outdated -- confirm whether xiar is still required.
AR = xiar