#!/bin/bash
#
# Slurm batch job that runs ./test_MPI_Spawn through mpiexec.hydra with a
# series of task counts (multi-task/multi-node allocation), or ./fms_CM2M.x
# directly when the allocation has at most one task on one node.
#
# The working directory is protected by a symlink lock
# (.workdir_locked_by_batchjob -> job id) so that two jobs never run in the
# same directory at once.  On success the outputs are archived below history/.
#
# NOTE: this file uses bash features (arrays, [[ ]]), hence the bash shebang.
#
##SBATCH --account=next
#SBATCH --job-name=test_MPI_Spawn
#
# without any nodes or tasks specification, will allocate 1 task, just what we need for now
##SBATCH --tasks 32
#SBATCH --nodes 4
##SBATCH --tasks-per-node=8
##SBATCH --cpus-per-task=1
#SBATCH --exclusive
#
# Acceptable time formats include "minutes", "minutes:seconds", "hours:minutes:seconds",
# "days-hours", "days-hours:minutes" and "days-hours:minutes:seconds"
#SBATCH --qos=short
#SBATCH --time=300
##SBATCH --time=2:00:00
##SBATCH --time=23:00:00
#
##SBATCH --qos=medium
##SBATCH --time=160:00:00
##SBATCH --time=50:00:00
#
##SBATCH --qos=long
##SBATCH --time=200:00:00
##SBATCH --time=50:00:00

#SBATCH --output=sbatch.64.%j.out
##SBATCH --error=sbatch.1.%j.err
#SBATCH --mail-type=ALL,TIME_LIMIT_90,TIME_LIMIT

export LANG=C

# Dump the scheduler/MPI related environment without tracing noise.
set +x
env | grep -E 'SBATCH|SLURM|MPI|OMP|KMP' | sort
echo "$F_UFMTENDIAN"
set -x

# Provide a default PMI library only when the environment did not set one.
if [ -z "$I_MPI_PMI_LIBRARY" ]; then
  export I_MPI_PMI_LIBRARY=/p/system/slurm/lib/libpmi.so
fi

#On 09/04/15 14:20, Ciaron Linstead wrote:
#>
#> To use the (Intel-recommended) DAPL-UD, the following variables need to be set:
#>
#export I_MPI_FABRICS=shm:dapl
#export I_MPI_DAPL_UD=enable
#export I_MPI_DAPL_UD_PROVIDER=ofa-v2-mlx5_0-1u
#>
#> It seems that these have an effect on larger jobs (e.g. LPJ 256-task), and
#> allow I_MPI_DYNAMIC_CONNECTION to be enabled.

#export MV2_SMP_USE_CMA=0

# ---------------------------------------------------------------------------
# Working directory lock: a symlink whose target is the owning job id.
# Creating a symlink is atomic, so it doubles as test-and-set.
# ---------------------------------------------------------------------------
if ! ln -s "$SLURM_JOBID" .workdir_locked_by_batchjob; then
  # sigh. it might be a restart attempt initiated by the batch system after a
  # vacate order (originally written for LoadLeveler).  In that case the job ID
  # is still the same that created the existing lock file.
  # And if so, we just start anew silently.
  oldjobid=$(readlink .workdir_locked_by_batchjob)
  if [ "$oldjobid" != "$SLURM_JOBID" ]; then
    echo "Error: Directory $(pwd) is locked by another batch job:"
    ls -l .workdir_locked_by_batchjob
    # Exit BEFORE the EXIT trap is installed: the lock is not ours to remove.
    exit 1
  else
    echo own lockfile already there - looks like a slurm restart attempt >> "fms.${SLURM_JOBID}.restart-log"
    echo "$(date) $(hostname)" >> "fms.${SLURM_JOBID}.restart-log"
  fi
fi
# From here on the lock belongs to this job; release it on any exit path.
trap "rm -f .workdir_locked_by_batchjob" EXIT

date
echo
echo Hard limits:
ulimit -aH
echo

#ulimit -c unlimited
ulimit -s unlimited
ulimit -d unlimited
#ulimit -m unlimited
#ulimit -v unlimited
#ulimit -f unlimited
ulimit -t unlimited

echo
echo Soft limits:
ulimit -a
echo

#echo $LD_LIBRARY_PATH

# Intel MPI version 19 ff need some more environment massage
# Fields 8 and 10 of the "Intel(R) MPI Library for Linux" banner line are used
# as version and update/build id; mpiexec is queried only once.
read -r mpiexecver mpiexecupd <<EOF
$(mpiexec --version | awk '/Intel.R. MPI Library for Linux/ {print $8, $10;}')
EOF
#mpiexecbld=`mpiexec --version | awk '/Intel.R. MPI Library for Linux/ {print $12;}'`

case "${mpiexecver}-${mpiexecupd}" in
  2015-* | 2016-* | 2017-* | 2018-*)
    echo 'This MPI version does not support MPI_Comm_Spawn within slurm ??'
    ;;
  2019-* | 2021-* | 2022-*)
    unset I_MPI_DAPL_UD
    unset I_MPI_DAPL_UD_PROVIDER
    export I_MPI_FABRICS=shm:ofi
    export FI_MLX_ENABLE_SPAWN=yes
    #export I_MPI_HYDRA_DEBUG=yes
    ;;
  2021.6-*)
    # Not shadowed by the arm above: "2021-*" requires a dash right after 2021.
    unset I_MPI_DAPL_UD
    unset I_MPI_DAPL_UD_PROVIDER
    export I_MPI_FABRICS=shm:ofi
    export I_MPI_SPAWN=on
    ;;
  *)
    echo dont know how to handle this version of mpiexec
    mpiexec --version
    # do nothing special
    ;;
esac

TOOLSDIR=/p/projects/climber3/petri/POEM/bin

mkdir -p RESTART history
# Remove leftovers of a previous run (combined and per-task NetCDF output).
rm -f -- *.nc *.nc.[0-9][0-9][0-9][0-9]*

#nodelist=`scontrol show hostnames $SLURM_NODELIST`

#######################################
# Build a comma separated host list for "srun -m arbitrary".
# Globals:   SLURM_NODELIST (read)
# Arguments: list of numbers of tasks to be assigned to each node, in the
#            order in which scontrol lists the allocated hosts
# Outputs:   the layout on stdout: each host name repeated once per task;
#            an error message on stderr if more counts than nodes were given
#            (surplus counts are then ignored)
# Returns:   0
#######################################
arbitrary() {
  # Remember whether xtrace is active so it can be restored afterwards; the
  # loops below would otherwise flood the trace output.
  local restore_xtrace=""
  [[ $- == *x* ]] && restore_xtrace=yes
  set +x

  local -a tasks nodes
  local node_cnt task_cnt idx rep layout=""

  tasks=("$@")
  mapfile -t nodes < <(scontrol show hostnames "$SLURM_NODELIST")
  node_cnt=${#nodes[@]}
  task_cnt=${#tasks[@]}

  if [ "$node_cnt" -lt "$task_cnt" ]; then
    echo "ERROR: You only have $node_cnt nodes, but requested layout on $task_cnt nodes." >&2
    task_cnt=$node_cnt
  fi

  #echo tasks ${tasks[*]}
  #echo nodes ${nodes[*]}
  for ((idx = 0; idx < task_cnt; idx++)); do
    for ((rep = 0; rep < tasks[idx]; rep++)); do
      # Prepend a comma only when the layout is already non-empty.
      layout="${layout:+${layout},}${nodes[idx]}"
    done
  done
  echo "$layout"

  # restore the flag setting
  if [ -n "$restore_xtrace" ]; then
    set -x
  fi
  return 0
}

# Either variable may be unset or empty; treat that as 0.
if [ "${SLURM_NTASKS:-0}" -gt 1 ] || [ "${SLURM_NNODES:-0}" -gt 1 ]; then
  export MPD_CON_EXT="Slurm_Job_${SLURM_JOBID}"
  # unset NC_BLKSZ
  # unset NC_BLOCKSIZE
  # export SCALASCA_DIR=/home/petri
  # #export EPK_VERBOSE=1
  # #export EPK_TRACE=1
  # #export ELG_BUFFER_SIZE=XXX
  # export ESD_BUFFER_SIZE=1000000
  # mpdboot -n $machine_count -r ssh -f mpdhosts.$LOADL_STEP_ID
  ## -genv F_UFMTENDIAN big -genv NC_BLKSZ 8192 -genv NC_BLOCKSIZE 8192
  ## time scalasca -analyze -t -f EPIK.FILTER mpiexec -machinefile cpulist.$LOADL_STEP_ID -n 31 ./fms_MOM_LAD_AEOLUS.x > fms.out-$LOADL_STEP_ID 2>&1
  # time mpiexec -machinefile cpulist.$LOADL_STEP_ID -n 31 ./fms_MOM_LAD_AEOLUS.x > fms.out-$LOADL_STEP_ID 2>&1
  #export I_MPI_DEBUG=5
  #export F_UFMTENDIAN=big

  # 20 tasks atm + 8 tasks ocn = 28 tasks total
  # 28 tasks * 2 CPUs per task = 56 CPUs sparse allocated
  # 56 CPUs / 16 CPUs per node = 4 nodes
  # 28 tasks total / 4 nodes = 7 tasks per node
  # 4 * 7 tasks
  # arb="`./arbitrary.pl 7\,7\,7\,7`"
  # put all ocean tasks on one node
  #arb="`./arbitrary.pl 7\,7\,6\,8`"
  #arb="`arbitrary 7 7 6 8`"
  #time srun --propagate=ALL -m arbitrary -n 28 -w "$arb" -o fms.out-$SLURM_JOBID-$SLURM_NNODES-64-'%02t' ./fms_CM2M.x

  # NOTE: $success is overwritten by every iteration, so the job's final
  # status reflects only the LAST task count; earlier failures are visible
  # only through the "Running with ..." log lines.
  for n in 16 24 28 30 32 36 40 48 64; do
    time mpiexec.hydra -genvall -n "$n" \
      -usize INFINITE \
      -outfile-pattern "fms.out-${SLURM_JOBID}-${SLURM_NNODES}-${n}-%r" \
      -errfile-pattern "fms.out-${SLURM_JOBID}-${SLURM_NNODES}-${n}-%r" \
      ./test_MPI_Spawn
    success=$?
    echo "Running with $n CPUs exited with $success"
  done
  # mpdallexit
  # bases=`ls -1 *.nc.[0-9][0-9][0-9][0-9]* | sed -e "s/.nc.[0-9][0-9][0-9][0-9]\+/.nc/" | sort -u`
  # time for b in $bases ; do echo mppnccombine $b ; $TOOLSDIR/mppnccombine.pik-hlrs2015-ifort -r $b $b.[0-9][0-9][0-9][0-9]* ; done
else
  #time srun ./fms_CM2M.x > fms.out-$SLURM_JOBID 2>&1
  unset SLURM_PMI_KVS_DUP_KEYS
  unset SLURM_JOB_ID SLURM_STEPID SLURM_NPROCS SLURM_PROCID SLURM_GTIDS
  time ./fms_CM2M.x > "fms.out-${SLURM_JOBID}" 2>&1
  success=$?
fi

#rm -f cpulist.$LOADL_STEP_ID
#rm -f mpdhosts.$LOADL_STEP_ID

echo "FMS exited with $success"

if [ 0 = "$success" ]; then
  echo FMS run endded successfully
  # Fall back to a wall-clock based tag when time_stamp.csh yields nothing.
  begindate=$("$TOOLSDIR/time_stamp.csh" -bf digital)
  if [ -z "$begindate" ]; then begindate="tmp$(date +%Y%j%H%M%S)"; fi
  enddate=$("$TOOLSDIR/time_stamp.csh" -ef digital)
  if [ -z "$enddate" ]; then enddate="tmp$(date +%Y%j%H%M%S)"; fi
  rm -f time_stamp.out

  for i in *.nc; do
    # Without this guard an unmatched glob would try to move a literal "*.nc".
    [ -e "$i" ] || continue
    mv -- "$i" "history/${enddate}.${i}"
  done

  tar cvjf "history/${enddate}.out.tar.bz2" \
    "fms.out-${SLURM_JOBID}"* sbatch*"${SLURM_JOBID}"*.out \
    logfile*out diag_field_log*out diag_integral.out stocks.out

  # Archive the restart directory under a dated name, then rename it back so
  # that it stays available as RESTART.
  cp -p input.nml data_table diag_table field_table RESTART/.
  mv RESTART "${enddate}.RESTART"
  tar cvjf "history/${enddate}.RESTART.tar.bz2" "${enddate}.RESTART/."
  mv "${enddate}.RESTART" RESTART
else
  echo "FMS run failed with code $success"
fi

rm -f .workdir_locked_by_batchjob
exit "$success"