#! /bin/sh
# @ group = climber3
# @ job_type = serial
## @ total_tasks = 32
## @ blocking = 8
## @ total_tasks = 31
## @ blocking = unlimited
## @ requirements = (Feature == "blue")
#
## @ class = short
## @ wall_clock_limit= 2:00:00
## @ wall_clock_limit= 23:00:00
#
# @ class = medium
# @ wall_clock_limit= 168:00:00
## @ wall_clock_limit= 160:00:00
## @ wall_clock_limit= 50:00:00
#
## @ class = long
## @ wall_clock_limit= 200:00:00
## @ wall_clock_limit= 50:00:00
## @ class = largemem
## @ wall_clock_limit= 23:00:00
## @ wall_clock_limit= 168:00:00
## @ wall_clock_limit= 150:00:00
## @ wall_clock_limit= 300:00:00
#
# @ as_limit = unlimited
# @ data_limit = unlimited
# @ stack_limit = unlimited
# @ file_limit = unlimited
# @ nofile_limit = unlimited
# @ core_limit = unlimited
## @ image_size = 0
# @ output = fms.1.$(Host).$(Cluster).$(Process).out
# @ error = fms.1.$(Host).$(Cluster).$(Process).err
# @ notification = always
# @ queue

# Batch driver for one run of ./fms_MOM_LAD_AEOLUS.x in the current directory.
#
# Steps:
#   1. Take the work-directory lock: a symlink .workdir_locked_by_batchjob
#      whose target is $LOADL_STEP_ID.
#   2. Build the CPU/host lists from llgetmachinelist and run the model,
#      under MPD/mpiexec when more than one CPU slot is listed, else directly.
#   3. Parallel branch only: merge the per-process NetCDF pieces with
#      mppnccombine.
#   4. On model success, archive *.nc, logs and RESTART into history/.
#   5. Release the lock.
#
# Environment: LOADL_STEP_ID (presumably provided by LoadLeveler -- confirm)
#   is used as the lock owner id and as a suffix for all per-job file names.
# Exit status: 1 if the directory is locked by a different job, otherwise
#   the exit status of the model run.

set -x

lockfile=.workdir_locked_by_batchjob
mom_bin=/iplex/01/climber3/petri/mom5.0.2/bin
cpulist="cpulist.$LOADL_STEP_ID"
mpdhosts="mpdhosts.$LOADL_STEP_ID"
model_log="fms.out-$LOADL_STEP_ID"
restart_log="fms.$LOADL_STEP_ID.restart-log"

# The id is left unquoted on purpose: quoting it would pass an explicit empty
# target to ln when the variable is unset and so change what happens then.
# shellcheck disable=SC2086
if ! ln -s $LOADL_STEP_ID "$lockfile"; then
  # sigh. It might be a restart attempt initiated by LoadLeveler after a
  # vacate order. In that case the job id is still the same one that created
  # the existing lock file, and if so we just start anew silently.
  # readlink reads the symlink target directly instead of depending on the
  # column layout of `ls -l`.
  oldjobid=$(readlink "$lockfile")
  if [ "$oldjobid" != "$LOADL_STEP_ID" ]; then
    echo "Error: Directory $(pwd) is locked by another batch job:"
    ls -l "$lockfile"
    # The lock belongs to someone else, so it is left in place.
    exit 1
  else
    echo own lockfile already there - looks like a Loadl restart attempt >> "$restart_log"
    # Unquoted so the date/hostname output is whitespace-normalised exactly
    # as before.
    # shellcheck disable=SC2046
    echo $(date) $(hostname) >> "$restart_log"
  fi
fi

# Time-stamp both the stdout and the stderr job logs.
date
date >&2

# Disabled explicit limit settings (limits come from the directives above):
#ulimit -c unlimited
#ulimit -s unlimited
#ulimit -d unlimited
#ulimit -m unlimited
#ulimit -v unlimited
#ulimit -f unlimited
ulimit -a

mkdir -p RESTART history

# One line per CPU slot, with "blade" rewritten to "bi" in the host names;
# mpdhosts holds each host once.
llgetmachinelist | sed "s/blade/bi/" > "$cpulist"
sort -u "$cpulist" > "$mpdhosts"
machine_count=$(wc -l < "$mpdhosts")
cpu_count=$(wc -l < "$cpulist")

# Remove NetCDF output (combined files and per-process pieces) left in the
# work directory.
rm -f *.nc *.nc.[0-9][0-9][0-9][0-9]*

if [ "$cpu_count" -gt 1 ]; then
  export MPD_CON_EXT="$LOADL_STEP_ID"
  echo "$machine_count machines $cpu_count cpus" >&2
  sort "$cpulist" | uniq -c >&2

  # Disabled diagnostics / profiling setup, kept for reference:
  #for h in `cat mpdhosts.$LOADL_STEP_ID`
  #do
  #  echo $h
  #  ssh $h ncdump `pwd`/INPUT/grid_spec.nc
  #done
  # unset NC_BLKSZ
  # unset NC_BLOCKSIZE
  # export SCALASCA_DIR=/home/petri
  # #export EPK_VERBOSE=1
  # #export EPK_TRACE=1
  # #export ELG_BUFFER_SIZE=XXX
  # export ESD_BUFFER_SIZE=1000000

  mpdboot -n "$machine_count" -r ssh -f "$mpdhosts"

  # Disabled alternatives:
  # -genv F_UFMTENDIAN big -genv NC_BLKSZ 8192 -genv NC_BLOCKSIZE 8192
  # time scalasca -analyze -t -f EPIK.FILTER mpiexec -machinefile cpulist.$LOADL_STEP_ID -n 31 ./fms_MOM_LAD_AEOLUS.x > fms.out-$LOADL_STEP_ID 2>&1

  # NOTE(review): the process count is fixed at 31 and does not follow
  # $cpu_count -- presumably it has to match the model's configured layout;
  # confirm before changing.
  time mpiexec -machinefile "$cpulist" -n 31 ./fms_MOM_LAD_AEOLUS.x > "$model_log" 2>&1
  success=$?
  mpdallexit

  # Derive the distinct combined-file names (foo.nc) from the per-process
  # pieces (foo.nc.NNNN...). The dots are escaped so they match literally.
  bases=$(ls -1 *.nc.[0-9][0-9][0-9][0-9]* \
    | sed -e 's/\.nc\.[0-9][0-9][0-9][0-9]\+/.nc/' \
    | sort -u)
  # $bases is split on whitespace deliberately: one word per file name.
  for b in $bases; do
    echo mppnccombine "$b"
    "$mom_bin/mppnccombine.pik-iplex-ifort11" -r "$b" "$b".[0-9][0-9][0-9][0-9]*
  done
else
  time ./fms_MOM_LAD_AEOLUS.x > "$model_log" 2>&1
  success=$?
fi

rm -f "$cpulist"
rm -f "$mpdhosts"

echo "FMS exited with $success"
if [ 0 = "$success" ]; then
  echo FMS run endded successfully

  # `-z` replaces the non-POSIX `==`, which fails inside `[ ]` under a strict
  # /bin/sh and would leave an empty stamp unreplaced.
  # begindate is not used further in this script; the call is kept as is.
  begindate=$("$mom_bin/time_stamp.csh" -bf digital)
  if [ -z "$begindate" ]; then
    begindate=tmp$(date +%Y%j%H%M%S)
  fi
  enddate=$("$mom_bin/time_stamp.csh" -ef digital)
  if [ -z "$enddate" ]; then
    enddate=tmp$(date +%Y%j%H%M%S)
  fi
  rm -f time_stamp.out

  # Move the combined NetCDF output into history/, prefixed with the stamp.
  for i in *.nc; do
    # Skip the literal pattern when no .nc file exists.
    [ -e "$i" ] || continue
    mv "$i" "history/$enddate.$i"
  done

  tar cvjf "history/$enddate.out.tar.bz2" "$model_log" *.out

  # Store the run configuration alongside the restart files, then archive the
  # directory under a stamped name and rename it back afterwards.
  cp -p input.nml data_table diag_table field_table RESTART/.
  mv RESTART "$enddate.RESTART"
  tar cvjf "history/$enddate.RESTART.tar.bz2" "$enddate.RESTART/."
  mv "$enddate.RESTART" RESTART
else
  echo "FMS run failed with code $success"
fi

rm -f "$lockfile"
exit "$success"