- Timestamp:
- 08/12/09 18:56:02 (15 years ago)
- Location:
- trunk/MagicSoft/Mars
- Files:
-
- 2 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/MagicSoft/Mars/Changelog
r9487 r9488
  18   18
  19   19   -*-*- END OF LINE -*-*-
       20 +
       21 + 2009/08/12 Daniela Dorner
       22 +
       23 + * datacenter/scripts/jobmanager:
       24 + - improved logging (new layout and more information)
       25 + - moved sleep to end of loop (function nextscript())
       26 +
       27 +
  20   28
  21   29   2009/08/12 Daniel Hoehne-Moench
-
trunk/MagicSoft/Mars/datacenter/scripts/jobmanager
r9474 r9488
  34   34   set -C
  35   35
       36 + # function to continue in loop and go to next script
       37 + function nextscript()
       38 + {
       39 + echo `date +%F\ %T`" sleeping \$$1 = $sleeptime seconds... " >> $jmscriptlog 2>&1
       40 + sleep $2
       41 + echo "" >> $jmscriptlog 2>&1
       42 + continue
       43 + }
       44 +
  36   45   echo "" >> $jmscriptlog 2>&1
  37      - echo "starting jobmanager ("`date`")" >> $jmscriptlog 2>&1
       46 + echo "" >> $jmscriptlog 2>&1
       47 + echo -n `date +%F\ %T`" starting jobmanager for setup " >> $jmscriptlog 2>&1
  38   48
  39   49   # decide which jobmanager you want to run
   …    …
  41   51   # the number of jobs are defined in the file setup
  42   52   case $1 in
  43      - data) echo "running jobmanager for data" >> $jmscriptlog 2>&1
       53 + data) echo -n "'data'" >> $jmscriptlog 2>&1
  44   54   scripts=( "runganymed" "runstar" "runcallisto" ) # not used: "dodatacheck" "cutslices"
  45   55   scriptscolname=( "fGanymed" "fStar" "fCallisto" ) # not used: "fDataCheckDone" "fCompmux"
   …    …
  48   58   break
  49   59   ;;
  50      - mc) echo "running jobmanager for mc" >> $jmscriptlog 2>&1
       60 + mc) echo -n "'mc'" >> $jmscriptlog 2>&1
  51   61   scripts=( "runcorsika" "runreflector" "runcamera" )
  52   62   scriptscolname=( "fCorsikaFileAvail" "fReflectorFileAvail" "fCameraFileAvail" )
   …    …
  55   65   break
  56   66   ;;
  57      - ctamc) echo "running jobmanager for cta mc" >> $jmscriptlog 2>&1
       67 + ctamc) echo -n "'cta mc'" >> $jmscriptlog 2>&1
  58   68   scripts=( "runsimtel" )
  59   69   scriptscolname=( "fCorsikaSimTelarray" )
   …    …
  62   72   break
  63   73   ;;
  64      - *) echo "'$1' is a wrong commandline option for jobmanager -> exit" >> $jmscriptlog 2>&1
       74 + *) echo ""
       75 + echo "'$1' is a wrong commandline option for jobmanager -> exit" >> $jmscriptlog 2>&1
  65   76   echo "'$1' is a wrong commandline option for jobmanager -> exit"
  66   77   printprocesslog "WARN '$1' is a wrong commandline option for jobmanager"
   …    …
  72   83   # choose commands according to queueing system (defined in setup)
  73   84   case $queuesys in
  74      - sge) echo " setting commands for sun grid engine" >> $jmscriptlog 2>&1
       85 + sge) echo " on queuing system 'sun grid engine'" >> $jmscriptlog 2>&1
  75   86   alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
  76   87   # alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
   …    …
  78   89   break
  79   90   ;;
  80      - condor) echo " setting commands for condor" >> $jmscriptlog 2>&1
       91 + condor) echo " on queuing system 'condor'" >> $jmscriptlog 2>&1
  81   92   alias 'queuesubmit'='/usr/local/bin/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` `echo $scriptspath`/run.condor'
  82   93   alias 'checkqueue'='/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus '
  83   94   break
  84   95   ;;
  85      - *) finish >> $jmscriptlog 2>&1
       96 + *) echo "" >> $jmscriptlog 2>&1
       97 + finish >> $jmscriptlog 2>&1
  86   98   ;;
  87   99   esac
      100 +
      101 + echo "" >> $jmscriptlog 2>&1
  88  102
  89  103   prev=$max
   …    …
  98  112   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
  99  113   do
 100      - date >> $jmscriptlog 2>&1
 101  114   source `dirname $0`/sourcefile
 102      - echo "script: ${scripts[$i]}" >> $jmscriptlog 2>&1
      115 + echo `date +%F\ %T`" Evaluating processing status for script '${scripts[$i]}'" >> $jmscriptlog 2>&1
 103  116
 104  117   # check if there's something to do
 105  118   column=${scriptscolname[$i]}
 106  119   getstatus >> $jmscriptlog 2>&1
 107      - echo " $numproc ${scripts[$i]} still to do" >> $jmscriptlog 2>&1
      120 + echo `date +%F\ %T`" Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/table/column $db/$table/$column]" >> $jmscriptlog 2>&1
 108  121   if [ "$numproc" = "" ]
 109  122   then
   …    …
 112  125   if [ $nothingtodocount -lt ${#scripts[@]} ]
 113  126   then
 114      - cont >> $jmscriptlog 2>&1
      127 + nextscript 0 0
 115  128   else
 116  129   if [ $nothingtodosleeptime -lt $sleeptimelimit ]
   …    …
 118  131   nothingtodosleeptime=`echo " $nothingtodocount * $sleeptime " | bc`
 119  132   fi
 120      - echo "sleeping $nothingtodosleeptime" >> $jmscriptlog 2>&1
 121      - sleep $nothingtodosleeptime
 122      - cont >> $jmscriptlog 2>&1
      133 + nextscript nothingtodosleeptime $nothingtodosleeptime
 123  134   fi
 124  135   else
   …    …
 126  137   nothingtodosleeptime=0
 127  138   fi
 128      -
 129      - echo "sleeping $sleeptime..." >> $jmscriptlog 2>&1
 130      - sleep $sleeptime
 131  139
 132  140   # get processes in queue
   …    …
 136  144   if echo $q | egrep \(Error\|failed\)
 137  145   then
 138      - echo `date `" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1
      146 + echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1
 139  147   printprocesslog "WARN checking query ($queuesys) failed"
 140      - echo `date `" WARN checking query ($queuesys) failed" >> $jmerrorlog
 141      - cont >> $jmscriptlog 2>&1
      148 + echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmerrorlog
      149 + nextscript sleeptime $sleeptime
 142  150   fi
 143  151   # get processes of user in queue
   …    …
 154  162   #get total number of allowed process for current time
 155  163   hour=`date +%k`
 156      - #totalpno=${pnototal[$hour]}
 157  164   if [ ${pnototal[$hour]} -lt $totalmax ]
 158  165   then
   …    …
 174  181   if [ $prev -eq 0 ]
 175  182   then
 176      - echo " prev=0 => resetting pnoscript [$pnoscript] to max [$max]" >> $jmscriptlog 2>&1
      183 + echo `date +%F\ %T`" \$prev=0 => resetting \$pnoscript from $pnoscript to $max [\$max]" >> $jmscriptlog 2>&1
 177  184   pnoscript=$max
 178  185   fi
 179      - echo " found $queued jobs in the queue (incl. running jobs) [allowed$totalpno]" >> $jmscriptlog 2>&1
 180      - echo " found $queuedscript ${scripts[$i]} in the queue (incl. running jobs [$runningscript]) [allowed $pnoscript] - not running: $stillinqueue" >> $jmscriptlog 2>&1
      186 + echo `date +%F\ %T`" queue for user '$user': total: $queued queued jobs [allowed \$totalpno = $totalpno]" >> $jmscriptlog 2>&1
      187 + echo `date +%F\ %T`" queue for user '$user': ${scripts[$i]}: $queuedscript queued, $runningscript running, $stillinqueue idle [allowed \$pnoscript = $pnoscript]" >> $jmscriptlog 2>&1
 181  188
 182  189   # continue if there are already enough processes or scripts in the queue
 183  190   if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
 184  191   then
 185      - cont >> $jmscriptlog 2>&1
      192 + nextscript sleeptime $sleeptime
 186  193   fi
 187  194   # continue if the number of script is the queue is larger (or equal) than the number which still has to be done
 188  195   if [ $numproc -le $stillinqueue ]
 189  196   then
 190      - echo " numproc($numproc) -le stillinqueue($stillinqueue)" >> $jmscriptlog 2>&1
 191      - cont >> $jmscriptlog 2>&1
      197 + echo `date +%F\ %T`" \$numproc ($numproc) < \$stillinqueue ($stillinqueue) " >> $jmscriptlog 2>&1
      198 + nextscript sleeptime $sleeptime
 192  199   fi
 193  200
   …    …
 197  204   # submit 1 script to queuing system
 198  205   date=`date +%Y-%m-%d`
 199      - echo " committing 1 ${scripts[$i]} to $queuesys" >> $jmscriptlog 2>&1
      206 + echo `date +%F\ %T`" committing 1 ${scripts[$i]} to $queuesys" >> $jmscriptlog 2>&1
 200  207   # if ! /usr/local/bin/condor_submit -a path=$scriptspath -a prog=${scripts[$i]} -a date=$date -a dir=$runlogpath $scriptspath/run.condor 2>> $jmerrorlog
 201  208   # if ! /opt/gridengine/bin/lx26-amd64/qsub -e $runlogpath/error-$date.log -o $runlogpath/log-$(date).log 2>> $jmerrorlog
   …    …
 203  210   then
 204  211   echo `date`" WARN submitting job ($queuesys) failed" >> $jmerrorlog
 205      - echo "$queuesys is not working -> sleeping $errorsleeptime" >> $jmscriptlog 2>&1
      212 + echo `date +%F\ %T`" WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> $jmscriptlog 2>&1
 206  213   printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
 207  214   if [ $errorsleeptime -lt $sleeptimelimit ]
   …    …
 209  216   errorsleeptime=`echo " $errorsleeptime + $errorsleeptimedefault " | bc`
 210  217   fi
 211      - sleep $errorsleeptime
      218 + nextscript errorsleeptime $errorsleeptime
 212  219   else
 213  220   errorsleeptime=$errorsleeptimedefault
 214  221   fi
 215      - date >> $jmscriptlog 2>&1
 216      - echo "" >> $jmscriptlog 2>&1
      222 + nextscript sleeptime $sleeptime
 217  223   done
 218  224   done
Note:
See TracChangeset
for help on using the changeset viewer.