Changeset 10038 for trunk/Mars
- Timestamp:
- 10/27/10 17:46:20 (14 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/Mars/datacenter/scripts/jobmanager
r10014 r10038 35 35 36 36 # function to continue in loop and go to next script 37 function nextscript()37 function sleepawhile() 38 38 { 39 echo `date +%F\ %T`" sleeping \$$1 = $2 seconds... " >> $jmscriptlog 2>&1 40 sleep $2 39 usedsleeptime=$sleeptime 40 case $1 in 41 "error") if ! [ "$errorsleeptime" = "" ] 42 then 43 if [ $errorsleeptime -lt $sleeptimelimit ] 44 then 45 errorsleeptime=`echo " $errorsleeptime + $errorsleeptimedefault " | bc` 46 fi 47 usedsleeptime=$errorsleeptime 48 fi 49 ;; 50 "ok") errorsleeptime=$errorsleeptimedefault 51 ;; 52 esac 53 echo `date +%F\ %T`" sleeping "$usedsleeptime" seconds... (status: "$1")" >> $jmscriptlog 2>&1 41 54 echo "" >> $jmscriptlog 2>&1 55 sleep $usedsleeptime 42 56 continue 43 57 } … … 54 68 # (-hard) -l hostname=compute-* 55 69 # for qstat this returns the jobs running on that node + all jobs in the queue 56 alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y - v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '70 alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -t 1-`echo $tosubmit` -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` ' 57 71 # alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` ' 58 72 # alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge ' … … 62 76 ;; 63 77 pbs) echo " on queuing system 'pbs'" >> $jmscriptlog 2>&1 64 alias 'queuesubmit'='$pbspath/qsub - l walltime=$walltime -l pmem=$pmem -v AUTOMATIONSETUP=$AUTOMATIONSETUP,SOURCEFILEPATH=$SOURCEFILEPATH-e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '78 alias 'queuesubmit'='$pbspath/qsub -t 1-`echo $tosubmit` -l walltime=`echo $walltime` -l pmem=`echo $pmem` -v AUTOMATIONSETUP=$AUTOMATIONSETUP,SOURCEFILEPATH=$SOURCEFILEPATH,SCRIPTNAME=`echo ${scripts[$i]}` -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` ' 65 79 # check queue (restricted to current user only) 66 80 alias 'checkqueue'="$pbspath/qstat -a -u $user | awk ' { print \"Owner\"\$2\" \" \$4\"Jobstatus\"\$10 } '" … … 68 82 ;; 69 83 condor) echo " on queuing system 'condor'" >> $jmscriptlog 2>&1 70 alias 'queuesubmit'='$condorpath/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` -a automationsetup=$AUTOMATIONSETUP `echo $scriptspath`/run.condor'84 alias 'queuesubmit'='$condorpath/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` -a num=`echo $tosubmit` -a automationsetup=$AUTOMATIONSETUP `echo $scriptspath`/run.condor' 71 85 alias 'checkqueue'='$condorpath/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus ' 72 break 86 break 73 87 ;; 74 88 *) echo "" >> $jmscriptlog 2>&1 … … 79 93 echo "" >> $jmscriptlog 2>&1 80 94 81 prev=$max 95 # for processing with local storage on different nodes 82 96 currentnode=$minnode 83 97 numevaluated=0 98 84 99 # endless loop 85 100 notcount=0 86 nothingtodocount=087 nothingtodosleeptime=088 101 errorsleeptime=$errorsleeptimedefault 89 102 while (( $notcount < 100 )) 90 103 do 104 # get and set some information for the processing 105 source `dirname $0`/sourcefile 106 # reset some values 107 tosubmit=0 108 idleratio=0 109 addtoscript= 110 111 # get processes in queue 112 q=(`checkqueue 2>&1 `) 113 if echo $q | egrep \(Error\|failed\) 114 then 115 echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1 116 printprocesslog "WARN checking query ($queuesys) failed" 117 echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmerrorlog 118 sleepawhile "error" 119 fi 120 121 # general check whether one should submit something depending on chosen algorithm 122 # algorithm 1: 123 # submit new jobs in case there are less than $limitidle idle jobs in the queue 124 # algorithm 2: 125 # submit new jobs in case the total number of jobs in the queue has fallen below $totalpno 126 case $algorithm in 127 1) # algorithm 1 128 # get number of idle jobs in the queue 129 q5=( `echo ${q[@]} | egrep -o \(Jobstatus1\|Jobstatusq\|JobstatusQ\)` ) 130 idle=${#q5[@]} 131 if [ $idle -gt $limitidle ] 132 then 133 echo `date +%F\ %T`" more than "$limitidle" jobs waiting ("$idle")" >> $jmscriptlog 2>&1 134 sleepawhile "ok" 135 fi 136 ;; 137 2) # algorithm 2 138 # get processes of user in queue 139 q1=( `echo ${q[@]} | egrep -o "Owner$user"`) 140 queued=${#q1[@]} 141 hour=`date +%k` 142 # choose array of total number of jobs to be done 143 # according to the day of the week 144 dayofweek=`date +%u` 145 case $dayofweek in 146 0 | 6) totalpno=${pnototal[$hour]} ;; 147 *) totalpno=${pnototalwe[$hour]} ;; 148 esac 149 # get total number of jobs to be submitted 150 if [ $queued -gt $totalpno ] 151 then 152 echo `date +%F\ %T`" more than "$totalpno" jobs waiting ("$queued")" >> $jmscriptlog 2>&1 153 sleepawhile "ok" 154 else 155 tosubmittotal=`echo "$totalpno - $queued" | bc -l` 156 fi 157 ;; 158 *) echo "Please give an algorithm to calculate the number of allowed jobs." 159 exit 160 ;; 161 esac 162 echo `date +%F\ %T`" Total number of jobs to be submitted: "$tosubmittotal >> $jmscriptlog 2>&1 163 164 165 # first loop to determine 166 # a) how many jobs of this script have to be done 167 # b) how many jobs of this script are running or queued 91 168 for (( i=0 ; i < ${#scripts[@]} ; i++ )) 92 169 do 93 source `dirname $0`/sourcefile 94 echo `date +%F\ %T`" Evaluating processing status for script '${scripts[$i]}'" >> $jmscriptlog 2>&1 95 96 # check if there's something to do 170 # set the step to be evaluated 97 171 step=${scriptscolname[$i]} 98 172 getstepinfo 99 # check if walltime has to be set 100 if [ "$setwalltime" = "yes" ] 101 then 102 walltime=${walltimes[$i]} 103 fi 104 # check if memory has to be set 105 if [ "$setpmem" = "yes" ] 106 then 107 pmem=${pmems[$i]} 108 fi 173 109 174 # check if the script is restricted to one node 110 175 # (i.e. where output of previous step(s) is stored) 111 # this information is taken from the steps.rc file 176 # this information is taken from the steps.rc file 177 # currently this is implemented for sge only 178 # then get number of jobs to be done 112 179 if [ "$noderestricted" = "yes" ] 113 180 then … … 115 182 if [ $numevaluated -ge $numrestrictedscripts ] 116 183 then 117 currentnode=`echo $currentnode + 1| bc -l`184 currentnode=`echo " $currentnode + 1 " | bc -l` 118 185 numevaluated=1 119 186 else 120 numevaluated=`echo $numevaluated + 1| bc -l`187 numevaluated=`echo " $numevaluated + 1 " | bc -l` 121 188 fi 122 189 if [ $currentnode -gt $maxnode ] … … 138 205 noderequirementsub=" -hard -l hostname=compute-0-${currentnode}" 139 206 noderequirementstat=" -l hostname=compute-0-${currentnode}" 207 # get number of jobs to be done from the DB 140 208 getstatus $currentnode >> $jmscriptlog 2>&1 141 209 else 142 210 noderequirementsub="" 143 211 noderequirementstat="" 212 # get number of jobs to be done from the DB 144 213 getstatus >> $jmscriptlog 2>&1 145 214 fi 215 # store the number of processes to be done for this script 216 todo[$i]=$numproc 217 tododb[$i]=$numproc 146 218 147 # check number of processes to be done 148 echo `date +%F\ %T`" Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/step $db/$step]" >> $jmscriptlog 2>&1 149 if [ "$numproc" = "0" ] 219 # FIXME: sge cuts scriptname to 8 digits in qstat 220 # number of idle jobs, i.e. jobs waiting in the queue to run 221 # condor: 1 222 # sge: q 223 # pbs: Q 224 q4=( `echo ${q[@]} | egrep -o \("${scripts[$i]}"Jobstatus1\|"${scripts[$i]}"Jobstatusq\|"${scripts[$i]}"JobstatusQ\)` ) 225 idlescript[$i]=${#q4[@]} 226 227 q2=( `echo ${q[@]} | egrep -o "${scripts[$i]}"`) 228 queuedscript[$i]=${#q2[@]} 229 230 stillfree[$i]=`echo "${maxjobs[$i]} - ${queuedscript[$i]} " | bc -l` 231 232 if [ $numproc -eq 0 ] || [ ${todo[$i]} -le ${idlescript[$i]} ] || [ ${maxjobs[$i]} -le ${queuedscript[$i]} ] 150 233 then 151 prev=0 152 nothingtodocount=`expr $nothingtodocount + 1` 153 if [ $nothingtodocount -lt ${#scripts[@]} ] 154 then 155 nextscript 0 0 156 else 157 if [ $nothingtodosleeptime -lt $sleeptimelimit ] 234 # store the fraction of cpus to add it to another process 235 idleratio=`echo " ${ratio[$i]} + $idleratio " | bc -l` 236 ratio[$i]=0 237 todo[$i]=0 238 idlenum=$i 239 continue 240 fi 241 done 242 echo `date +%F\ %T`" Evaluated scripts: "${scripts[@]} >> $jmscriptlog 2>&1 243 echo `date +%F\ %T`" Running scripts: "${queuedscript[@]}" (max: "${maxjobs[@]}")" >> $jmscriptlog 2>&1 244 echo `date +%F\ %T`" Number of jobs to be done (from DB): "${tododb[@]} >> $jmscriptlog 2>&1 245 echo `date +%F\ %T`" Number of jobs to be done (updated): "${todo[@]} >> $jmscriptlog 2>&1 246 echo `date +%F\ %T`" Ratio: "${ratio[@]}" (idle: "$idleratio")" >> $jmscriptlog 2>&1 247 248 # loop to update the ratio taking into account the ratio of 249 # a) steps where nothing has to done 250 # b) steps where already enough jobs are in the queue 251 # sum up this idle ratio 252 # determine for which step still most jobs have to be done 253 if ! [ "$idleratio" = "0" ] 254 then 255 addtoscript= 256 for (( i=0 ; i < ${#scripts[@]} ; i++ )) 257 do 258 if [ ${todo[$i]} -gt ${todo[$idlenum]} ] && [ ${todo[$i]} -gt 0 ] 259 then 260 if ! [ "$addtoscript" = "" ] 158 261 then 159 nothingtodosleeptime=`echo " $nothingtodocount * $sleeptime " | bc` 262 if [ ${todo[$i]} -lt ${todo[$addtoscript]} ] 263 then 264 continue 265 fi 160 266 fi 161 nextscript nothingtodosleeptime $nothingtodosleeptime 162 fi 267 addtoscript=$i 268 fi 269 done 270 271 # continue in case nothing has to be done for all steps 272 # else: update the ratio for the step where most jobs have to be done 273 # by adding the idle ratio 274 if [ "$addtoscript" = "" ] 275 then 276 echo `date +%F\ %T`" No jobs to be done for any step." >> $jmscriptlog 2>&1 277 sleepawhile "ok" 163 278 else 164 nothingtodocount=0 165 nothingtodosleeptime=0 166 fi 167 168 # get processes in queue 169 q=(`checkqueue 2>&1 `) 170 if echo $q | egrep \(Error\|failed\) 171 then 172 echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1 173 printprocesslog "WARN checking query ($queuesys) failed" 174 echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmerrorlog 175 nextscript sleeptime $sleeptime 176 fi 177 # FIXME: sge cuts scriptname to 8 digits in qstat 178 # get processes of user in queue 179 q1=( `echo ${q[@]} | egrep -o "Owner$user"`) 180 queued=${#q1[@]} 181 # get number of scripts in queue 182 q2=( `echo ${q[@]} | egrep -o "${scripts[$i]}"`) 183 queuedscript=${#q2[@]} 184 # get running scripts 185 # condor: 2 186 # sge: r 187 # pbs: R 188 q3=( `echo ${q[@]} | egrep -o \("${scripts[$i]}"Jobstatus2\|"${scripts[$i]}"Jobstatusr\|"${scripts[$i]}"JobstatusR\)` ) 189 runningscript=${#q3[@]} 190 stillinqueue=`echo $queuedscript - $runningscript | bc ` 191 192 #get total number of allowed process for current time 193 hour=`date +%k` 194 if [ ${pnototal[$hour]} -lt $totalmax ] 195 then 196 totalpno=${pnototal[$hour]} 197 else 198 totalpno=$totalmax 279 ratio[$addtoscript]=`echo " ${ratio[$addtoscript]} + $idleratio " | bc -l` 280 fi 281 fi 282 echo `date +%F\ %T`" Updated ratio: "${ratio[@]} >> $jmscriptlog 2>&1 283 284 285 # loop to submit jobs to queueing system 286 for (( i=0 ; i < ${#scripts[@]} ; i++ )) 287 do 288 # calculate number of jobs to be submitted 289 tosubmit=`echo "scale=0; $tosubmittotal * ${ratio[$i]} / 1 " | bc -l` 290 if [ ${todo[$i]} -lt $tosubmit ] 291 then 292 echo `date +%F\ %T`" Updating tosubmit for "${scripts[$i]}" from "$tosubmit" to "${todo[$i]} >> $jmscriptlog 2>&1 293 tosubmit=${todo[$i]} 294 fi 295 if [ $tosubmit -eq 0 ] 296 then 297 echo `date +%F\ %T`" No jobs to be submitted for script '"${scripts[$i]}"'" >> $jmscriptlog 2>&1 298 continue 299 fi 300 if [ $tosubmit -gt ${stillfree[$i]} ] 301 then 302 echo `date +%F\ %T`" Updating tosubmit for "${scripts[$i]}" from "$tosubmit" to "${stillfree[$i]} >> $jmscriptlog 2>&1 303 tosubmit=${stillfree[$i]} 199 304 fi 200 305 201 #choose array according to the day of the week 202 dayofweek=`date +%u` 203 case $dayofweek in 204 0 | 6) pnos=( ${pnoswe[@]} ) ;; 205 *) pnos=( ${pnosweek[@]} ) ;; 206 esac 207 # get number of allowed scripts for current time 208 num=`echo "((( $i + 1 ) * 24 ) + ( $hour + 1 ) ) - 24 - 1 " | bc ` 209 pnoscript=${pnos[$num]} 210 # if there was nothing to do for previous script, more scripts can be allowed 211 if [ $prev -eq 0 ] 212 then 213 echo `date +%F\ %T`" \$prev=0 => resetting \$pnoscript from $pnoscript to $max [\$max]" >> $jmscriptlog 2>&1 214 pnoscript=$max 215 fi 216 echo `date +%F\ %T`" queue for user '$user': total: $queued queued jobs [allowed \$totalpno = $totalpno]" >> $jmscriptlog 2>&1 217 echo `date +%F\ %T`" queue for user '$user': ${scripts[$i]}: $queuedscript queued, $runningscript running, $stillinqueue idle [allowed \$pnoscript = $pnoscript]" >> $jmscriptlog 2>&1 218 219 # continue if there are already enough processes or scripts in the queue 220 if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ] 221 then 222 nextscript sleeptime $sleeptime 223 fi 224 # continue if the number of script is the queue is larger (or equal) than the number which still has to be done 225 if [ $numproc -le $stillinqueue ] 226 then 227 echo `date +%F\ %T`" \$numproc ($numproc) < \$stillinqueue ($stillinqueue) " >> $jmscriptlog 2>&1 228 nextscript sleeptime $sleeptime 229 fi 230 231 # reset prev 232 prev=$max 233 234 # submit 1 script to queuing system 306 # set the step to be evaluated 307 step=${scriptscolname[$i]} 308 # check if walltime has to be set 309 if [ "$setwalltime" = "yes" ] 310 then 311 walltime=${walltimes[$i]} 312 fi 313 # check if memory has to be set 314 if [ "$setpmem" = "yes" ] 315 then 316 pmem=${pmems[$i]} 317 fi 318 319 # submit $tosubmit scripts to queuing system 320 echo `date +%F\ %T`" Submitting "$tosubmit" jobs for script '"${scripts[$i]}"' to "$queuesys >> $jmscriptlog 2>&1 235 321 date=`date +%Y-%m-%d` 236 echo `date +%F\ %T`" committing 1 ${scripts[$i]} to $queuesys" >> $jmscriptlog 2>&1237 322 if ! queuesubmit 2>> $jmerrorlog 238 323 then … … 240 325 echo `date +%F\ %T`" WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> $jmscriptlog 2>&1 241 326 printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed" 242 if [ $errorsleeptime -lt $sleeptimelimit ] 243 then 244 errorsleeptime=`echo " $errorsleeptime + $errorsleeptimedefault " | bc` 245 fi 246 nextscript errorsleeptime $errorsleeptime 247 else 248 errorsleeptime=$errorsleeptimedefault 249 fi 250 nextscript sleeptime $sleeptime 327 sleepawhile "error" 328 fi 251 329 done 330 sleepawhile "ok" 252 331 done 253 332
Note:
See TracChangeset
for help on using the changeset viewer.