source: trunk/Mars/datacenter/scripts/jobmanager@ 10305

Last change on this file since 10305 was 10038, checked in by Daniela Dorner, 14 years ago
new algorithm
  • Property svn:executable set to *
File size: 13.1 KB
Line 
1#!/bin/sh
2#
3# ========================================================================
4#
5# *
6# * This file is part of MARS, the MAGIC Analysis and Reconstruction
7# * Software. It is distributed to you in the hope that it can be a useful
8# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
9# * It is distributed WITHOUT ANY WARRANTY.
10# *
11# * Permission to use, copy, modify and distribute this software and its
12# * documentation for any purpose is hereby granted without fee,
13# * provided that the above copyright notice appear in all copies and
14# * that both that copyright notice and this permission notice appear
15# * in supporting documentation. It is provided "as is" without express
16# * or implied warranty.
17# *
18#
19#
20# Author(s): Daniela Dorner 05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
21#
22# Copyright: MAGIC Software Development, 2000-2010
23#
24#
25# ========================================================================
26#
27# This is a script that launches other scripts (all scripts that are run
28# on primary basis)
29#
30
# Load the setup file that sits next to this script. The path is quoted so
# that a script location containing whitespace still resolves.
# printprocesslog is not defined in this file; presumably it comes from
# sourcefile -- verify.
source "$(dirname "$0")/sourcefile"
printprocesslog "INFO starting $0"

# noclobber: a plain '>' redirection must not overwrite an existing file
# (all log writes in this script append with '>>').
set -C
35
# Sleep before the next pass of the calling loop and log the pause.
#
# Arguments:
#   $1 - status of the pass that just ended:
#        "error": if errorsleeptime is non-empty, it is raised by
#                 errorsleeptimedefault as long as it is still below
#                 sleeptimelimit, and the function sleeps errorsleeptime
#                 seconds; if errorsleeptime is empty, it sleeps sleeptime
#        "ok":    errorsleeptime is reset to errorsleeptimedefault and the
#                 function sleeps sleeptime seconds
#        any other value sleeps sleeptime and leaves errorsleeptime alone
# Globals read:    sleeptime, sleeptimelimit, errorsleeptimedefault,
#                  jmscriptlog
# Globals written: usedsleeptime, errorsleeptime
#
# The sleep times have to be plain decimal integers (they are compared
# with -lt and added with shell arithmetic).
#
# NOTE: the trailing 'continue' only reaches the loop of the caller on
# bash older than 4.4 (or with a compatibility level of 43 or lower).
# Newer bash resets the loop state when a function is entered, so there the
# statement has no effect on the caller.
sleepawhile()
{
   usedsleeptime=$sleeptime
   case "$1" in
      error)
         if [ -n "$errorsleeptime" ]
         then
            if [ "$errorsleeptime" -lt "$sleeptimelimit" ]
            then
               # names without '$' so that an empty value counts as 0
               # instead of producing an arithmetic syntax error
               errorsleeptime=$(( errorsleeptime + errorsleeptimedefault ))
            fi
            usedsleeptime=$errorsleeptime
         fi
         ;;
      ok)
         errorsleeptime=$errorsleeptimedefault
         ;;
   esac
   {
      echo "$(date '+%F %T') sleeping $usedsleeptime seconds... (status: $1)"
      echo ""
   } >> "$jmscriptlog" 2>&1
   sleep "$usedsleeptime"
   continue
}
58
# name of the account running the jobmanager (used for the queue queries)
user=$(whoami)

# start a new section in the jobmanager log: two empty lines followed by the
# start message; the message is written without a newline because the
# queueing system is appended to the same line afterwards
{
   echo ""
   echo ""
   echo -n $(date +%F\ %T)" starting jobmanager for setup "$AUTOMATIONSETUP
} >> $jmscriptlog 2>&1
64
# choose commands according to queueing system (defined in setup)
#
# Two aliases are defined per queueing system:
#   queuesubmit - submits $tosubmit jobs of the script ${scripts[$i]}
#   checkqueue  - lists the queue, one "Owner<user> <name>Jobstatus<state>"
#                 entry per job
# The single-quoted aliases are expanded when they are used, i.e. they pick
# up the values the variables have at that moment. The double-quoted
# checkqueue alias for pbs takes $pbspath and $user at definition time.
#
# The aliases have to be defined in a command that is complete before the
# main loop is read, because bash expands aliases while reading a command.
# NOTE(review): a non-interactive bash only expands aliases in POSIX mode
# (as when started as /bin/sh) or with 'shopt -s expand_aliases' --
# presumably one of the two applies here; verify against sourcefile.
#
# The arms end with ';;' only: 'break' is meaningless outside a loop.
case $queuesys in
   sge) echo " on queuing system 'sun grid engine'" >> $jmscriptlog 2>&1
      # (-hard) -l hostname=compute-*
      # for qstat this returns the jobs running on that node + all jobs in the queue
      alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -t 1-`echo $tosubmit` -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
#      alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
#      alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
      # FIXME: get complete scriptname (including command line option), needed for runstereo
      alias 'checkqueue'="/opt/gridengine/bin/lx26-amd64/qstat \`echo \$noderequirementstat\` | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '"
      ;;
   pbs) echo " on queuing system 'pbs'" >> $jmscriptlog 2>&1
      alias 'queuesubmit'='$pbspath/qsub -t 1-`echo $tosubmit` -l walltime=`echo $walltime` -l pmem=`echo $pmem` -v AUTOMATIONSETUP=$AUTOMATIONSETUP,SOURCEFILEPATH=$SOURCEFILEPATH,SCRIPTNAME=`echo ${scripts[$i]}` -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
      # check queue (restricted to current user only)
      alias 'checkqueue'="$pbspath/qstat -a -u $user | awk ' { print \"Owner\"\$2\" \" \$4\"Jobstatus\"\$10 } '"
      ;;
   condor) echo " on queuing system 'condor'" >> $jmscriptlog 2>&1
      alias 'queuesubmit'='$condorpath/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` -a num=`echo $tosubmit` -a automationsetup=$AUTOMATIONSETUP `echo $scriptspath`/run.condor'
      alias 'checkqueue'='$condorpath/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus '
      ;;
   *) # unknown queueing system: end the log line and call finish
      # (finish is not defined in this file; presumably it comes from
      # sourcefile and terminates the script -- verify)
      echo "" >> $jmscriptlog 2>&1
      finish >> $jmscriptlog 2>&1
      ;;
esac
92
# finish the log line which names the queueing system
printf '\n' >> $jmscriptlog 2>&1

# state that is kept across the passes of the main loop

# back-off time used after a failed pass; starts at its default value
errorsleeptime=$errorsleeptimedefault

# for processing with local storage on different nodes:
# node that is evaluated at the moment and number of node-restricted
# scripts which have already been evaluated for it
numevaluated=0
currentnode=$minnode

# endless loop
# (counter of the loop condition below)
notcount=0
# Main loop of the jobmanager. notcount is not modified anywhere in this
# file, so the condition stays true and the loop runs until the process is
# ended from outside or by the 'exit' in the algorithm selection.
#
# Each pass
#   1. re-reads the setup (sourcefile),
#   2. queries the queueing system (alias checkqueue),
#   3. decides according to $algorithm whether jobs may be submitted,
#   4. collects for each script the number of jobs to do / idle / queued,
#   5. gives the ratio of the scripts with nothing to submit to the script
#      with the most jobs left,
#   6. submits the jobs (alias queuesubmit).
#
# getstepinfo, getstatus and printprocesslog are not defined in this file
# (presumably they come from sourcefile -- verify). getstatus is expected
# to provide $numproc, getstepinfo to provide $noderestricted.
#
# sleepawhile ends with 'continue', but bash 4.4 and newer do not let a
# function continue a loop of its caller. Each call that has to abandon the
# current pass is therefore followed by an explicit 'continue'.
while (( $notcount < 100 ))
do
   # get and set some information for the processing
   source "$(dirname "$0")/sourcefile"
   # reset some values
   tosubmit=0
   idleratio=0
   addtoscript=

   # get processes in queue
   # (the alias has to stay a plain command word to be expanded)
   q=(`checkqueue 2>&1 `)
   # look at the complete output: $q alone would be the first word only
   if echo "${q[@]}" | grep -E -q '(Error|failed)'
   then
      echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1
      printprocesslog "WARN checking query ($queuesys) failed"
      echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmerrorlog
      sleepawhile "error"
      continue
   fi

   # general check whether one should submit something depending on chosen algorithm
   # algorithm 1:
   #   submit new jobs in case there are less than $limitidle idle jobs in the queue
   # algorithm 2:
   #   submit new jobs in case the total number of jobs in the queue has fallen below $totalpno
   case $algorithm in
      1) # algorithm 1
         # NOTE(review): tosubmittotal is only assigned in the branch of
         # algorithm 2; for algorithm 1 it presumably has to come from the
         # setup -- confirm.
         # get number of idle jobs in the queue
         q5=( `echo ${q[@]} | grep -E -o \(Jobstatus1\|Jobstatusq\|JobstatusQ\)` )
         idle=${#q5[@]}
         if [ $idle -gt $limitidle ]
         then
            echo `date +%F\ %T`" more than "$limitidle" jobs waiting ("$idle")" >> $jmscriptlog 2>&1
            sleepawhile "ok"
            continue
         fi
         ;;
      2) # algorithm 2
         # get processes of user in queue
         q1=( `echo ${q[@]} | grep -E -o "Owner$user"`)
         queued=${#q1[@]}
         # %k: hour without leading zero (a leading zero would be read as
         # octal in the array subscript)
         hour=`date +%k`
         # choose array of total number of jobs to be done
         # according to the day of the week
         # %u: 1 (Monday) ... 7 (Sunday), i.e. Saturday = 6 and Sunday = 7
         dayofweek=`date +%u`
         # NOTE(review): the weekend gets pnototal and the other days get
         # pnototalwe; if 'we' stands for weekend, the two assignments are
         # swapped -- confirm against the setup.
         case $dayofweek in
            6 | 7) totalpno=${pnototal[$hour]} ;;
                *) totalpno=${pnototalwe[$hour]} ;;
         esac
         # get total number of jobs to be submitted
         if [ $queued -gt $totalpno ]
         then
            echo `date +%F\ %T`" more than "$totalpno" jobs waiting ("$queued")" >> $jmscriptlog 2>&1
            sleepawhile "ok"
            continue
         else
            tosubmittotal=`echo "$totalpno - $queued" | bc -l`
         fi
         ;;
      *) echo "Please give an algorithm to calculate the number of allowed jobs."
         exit
         ;;
   esac
   echo `date +%F\ %T`" Total number of jobs to be submitted: "$tosubmittotal >> $jmscriptlog 2>&1


   # first loop to determine
   # a) how many jobs of this script have to be done
   # b) how many jobs of this script are running or queued
   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
   do
      # set the step to be evaluated
      step=${scriptscolname[$i]}
      getstepinfo

      # check if the script is restricted to one node
      # (i.e. where output of previous step(s) is stored)
      # this information is taken from the steps.rc file
      # currently this is implemented for sge only
      # then get number of jobs to be done
      if [ "$noderestricted" = "yes" ]
      then
         # get number of next node
         if [ $numevaluated -ge $numrestrictedscripts ]
         then
            currentnode=`echo " $currentnode + 1 " | bc -l`
            numevaluated=1
         else
            numevaluated=`echo " $numevaluated + 1 " | bc -l`
         fi
         # wrap around to the first node after the last one
         if [ $currentnode -gt $maxnode ]
         then
            currentnode=$minnode
         fi
         # check if node is excluded
         for excludednode in ${excludednodes[@]}
         do
            if [ $currentnode -eq $excludednode ]
            then
               echo `date +%F\ %T`" Node compute-0-$currentnode is currently excluded." >> $jmscriptlog 2>&1
               # go on with the next script; the entries of this script in
               # todo, tododb, ... are not updated in this pass
               continue 2
            fi
         done
         # define requirement for submission
         # FIXME: currently only for sge at isdc
         echo `date +%F\ %T`" Checking for node $currentnode. " >> $jmscriptlog 2>&1
         noderequirementsub=" -hard -l hostname=compute-0-${currentnode}"
         noderequirementstat=" -l hostname=compute-0-${currentnode}"
         # get number of jobs to be done from the DB
         getstatus $currentnode >> $jmscriptlog 2>&1
      else
         noderequirementsub=""
         noderequirementstat=""
         # get number of jobs to be done from the DB
         getstatus >> $jmscriptlog 2>&1
      fi
      # store the number of processes to be done for this script
      # (todo is modified below, tododb keeps the value for the log)
      todo[$i]=$numproc
      tododb[$i]=$numproc

      # FIXME: sge cuts scriptname to 8 digits in qstat
      # number of idle jobs, i.e. jobs waiting in the queue to run
      #  condor: 1
      #  sge: q
      #  pbs: Q
      q4=( `echo ${q[@]} | grep -E -o \("${scripts[$i]}"Jobstatus1\|"${scripts[$i]}"Jobstatusq\|"${scripts[$i]}"JobstatusQ\)` )
      idlescript[$i]=${#q4[@]}

      # number of jobs of this script in the queue (any status)
      q2=( `echo ${q[@]} | grep -E -o "${scripts[$i]}"`)
      queuedscript[$i]=${#q2[@]}

      # number of jobs which may still be added for this script
      stillfree[$i]=`echo "${maxjobs[$i]} - ${queuedscript[$i]} " | bc -l`

      # nothing to submit for this script: nothing to do, enough jobs
      # already waiting, or the maximum number of jobs is reached
      if [ $numproc -eq 0 ] || [ ${todo[$i]} -le ${idlescript[$i]} ] || [ ${maxjobs[$i]} -le ${queuedscript[$i]} ]
      then
         # store the fraction of cpus to add it to another process
         idleratio=`echo " ${ratio[$i]} + $idleratio " | bc -l`
         ratio[$i]=0
         todo[$i]=0
         idlenum=$i
         continue
      fi
   done
   echo `date +%F\ %T`" Evaluated scripts: "${scripts[@]} >> $jmscriptlog 2>&1
   echo `date +%F\ %T`" Running scripts: "${queuedscript[@]}" (max: "${maxjobs[@]}")" >> $jmscriptlog 2>&1
   echo `date +%F\ %T`" Number of jobs to be done (from DB): "${tododb[@]} >> $jmscriptlog 2>&1
   echo `date +%F\ %T`" Number of jobs to be done (updated): "${todo[@]} >> $jmscriptlog 2>&1
   echo `date +%F\ %T`" Ratio: "${ratio[@]}" (idle: "$idleratio")" >> $jmscriptlog 2>&1

   # loop to update the ratio taking into account the ratio of
   # a) steps where nothing has to be done
   # b) steps where already enough jobs are in the queue
   # sum up this idle ratio
   # determine for which step still most jobs have to be done
   if ! [ "$idleratio" = "0" ]
   then
      addtoscript=
      for (( i=0 ; i < ${#scripts[@]} ; i++ ))
      do
         # todo[$idlenum] was set to 0 above, i.e. only steps with jobs
         # left are candidates
         if [ ${todo[$i]} -gt ${todo[$idlenum]} ] && [ ${todo[$i]} -gt 0 ]
         then
            if ! [ "$addtoscript" = "" ]
            then
               # keep the candidate found so far if it has more jobs left
               if [ ${todo[$i]} -lt ${todo[$addtoscript]} ]
               then
                  continue
               fi
            fi
            addtoscript=$i
         fi
      done

      # continue in case nothing has to be done for all steps
      # else: update the ratio for the step where most jobs have to be done
      #       by adding the idle ratio
      if [ "$addtoscript" = "" ]
      then
         echo `date +%F\ %T`" No jobs to be done for any step." >> $jmscriptlog 2>&1
         sleepawhile "ok"
         continue
      else
         ratio[$addtoscript]=`echo " ${ratio[$addtoscript]} + $idleratio " | bc -l`
      fi
   fi
   echo `date +%F\ %T`" Updated ratio: "${ratio[@]} >> $jmscriptlog 2>&1


   # loop to submit jobs to queueing system
   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
   do
      # calculate number of jobs to be submitted
      # (scale=0 and the division by 1 cut off the fractional part)
      tosubmit=`echo "scale=0; $tosubmittotal * ${ratio[$i]} / 1 " | bc -l`
      # do not submit more jobs than have to be done
      if [ ${todo[$i]} -lt $tosubmit ]
      then
         echo `date +%F\ %T`" Updating tosubmit for "${scripts[$i]}" from "$tosubmit" to "${todo[$i]} >> $jmscriptlog 2>&1
         tosubmit=${todo[$i]}
      fi
      if [ $tosubmit -eq 0 ]
      then
         echo `date +%F\ %T`" No jobs to be submitted for script '"${scripts[$i]}"'" >> $jmscriptlog 2>&1
         continue
      fi
      # do not exceed the maximum number of jobs of this script
      if [ $tosubmit -gt ${stillfree[$i]} ]
      then
         echo `date +%F\ %T`" Updating tosubmit for "${scripts[$i]}" from "$tosubmit" to "${stillfree[$i]} >> $jmscriptlog 2>&1
         tosubmit=${stillfree[$i]}
      fi

      # set the step to be evaluated
      step=${scriptscolname[$i]}
      # check if walltime has to be set
      if [ "$setwalltime" = "yes" ]
      then
         walltime=${walltimes[$i]}
      fi
      # check if memory has to be set
      if [ "$setpmem" = "yes" ]
      then
         pmem=${pmems[$i]}
      fi

      # submit $tosubmit scripts to queuing system
      echo `date +%F\ %T`" Submitting "$tosubmit" jobs for script '"${scripts[$i]}"' to "$queuesys >> $jmscriptlog 2>&1
      # date is used by the queuesubmit alias for the names of the log files
      date=`date +%Y-%m-%d`
      if ! queuesubmit 2>> $jmerrorlog
      then
         echo `date`" WARN submitting job ($queuesys) failed" >> $jmerrorlog
         echo `date +%F\ %T`" WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> $jmscriptlog 2>&1
         printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
         sleepawhile "error"
         # go on with the next script
         continue
      fi
   done
   sleepawhile "ok"
done
332
Note: See TracBrowser for help on using the repository browser.