source: trunk/DataCheck/Processing/JobManager.sh@ 13369

Last change on this file since 13369 was 13050, checked in by Daniela Dorner, 13 years ago
added jobmanager to run callisto and star on a computing cluster
  • Property svn:executable set to *
File size: 12.4 KB
Line 
#!/bin/bash
#
# This is a script which launches other scripts (all scripts that are run
# on a primary basis).
#

# Load the setup relative to the location of this script. The path is quoted
# so that a checkout directory containing whitespace does not break it.
source "$(dirname "$0")/../Sourcefile.sh"
printprocesslog "INFO starting $0"

# noclobber: a plain '>' redirection must not overwrite an existing file
set -C
# the queue commands further down are defined as aliases; bash expands
# aliases in non-interactive shells only when this option is set
shopt -s expand_aliases
# Function to sleep between two rounds of the jobmanager and to go on with
# the next round / next script.
#
# Arguments:
#   $1 - status of the round that just finished:
#        "error"  sleep $errorsleeptime; as long as it is below
#                 $sleeptimelimit it is first increased by
#                 $errorsleeptimedefault (back-off)
#        "ok"     sleep $sleeptime and reset $errorsleeptime to
#                 $errorsleeptimedefault
#        other    sleep $sleeptime, $errorsleeptime is left untouched
# Globals:
#   read:    sleeptime, sleeptimelimit, errorsleeptimedefault, jmscriptlog
#   written: usedsleeptime, errorsleeptime
#
# NOTE: the final 'continue' only reaches the loop of the caller on
# bash < 4.4. Newer versions reset the loop state when a function is entered,
# so there it does nothing except print a diagnostic (silenced here).
# Callers that must skip the rest of their loop body on every bash version
# have to issue their own 'continue' after calling this function.
function sleepawhile()
{
   usedsleeptime=$sleeptime
   case "$1" in
      "error")
         if [ -n "$errorsleeptime" ]
         then
            if [ "$errorsleeptime" -lt "$sleeptimelimit" ]
            then
               # integer arithmetic of the shell, no external 'bc' needed
               errorsleeptime=$(( errorsleeptime + errorsleeptimedefault ))
            fi
            usedsleeptime=$errorsleeptime
         fi
         ;;
      "ok")
         errorsleeptime=$errorsleeptimedefault
         ;;
   esac
   echo "$(date +'%F %T') sleeping $usedsleeptime seconds... (status: $1)" >> "$jmscriptlog" 2>&1
   echo "" >> "$jmscriptlog" 2>&1
   sleep "$usedsleeptime"
   # kept for bash < 4.4 (see NOTE above)
   continue 2> /dev/null
}
35
# Start a new section in the jobmanager log. The line written with 'echo -n'
# is completed by the branch of the 'case' below that matches $queuesys.
echo "" >> "$jmscriptlog" 2>&1
echo "" >> "$jmscriptlog" 2>&1
echo -n "$(date +'%F %T') starting jobmanager for setup $AUTOMATIONSETUP" >> "$jmscriptlog" 2>&1

user=$(whoami)

# choose commands according to queueing system (defined in setup)
# The aliases are kept exactly as they were: the single-quoted one is
# expanded only when the alias is used, i.e. with the values the variables
# have at submission time.
case "$queuesys" in
   sge)
      echo " on queuing system 'sun grid engine'" >> "$jmscriptlog" 2>&1
      # (-hard) -l hostname=compute-*
      # for qstat this returns the jobs running on that node + all jobs in the queue
      alias 'queuesubmit'='$sgepath/qsub -b y -q fact_long -t 1-`echo $tosubmit` -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` -N `echo $step` `echo $scriptspath`/`echo ${scripts[$i]}` '
#      alias 'queuesubmit'='$sgepath/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
#      alias 'queuesubmit'='$sgepath/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
      # FIXME: get complete scriptname (including command line option), needed for runstereo
      # $sgepath is expanded here, at definition time; the escaped parts are
      # expanded when the alias is used. The awk output has the form
      # "Owner<col4> <col3>Jobstatus<col5>" per qstat line.
      alias 'checkqueue'="$sgepath/qstat \`echo \$noderequirementstat\` | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '"
      #break
      ;;
#   pbs) echo " on queuing system 'pbs'" >> $jmscriptlog 2>&1
#      alias 'queuesubmit'='$pbspath/qsub -t 1-`echo $tosubmit` -l walltime=`echo $walltime` -l pmem=`echo $pmem` -v AUTOMATIONSETUP=$AUTOMATIONSETUP,SOURCEFILEPATH=$SOURCEFILEPATH,SCRIPTNAME=`echo ${scripts[$i]}` -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
#      # check queue (restricted to current user only)
#      alias 'checkqueue'="$pbspath/qstat -a -u $user | awk ' { print \"Owner\"\$2\" \" \$4\"Jobstatus\"\$10 } '"
#      #break
#      ;;
#   condor) echo " on queuing system 'condor'" >> $jmscriptlog 2>&1
#      alias 'queuesubmit'='$condorpath/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` -a num=`echo $tosubmit` -a automationsetup=$AUTOMATIONSETUP `echo $scriptspath`/run.condor'
#      alias 'checkqueue'='$condorpath/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus '
#      #break
#      ;;
   *)
      # no supported queueing system configured: end the started log line
      # and call finish (defined outside of this file)
      echo "" >> "$jmscriptlog" 2>&1
      finish >> "$jmscriptlog" 2>&1
      ;;
esac

echo "" >> "$jmscriptlog" 2>&1
71
# for processing with local storage on different nodes
currentnode=$minnode
numevaluated=0

# endless loop ($notcount is never changed, so the condition stays true)
#
# Each round: re-read the setup, query the queue, decide how many jobs may be
# submitted in total, distribute them over the scripts according to their
# ratio and submit them.
#
# Every call of sleepawhile that has to end the current round is followed by
# an explicit 'continue': bash >= 4.4 resets the loop state inside functions,
# so the 'continue' within sleepawhile does not reach this loop any more.
notcount=0
errorsleeptime=$errorsleeptimedefault
while (( notcount < 100 ))
do
   # get and set some information for the processing
   source "$(dirname "$0")/../Sourcefile.sh"
   # reset some values
   tosubmit=0
   idleratio=0
   addtoscript=

   # get processes in queue
   # (one word per field of the checkqueue output, splitting is intended)
   q=( $(checkqueue 2>&1) )
   # check the complete output, not only its first word
   if echo "${q[*]}" | grep -E '(Error|failed)'
   then
      echo "$(date +'%F %T') WARN checking query ($queuesys) failed" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN checking query ($queuesys) failed"
      echo "$(date +'%F %T') WARN checking query ($queuesys) failed" >> "$jmerrorlog"
      sleepawhile "error"
      continue
   fi

   # general check whether one should submit something depending on chosen algorithm
   #  algorithm 1:
   #   submit new jobs in case there are less than $limitidle idle jobs in the queue
   #  algorithm 2:
   #   submit new jobs in case the total number of jobs in the queue has fallen below $totalpno
   case "$algorithm" in
      1) # algorithm 1
         # get number of idle jobs in the queue
         # NOTE(review): $tosubmittotal is not calculated in this branch
         # although it is used below - confirm that it is provided by the
         # setup when algorithm 1 is chosen
         q5=( $(echo "${q[*]}" | grep -Eo '(Jobstatus1|Jobstatusq|JobstatusQ)') )
         idle=${#q5[@]}
         if [ "$idle" -gt "$limitidle" ]
         then
            echo "$(date +'%F %T') more than $limitidle jobs waiting ($idle)" >> "$jmscriptlog" 2>&1
            sleepawhile "ok"
            continue
         fi
         ;;
      2) # algorithm 2
         # get processes of user in queue
         q1=( $(echo "${q[*]}" | grep -Eo "Owner$user") )
         queued=${#q1[@]}
         # %k: hour without leading zero (no octal trouble in the array index)
         hour=$(date +%k)
         # choose array of total number of jobs to be done
         #  according to the day of the week
         # %w counts 0 (Sunday) .. 6 (Saturday); with %u (1..7) the value 0
         # never occurred and Sunday was not recognized
         # NOTE(review): Saturday/Sunday select pnototal, all other days
         # pnototalwe - confirm that this mapping is the intended one
         dayofweek=$(date +%w)
         case "$dayofweek" in
            0 | 6) totalpno=${pnototal[$hour]} ;;
                *) totalpno=${pnototalwe[$hour]} ;;
         esac
         # get total number of jobs to be submitted
         if [ "$queued" -gt "$totalpno" ]
         then
            echo "$(date +'%F %T') more than $totalpno jobs waiting ($queued)" >> "$jmscriptlog" 2>&1
            sleepawhile "ok"
            continue
         else
            tosubmittotal=$(echo "$totalpno - $queued" | bc -l)
         fi
         ;;
      *) echo "Please give an algorithm to calculate the number of allowed jobs."
         exit
         ;;
   esac
   echo "$(date +'%F %T') Total number of jobs to be submitted: $tosubmittotal" >> "$jmscriptlog" 2>&1

   # first loop to determine
   # a) how many jobs of this script have to be done
   # b) how many jobs of this script are running or queued
   todo=()
   tododb=()
   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
   do
      # set the step to be evaluated
      step=${scriptscolname[$i]}
      getstepinfo

      # check if the script is restricted to one node
      #  (i.e. where output of previous step(s) is stored)
      #  this information is taken from the steps.rc file
      #  currently this is implemented for sge only
      # then get number of jobs to be done
      if [ "$noderestricted" = "yes" ]
      then
         # get number of next node
         if [ "$numevaluated" -ge "$numrestrictedscripts" ]
         then
            currentnode=$(( currentnode + 1 ))
            numevaluated=1
         else
            numevaluated=$(( numevaluated + 1 ))
         fi
         if [ "$currentnode" -gt "$maxnode" ]
         then
            currentnode=$minnode
         fi
         # check if node is excluded
         # (unquoted on purpose: keeps the word splitting of the setup value)
         for excludednode in ${excludednodes[@]}
         do
            if [ "$currentnode" -eq "$excludednode" ]
            then
               echo "$(date +'%F %T') Node compute-0-$currentnode is currently excluded." >> "$jmscriptlog" 2>&1
               # nothing may be submitted for this script in this round;
               # without these entries the integer tests in the loops below
               # would run on unset values
               todo[$i]=0
               tododb[$i]=0
               continue 2
            fi
         done
         # define requirement for submission
         # FIXME: currently only for sge at isdc
         echo "$(date +'%F %T') Checking for node $currentnode. " >> "$jmscriptlog" 2>&1
         noderequirementsub=" -hard -l hostname=compute-0-${currentnode}"
         noderequirementstat=" -l hostname=compute-0-${currentnode}"
         # get number of jobs to be done from the DB
         getstatus $currentnode >> "$jmscriptlog" 2>&1
      else
         noderequirementsub=""
         noderequirementstat=""
         # get number of jobs to be done from the DB
         getstatus >> "$jmscriptlog" 2>&1
      fi
      # store the number of processes to be done for this script
      todo[$i]=$numproc
      tododb[$i]=$numproc

      # FIXME: sge cuts scriptname to 8 digits in qstat
      # number of idle jobs, i.e. jobs waiting in the queue to run
      #  condor: 1
      #  sge: q
      #  pbs: Q
      #q4=( `echo ${q[@]} | egrep -o \("${scripts[$i]}"Jobstatus1\|"${scripts[$i]}"Jobstatusq\|"${scripts[$i]}"JobstatusQ\)` )
      q4=( $(echo "${q[*]}" | grep -Eo "(${step}Jobstatus1|${step}Jobstatusq|${step}JobstatusQ)") )
      idlescript[$i]=${#q4[@]}

      #q2=( `echo ${q[@]} | egrep -o "${scripts[$i]}"`)
      q2=( $(echo "${q[*]}" | grep -Eo "$step") )
      queuedscript[$i]=${#q2[@]}

      stillfree[$i]=$(echo "${maxjobs[$i]} - ${queuedscript[$i]} " | bc -l)

      if [ "$numproc" -eq 0 ] || [ "${todo[$i]}" -le "${idlescript[$i]}" ] || [ "${maxjobs[$i]}" -le "${queuedscript[$i]}" ]
      then
         # store the fraction of cpus to add it to another process
         idleratio=$(echo " ${ratio[$i]} + $idleratio " | bc -l)
         ratio[$i]=0
         todo[$i]=0
         idlenum=$i
         continue
      fi
   done
   echo "$(date +'%F %T') Evaluated scripts: ${scripts[*]}" >> "$jmscriptlog" 2>&1
   echo "$(date +'%F %T') Running scripts: ${queuedscript[*]} (max: ${maxjobs[*]})" >> "$jmscriptlog" 2>&1
   echo "$(date +'%F %T') Number of jobs to be done (from DB): ${tododb[*]}" >> "$jmscriptlog" 2>&1
   echo "$(date +'%F %T') Number of jobs to be done (updated): ${todo[*]}" >> "$jmscriptlog" 2>&1
   echo "$(date +'%F %T') Ratio: ${ratio[*]} (idle: $idleratio)" >> "$jmscriptlog" 2>&1

   # loop to update the ratio taking into account the ratio of
   #  a) steps where nothing has to be done
   #  b) steps where already enough jobs are in the queue
   # sum up this idle ratio
   # determine for which step still most jobs have to be done
   if ! [ "$idleratio" = "0" ]
   then
      addtoscript=
      for (( i=0 ; i < ${#scripts[@]} ; i++ ))
      do
         if [ "${todo[$i]}" -gt "${todo[$idlenum]}" ] && [ "${todo[$i]}" -gt 0 ]
         then
            if ! [ "$addtoscript" = "" ]
            then
               if [ "${todo[$i]}" -lt "${todo[$addtoscript]}" ]
               then
                  continue
               fi
            fi
            addtoscript=$i
         fi
      done

      # continue in case nothing has to be done for all steps
      # else: update the ratio for the step where most jobs have to be done
      #       by adding the idle ratio
      if [ "$addtoscript" = "" ]
      then
         echo "$(date +'%F %T') No jobs to be done for any step." >> "$jmscriptlog" 2>&1
         sleepawhile "ok"
         continue
      else
         ratio[$addtoscript]=$(echo " ${ratio[$addtoscript]} + $idleratio " | bc -l)
      fi
   fi
   echo "$(date +'%F %T') Updated ratio: ${ratio[*]}" >> "$jmscriptlog" 2>&1


   # loop to submit jobs to queueing system
   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
   do
      # calculate number of jobs to be submitted
      # (scale=0 and the division by 1 cut the result down to an integer)
      tosubmit=$(echo "scale=0; $tosubmittotal * ${ratio[$i]} / 1 " | bc -l)
      if [ "${todo[$i]}" -lt "$tosubmit" ]
      then
         echo "$(date +'%F %T') Updating tosubmit for ${scripts[$i]} from $tosubmit to ${todo[$i]}" >> "$jmscriptlog" 2>&1
         tosubmit=${todo[$i]}
      fi
      if [ "$tosubmit" -eq 0 ]
      then
         echo "$(date +'%F %T') No jobs to be submitted for script '${scripts[$i]}'" >> "$jmscriptlog" 2>&1
         continue
      fi
      if [ "$tosubmit" -gt "${stillfree[$i]}" ]
      then
         echo "$(date +'%F %T') Updating tosubmit for ${scripts[$i]} from $tosubmit to ${stillfree[$i]}" >> "$jmscriptlog" 2>&1
         tosubmit=${stillfree[$i]}
      fi

      # set the step to be evaluated
      step=${scriptscolname[$i]}
      # check if walltime has to be set
      if [ "$setwalltime" = "yes" ]
      then
         walltime=${walltimes[$i]}
      fi
      # check if memory has to be set
      if [ "$setpmem" = "yes" ]
      then
         pmem=${pmems[$i]}
      fi

      # submit $tosubmit scripts to queuing system
      #tosubmit=1 #workaround for test on fact cluster
      echo "$(date +'%F %T') Submitting $tosubmit jobs for script '${scripts[$i]}' to $queuesys" >> "$jmscriptlog" 2>&1
      date=$(date +%Y-%m-%d)
      if ! queuesubmit 2>> "$jmerrorlog"
      then
         # plain 'date' output, left unquoted as before
         echo $(date)" WARN submitting job ($queuesys) failed" >> "$jmerrorlog"
         echo "$(date +'%F %T') WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> "$jmscriptlog" 2>&1
         printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
         sleepawhile "error"
         # go on with the next script
         continue
      fi
   done
   sleepawhile "ok"
done
313
Note: See TracBrowser for help on using the repository browser.