source: trunk/DataCheck/Processing/JobManager.sh@ 20037

Last change on this file since 20037 was 18764, checked in by Daniela Dorner, 8 years ago
rewrote submitting part - now primaries are given in submit to avoid too many queries at a time
  • Property svn:executable set to *
File size: 13.5 KB
Line 
#!/bin/bash
#
# This is a script which launches other scripts (all scripts that are run
# on primary basis)
#

# Load the shared setup relative to the location of this script. The path is
# quoted so that an installation directory containing whitespace still works.
source "$(dirname "$0")/../Sourcefile.sh"
printprocesslog "INFO starting $0"

# noclobber: a plain '>' redirection must never truncate an existing file
# (all log output in this script is appended with '>>')
set -C
# the queue commands (queuesubmit / checkqueue) are defined as aliases further
# down; a non-interactive bash only expands aliases when this option is set
shopt -s expand_aliases
12
# function to continue in loop and go to next script
#
# Writes a "sleeping ..." entry to $jmscriptlog, sleeps, and finally issues
# 'continue'.
#
# Arguments: $1 - status of the pass that just ended
#                 "error": back off - sleep for $errorsleeptime (if it is set),
#                          after raising it by $errorsleeptimedefault as long
#                          as it is still below $sleeptimelimit
#                 "ok":    sleep for $sleeptime and reset $errorsleeptime to
#                          $errorsleeptimedefault
#                 any other value: sleep for $sleeptime, back-off untouched
# Globals:   sleeptime, sleeptimelimit, errorsleeptimedefault, jmscriptlog (read)
#            errorsleeptime, usedsleeptime (written)
#
# NOTE: whether the final 'continue' reaches the loop of the *caller* depends
# on the bash version: releases before 4.4 let it through, newer ones confine
# it to the function (and print a warning). Callers that must restart their
# loop should therefore follow the call with their own 'continue'.
function sleepawhile()
{
    usedsleeptime=$sleeptime
    case $1 in
        "error")
            if [ -n "$errorsleeptime" ]
            then
                if [ "$errorsleeptime" -lt "$sleeptimelimit" ]
                then
                    # integer back-off step; the -lt test above already
                    # requires integers, so no external 'bc' is needed
                    errorsleeptime=$(( errorsleeptime + errorsleeptimedefault ))
                fi
                usedsleeptime=$errorsleeptime
            fi
            ;;
        "ok")
            errorsleeptime=$errorsleeptimedefault
            ;;
    esac
    echo "$(date +%F\ %T) sleeping $usedsleeptime seconds... (status: $1)" >> "$jmscriptlog" 2>&1
    echo "" >> "$jmscriptlog" 2>&1
    sleep "$usedsleeptime"
    continue
}
35
# Open the log entry for this jobmanager start. The last echo uses -n on
# purpose: the queueing-system branch below completes the line.
echo "" >> $jmscriptlog 2>&1
echo "" >> $jmscriptlog 2>&1
echo -n `date +%F\ %T`" starting jobmanager for setup "$AUTOMATIONSETUP >> $jmscriptlog 2>&1

# account name, used to pick the own jobs out of the queue listing
user=$(whoami)

# choose commands according to queueing system (defined in setup)
#
# Two aliases are defined per queueing system:
#   queuesubmit - submits ${scripts[$i]} for the current $step/$night/$seqid
#   checkqueue  - lists the queue, one "Owner<user> <name>Jobstatus<state>"
#                 record per line
# 'queuesubmit' is single-quoted, so all of its variables are expanded when it
# is used. 'checkqueue' is double-quoted, so $sgepath is expanded right here,
# while the escaped parts (\$...) are expanded when it is used.
# Aliases are replaced textually: a redirection written after 'checkqueue' by
# a caller ends up behind the awk command, i.e. it applies to awk only and not
# to qstat.
case $queuesys in
    sge) echo " on queuing system 'sun grid engine'" >> $jmscriptlog 2>&1
        # (-hard) -l hostname=compute-*
        # for qstat this returns the jobs running on that node + all jobs in the queue
        alias 'queuesubmit'='$sgepath/qsub -b y -q `echo ${queues[$i]}` -t 1-`echo $tosubmit` -v AUTOMATIONSETUP=$AUTOMATIONSETUP -v night=$night -v seqid=$seqid -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` -N `echo $step` `echo $scriptspath`/`echo ${scripts[$i]}` '
        #alias 'queuesubmit'='$sgepath/qsub -b y -q fact_long -t 1-`echo $tosubmit` -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` -N `echo $step` `echo $scriptspath`/`echo ${scripts[$i]}` '
#        alias 'queuesubmit'='$sgepath/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
#        alias 'queuesubmit'='$sgepath/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
        # FIXME: get complete scriptname (including command line option), needed for runstereo
        alias 'checkqueue'="$sgepath/qstat \`echo \$noderequirementstat\` | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '"
        #break
        ;;
#    pbs) echo " on queuing system 'pbs'" >> $jmscriptlog 2>&1
#        alias 'queuesubmit'='$pbspath/qsub -t 1-`echo $tosubmit` -l walltime=`echo $walltime` -l pmem=`echo $pmem` -v AUTOMATIONSETUP=$AUTOMATIONSETUP,SOURCEFILEPATH=$SOURCEFILEPATH,SCRIPTNAME=`echo ${scripts[$i]}` -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
#        # check queue (restricted to current user only)
#        alias 'checkqueue'="$pbspath/qstat -a -u $user | awk ' { print \"Owner\"\$2\" \" \$4\"Jobstatus\"\$10 } '"
#        #break
#        ;;
#    condor) echo " on queuing system 'condor'" >> $jmscriptlog 2>&1
#        alias 'queuesubmit'='$condorpath/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` -a num=`echo $tosubmit` -a automationsetup=$AUTOMATIONSETUP `echo $scriptspath`/run.condor'
#        alias 'checkqueue'='$condorpath/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus '
#        #break
#        ;;
    *)  echo "" >> $jmscriptlog 2>&1
        # say why nothing is started: only the systems with an active branch
        # above get their queue commands defined
        echo `date +%F\ %T`" ERROR unsupported queueing system '$queuesys' - no queue commands defined" >> $jmscriptlog 2>&1
        # 'finish' is not defined in this file; presumably it ends the
        # script - TODO confirm
        finish >> $jmscriptlog 2>&1
        ;;
esac

echo "" >> $jmscriptlog 2>&1
72
# for processing with local storage on different nodes
currentnode=$minnode
numevaluated=0

# endless loop ($notcount is never modified, so the condition stays true)
#
# Every pass:
#   1. re-reads the setup and lists the queue
#   2. decides how many jobs may be submitted in total ($algorithm)
#   3. determines per script how much is to be done / already queued
#   4. hands the share of scripts with nothing to submit to the script with
#      the most work
#   5. submits the jobs one by one (night/seqid taken from $primaries)
#   6. sleeps
#
# sleepawhile ends with 'continue', but from bash 4.4 on a 'continue' inside
# a function no longer affects the loop of the caller. Wherever the rest of
# the pass must be skipped, the call is therefore followed by an explicit
# 'continue' (unreachable, and thus harmless, on older bash versions).
notcount=0
errorsleeptime=$errorsleeptimedefault
while (( $notcount < 100 ))
do
    # get and set some information for the processing
    source "$(dirname "$0")/../Sourcefile.sh"
    # reset some values
    tosubmit=0
    idleratio=0
    addtoscript=

    # get processes in queue
    # (checkqueue is an alias ending in a pipe to awk, so the 2>&1 written
    # here is attached to awk, not to the queue listing command itself)
    q=(`checkqueue 2>&1 `)
    # inspect the complete listing; a bare $q would only be its first word
    if echo "${q[@]}" | grep -Eq '(Error|failed)'
    then
        echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1
        printprocesslog "WARN checking query ($queuesys) failed"
        echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmerrorlog
        sleepawhile "error"
        continue
    fi

    # general check whether one should submit something depending on chosen algorithm
    # algorithm 1:
    #  submit new jobs in case there are less than $limitidle idle jobs in the queue
    # algorithm 2:
    #  submit new jobs in case the total number of jobs in the queue has fallen below $totalpno
    case $algorithm in
        1)  # algorithm 1
            # get number of idle jobs in the queue
            q5=( `echo ${q[@]} | grep -Eo '(Jobstatus1|Jobstatusq|JobstatusQ)'` )
            idle=${#q5[@]}
            if [ $idle -gt $limitidle ]
            then
                echo `date +%F\ %T`" more than "$limitidle" jobs waiting ("$idle")" >> $jmscriptlog 2>&1
                sleepawhile "ok"
                continue
            fi
            # NOTE(review): $tosubmittotal is not assigned in this branch
            # although it is used below - presumably it has to come from the
            # setup; confirm.
            ;;
        2)  # algorithm 2
            # get processes of user in queue
            q1=( `echo ${q[@]} | grep -Eo "Owner$user"`)
            queued=${#q1[@]}
            # %k: hour (0..23) padded with a blank instead of a zero, so it
            # is usable as array index
            hour=`date +%k`
            # choose array of total number of jobs to be done
            # according to the day of the week
            # %u: 1 (Monday) .. 7 (Sunday), i.e. the weekend is 6 and 7
            dayofweek=`date +%u`
            # NOTE(review): going by the names, 'pnototalwe' looks like the
            # weekend array, which would mean the two branches are swapped -
            # assignment left as found; confirm against the setup.
            case $dayofweek in
                6 | 7) totalpno=${pnototal[$hour]} ;;
                    *) totalpno=${pnototalwe[$hour]} ;;
            esac
            # get total number of jobs to be submitted
            if [ $queued -gt $totalpno ]
            then
                echo `date +%F\ %T`" more than "$totalpno" jobs waiting ("$queued")" >> $jmscriptlog 2>&1
                sleepawhile "ok"
                continue
            else
                tosubmittotal=`echo "$totalpno - $queued" | bc -l`
            fi
            ;;
        *)  echo "Please give an algorithm to calculate the number of allowed jobs."
            exit
            ;;
    esac
    echo `date +%F\ %T`" Total number of jobs to be submitted: "$tosubmittotal >> $jmscriptlog 2>&1

    # first loop to determine
    # a) how many jobs of this script have to be done
    # b) how many jobs of this script are running or queued
    todo=()
    tododb=()
    for (( i=0 ; i < ${#scripts[@]} ; i++ ))
    do
        # set the step to be evaluated
        step=${scriptscolname[$i]}
        getstepinfo

        # check if the script is restricted to one node
        # (i.e. where output of previous step(s) is stored)
        # this information is taken from the steps.rc file
        # currently this is implemented for sge only
        # then get number of jobs to be done
        if [ "$noderestricted" = "yes" ]
        then
            # get number of next node
            if [ $numevaluated -ge $numrestrictedscripts ]
            then
                currentnode=`echo " $currentnode + 1 " | bc -l`
                numevaluated=1
            else
                numevaluated=`echo " $numevaluated + 1 " | bc -l`
            fi
            if [ $currentnode -gt $maxnode ]
            then
                currentnode=$minnode
            fi
            # check if node is excluded
            for excludednode in ${excludednodes[@]}
            do
                if [ $currentnode -eq $excludednode ]
                then
                    echo `date +%F\ %T`" Node compute-0-$currentnode is currently excluded." >> $jmscriptlog 2>&1
                    # the step is skipped without querying the DB: record
                    # "nothing to do" so that the loops below neither compare
                    # against an unset entry nor submit anything for it
                    todo[$i]=0
                    tododb[$i]=0
                    continue 2
                fi
            done
            # define requirement for submission
            # FIXME: currently only for sge at isdc
            echo `date +%F\ %T`" Checking for node $currentnode. " >> $jmscriptlog 2>&1
            noderequirementsub=" -hard -l hostname=compute-0-${currentnode}"
            noderequirementstat=" -l hostname=compute-0-${currentnode}"
            # get number of jobs to be done from the DB
            getstatus $currentnode >> $jmscriptlog 2>&1
        else
            noderequirementsub=""
            noderequirementstat=""
            # get number of jobs to be done from the DB
            getstatus >> $jmscriptlog 2>&1
        fi
        # store the number of processes to be done for this script
        todo[$i]=$numproc
        tododb[$i]=$numproc

        # FIXME: sge cuts scriptname to 8 digits in qstat
        # number of idle jobs, i.e. jobs waiting in the queue to run
        #  condor: 1
        #  sge: q
        #  pbs: Q
        #q4=( `echo ${q[@]} | egrep -o \("${scripts[$i]}"Jobstatus1\|"${scripts[$i]}"Jobstatusq\|"${scripts[$i]}"JobstatusQ\)` )
        q4=( `echo ${q[@]} | grep -Eo "(${step}Jobstatus1|${step}Jobstatusq|${step}JobstatusQ)"` )
        idlescript[$i]=${#q4[@]}

        #q2=( `echo ${q[@]} | egrep -o "${scripts[$i]}"`)
        q2=( `echo ${q[@]} | grep -Eo "$step"`)
        queuedscript[$i]=${#q2[@]}

        stillfree[$i]=`echo "${maxjobs[$i]} - ${queuedscript[$i]} " | bc -l`

        # nothing to submit for this script if there is no work, if enough
        # jobs are already waiting, or if its job limit is reached
        if [ $numproc -eq 0 ] || [ ${todo[$i]} -le ${idlescript[$i]} ] || [ ${maxjobs[$i]} -le ${queuedscript[$i]} ]
        then
            # store the fraction of cpus to add it to another process
            idleratio=`echo " ${ratio[$i]} + $idleratio " | bc -l`
            ratio[$i]=0
            todo[$i]=0
            # remember one index whose todo is 0 (reference for the comparison below)
            idlenum=$i
            continue
        fi
    done
    echo `date +%F\ %T`" Evaluated scripts: "${scripts[@]} >> $jmscriptlog 2>&1
    echo `date +%F\ %T`" Running scripts: "${queuedscript[@]}" (max: "${maxjobs[@]}")" >> $jmscriptlog 2>&1
    echo `date +%F\ %T`" Number of jobs to be done (from DB): "${tododb[@]} >> $jmscriptlog 2>&1
    echo `date +%F\ %T`" Number of jobs to be done (updated): "${todo[@]} >> $jmscriptlog 2>&1
    echo `date +%F\ %T`" Ratio: "${ratio[@]}" (idle: "$idleratio")" >> $jmscriptlog 2>&1

    # loop to update the ratio taking into account the ratio of
    # a) steps where nothing has to done
    # b) steps where already enough jobs are in the queue
    # sum up this idle ratio
    # determine for which step still most jobs have to be done
    if ! [ "$idleratio" = "0" ]
    then
        addtoscript=
        for (( i=0 ; i < ${#scripts[@]} ; i++ ))
        do
            if [ ${todo[$i]} -gt ${todo[$idlenum]} ] && [ ${todo[$i]} -gt 0 ]
            then
                if ! [ "$addtoscript" = "" ]
                then
                    # keep the candidate found so far if it has more to do
                    if [ ${todo[$i]} -lt ${todo[$addtoscript]} ]
                    then
                        continue
                    fi
                fi
                addtoscript=$i
            fi
        done

        # continue in case nothing has to be done for all steps
        # else: update the ratio for the step where most jobs have to be done
        #       by adding the idle ratio
        if [ "$addtoscript" = "" ]
        then
            echo `date +%F\ %T`" No jobs to be done for any step." >> $jmscriptlog 2>&1
            sleepawhile "ok"
            continue
        else
            ratio[$addtoscript]=`echo " ${ratio[$addtoscript]} + $idleratio " | bc -l`
        fi
    fi
    echo `date +%F\ %T`" Updated ratio: "${ratio[@]} >> $jmscriptlog 2>&1


    # loop to submit jobs to queueing system
    for (( i=0 ; i < ${#scripts[@]} ; i++ ))
    do
        # calculate number of jobs to be submitted
        # (scale=0 together with "/ 1" truncates the product to an integer)
        tosubmit=`echo "scale=0; $tosubmittotal * ${ratio[$i]} / 1 " | bc -l`
        if [ ${todo[$i]} -lt $tosubmit ]
        then
            echo `date +%F\ %T`" Updating tosubmit for "${scripts[$i]}" from "$tosubmit" to "${todo[$i]} >> $jmscriptlog 2>&1
            tosubmit=${todo[$i]}
        fi
        if [ $tosubmit -eq 0 ]
        then
            echo `date +%F\ %T`" No jobs to be submitted for script '"${scripts[$i]}"'" >> $jmscriptlog 2>&1
            continue
        fi
        if [ $tosubmit -gt ${stillfree[$i]} ]
        then
            echo `date +%F\ %T`" Updating tosubmit for "${scripts[$i]}" from "$tosubmit" to "${stillfree[$i]} >> $jmscriptlog 2>&1
            tosubmit=${stillfree[$i]}
        fi

        # set the step to be evaluated
        step=${scriptscolname[$i]}
        # check if walltime has to be set
        if [ "$setwalltime" = "yes" ]
        then
            walltime=${walltimes[$i]}
        fi
        # check if memory has to be set
        if [ "$setpmem" = "yes" ]
        then
            pmem=${pmems[$i]}
        fi

#        submitting individual job now giving night and seqid
#        # set $tosubmit to 1 if something is 'qw', because in this case
#        # only one line for several jobs might be shown
#        if [ ${idlescript[$i]} -gt 0 ] # && [ "$step" == "Callisto" ]
#        then
#            tosubmit=1
#        fi
        #echo "tosubmit: "$tosubmit

        # get todo list
        # (gettodo is defined outside this file; the code below reads $num
        # and the array $primaries afterwards)
        gettodo
        # setting upper limit for loop
        if [ $num -gt $tosubmit ]
        then
            num=$tosubmit
        fi

        for (( s=0 ; s < $num ; s++ ))
        do
            # every job is submitted on its own (queuesubmit uses -t 1-$tosubmit)
            tosubmit=1
            # $primaries holds (night, seqid) pairs one after the other
            night=${primaries[$s+$s]}
            seqid=${primaries[$s+$s+1]}

            echo "submit "${scripts[$i]}" "$night" "$seqid

            # set status in DB
            setstatus "startj"

            # submit $tosubmit scripts to queuing system
            echo `date +%F\ %T`" Submitting ("$s"/"$tosubmit") jobs for script '"${scripts[$i]}"' (night:"$night" seq"$seqid") to "$queuesys >> $jmscriptlog 2>&1
            date=`date +%Y-%m-%d`
            if ! queuesubmit 2>> $jmerrorlog
            then
                echo `date`" WARN submitting job ($queuesys) failed" >> $jmerrorlog
                echo `date +%F\ %T`" WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> $jmscriptlog 2>&1
                printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
                sleepawhile "error"
                # go on with the next entry of the todo list
                continue
            fi
        done
        echo ""
    done
    sleepawhile "ok"
done
343
Note: See TracBrowser for help on using the repository browser.