source: trunk/Mars/datacenter/scripts/jobmanager@ 10013

Last change on this file since 10013 was 10004, checked in by Daniela Dorner, 14 years ago
implemented usage of cluster in Dortmund (phido), i.e. usage of pbs
  • Property svn:executable set to *
File size: 10.2 KB
Line 
1#!/bin/sh
2#
3# ========================================================================
4#
5# *
6# * This file is part of MARS, the MAGIC Analysis and Reconstruction
7# * Software. It is distributed to you in the hope that it can be a useful
8# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
9# * It is distributed WITHOUT ANY WARRANTY.
10# *
11# * Permission to use, copy, modify and distribute this software and its
12# * documentation for any purpose is hereby granted without fee,
13# * provided that the above copyright notice appear in all copies and
14# * that both that copyright notice and this permission notice appear
15# * in supporting documentation. It is provided "as is" without express
16# * or implied warranty.
17# *
18#
19#
20# Author(s): Daniela Dorner 05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
21#
22# Copyright: MAGIC Software Development, 2000-2010
23#
24#
25# ========================================================================
26#
27# This is a script which launches other scripts (all scripts that are run
28# on primary basis)
29#
30
# Pull in the shared setup that lives next to this script; helpers used
# below (e.g. printprocesslog) are presumably defined there -- verify
# against sourcefile.
source $(dirname $0)/sourcefile
printprocesslog "INFO starting $0"

# noclobber: a plain '>' redirection refuses to overwrite an existing file
# ('>>' appends are unaffected).
set -o noclobber
35
# Log a "sleeping" notice, sleep, and go on to the next script.
#
# Arguments:
#   $1 - name of the variable the sleep time was taken from (only used in
#        the log message, printed as "$<name>")
#   $2 - number of seconds to sleep
# Globals:
#   jmscriptlog (read) - log file the messages are appended to
#
# The final 'continue' is meant to advance the loop of the *caller*. That
# only works in older bash releases; bash 4.4 and later confine 'continue'
# to loops inside the function itself, so there it does nothing. Callers
# that depend on skipping the rest of their iteration should therefore
# issue their own 'continue' right after calling this function.
nextscript()
{
  # quoted so that a log path containing blanks does not break the redirect
  echo "$(date '+%F %T') sleeping \$$1 = $2 seconds... " >> "$jmscriptlog" 2>&1
  sleep "$2"
  echo "" >> "$jmscriptlog" 2>&1
  continue
}
44
# Start a new paragraph in the jobmanager log and write the first half of
# the start-up line; the selected queueing system is appended to the same
# line by the case statement below.
echo "" >> "$jmscriptlog" 2>&1
echo "" >> "$jmscriptlog" 2>&1
printf '%s' "$(date '+%F %T') starting jobmanager for setup $AUTOMATIONSETUP" >> "$jmscriptlog" 2>&1

# choose commands according to queueing system (defined in setup)
#
# Two aliases are defined per queueing system:
#   queuesubmit - submits ${scripts[$i]} to the queue
#   checkqueue  - lists the queue as words of the form
#                 "Owner<user> <script>Jobstatus<state>"
# The single-quoted alias texts are expanded only when the alias is used,
# so they see the values of $i, $date, $runlogpath, ... at that moment.
# NOTE(review): aliases are only expanded in a non-interactive bash when it
# runs in POSIX mode (e.g. invoked as sh) or when expand_aliases is set --
# confirm which of the two this setup relies on before changing the shebang.
#
# The case arms end with ';;' only: a 'break' here would not be inside any
# loop and so would have nothing to leave.
case "$queuesys" in
  sge)
    echo " on queuing system 'sun grid engine'" >> "$jmscriptlog" 2>&1
    # (-hard) -l hostname=compute-*
    # for qstat this returns the jobs running on that node + all jobs in the queue
    alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
#   alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
#   alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
    # FIXME: get complete scriptname (including command line option), needed for runstereo
    alias 'checkqueue'="/opt/gridengine/bin/lx26-amd64/qstat \`echo \$noderequirementstat\` | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '"
    ;;
  pbs)
    echo " on queuing system 'pbs'" >> "$jmscriptlog" 2>&1
    alias 'queuesubmit'='$pbspath/qsub -l walltime=$walltime -l pmem=$pmem -v AUTOMATIONSETUP=$AUTOMATIONSETUP,SOURCEFILEPATH=$SOURCEFILEPATH -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
    # double-quoted: $pbspath is expanded here, at definition time
    alias 'checkqueue'="$pbspath/qstat -a | awk ' { print \"Owner\"\$2\" \" \$4\"Jobstatus\"\$10 } '"
    ;;
  condor)
    echo " on queuing system 'condor'" >> "$jmscriptlog" 2>&1
    alias 'queuesubmit'='$condorpath/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` -a automationsetup=$AUTOMATIONSETUP `echo $scriptspath`/run.condor'
    alias 'checkqueue'='$condorpath/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus '
    ;;
  *)
    # unknown queueing system: terminate the log line and call finish
    # (defined outside this file; presumably it ends the script -- verify)
    echo "" >> "$jmscriptlog" 2>&1
    finish >> "$jmscriptlog" 2>&1
    ;;
esac

echo "" >> "$jmscriptlog" 2>&1
77
# State that is carried from one pass of the scheduling loop to the next.

# account whose jobs are counted in the queue listing
user=$(whoami)

# node bookkeeping for scripts that are restricted to a single node
currentnode=$minnode
numevaluated=0

# prev is compared against 0 in the loop ("nothing to do for the previous
# script"); it starts out at $max
prev=$max

# back-off bookkeeping for "nothing to do" and for failed submissions
nothingtodocount=0
nothingtodosleeptime=0
errorsleeptime=$errorsleeptimedefault

# endless loop: the loop below runs while notcount < 100
notcount=0
# Main scheduling loop.
#
# For every entry of the scripts array, over and over again:
#   1. source the setup again and ask getstepinfo/getstatus for the state
#      of the step (they are defined outside this file; from the way their
#      results are used they presumably provide $numproc, $db and
#      $noderestricted -- verify)
#   2. if nothing is to be done, back off and go to the next script
#   3. count the jobs of $user and of this script in the queue (checkqueue)
#   4. compare with the limits valid for the current hour / day of week
#   5. if there is room and work left, submit exactly one job (queuesubmit)
#
# After every call of nextscript that is meant to end the current
# iteration an explicit 'continue' follows: bash 4.4 and later do not let a
# 'continue' executed inside a function affect the loop of the caller, so
# without it the iteration would carry on (and e.g. submit a job although
# the queue is already full).
#
# The names i, date, walltime, pmem, noderequirementsub and
# noderequirementstat are referenced from the queuesubmit/checkqueue alias
# texts and must keep these names.
while (( $notcount < 100 ))
do
  for (( i=0 ; i < ${#scripts[@]} ; i++ ))
  do
    source "$(dirname "$0")"/sourcefile
    echo "$(date '+%F %T') Evaluating processing status for script '${scripts[$i]}'" >> "$jmscriptlog" 2>&1

    # check if there's something to do
    step=${scriptscolname[$i]}
    getstepinfo
    # check if walltime has to be set
    if [ "$setwalltime" = "yes" ]
    then
      walltime=${walltimes[$i]}
    fi
    # check if memory has to be set
    if [ "$setpmem" = "yes" ]
    then
      pmem=${pmems[$i]}
    fi
    # check if the script is restricted to one node
    # (i.e. where output of previous step(s) is stored)
    # this information is taken from the steps.rc file
    if [ "$noderestricted" = "yes" ]
    then
      # get number of next node: move on to the next node once
      # $numrestrictedscripts scripts have been evaluated for the current one
      if [ "$numevaluated" -ge "$numrestrictedscripts" ]
      then
        currentnode=$(( currentnode + 1 ))
        numevaluated=1
      else
        numevaluated=$(( numevaluated + 1 ))
      fi
      # wrap around after the last node
      if [ "$currentnode" -gt "$maxnode" ]
      then
        currentnode=$minnode
      fi
      # check if node is excluded
      # (expansion left unquoted on purpose so that empty entries are dropped)
      for excludednode in ${excludednodes[@]}
      do
        if [ "$currentnode" -eq "$excludednode" ]
        then
          echo "$(date '+%F %T') Node compute-0-$currentnode is currently excluded." >> "$jmscriptlog" 2>&1
          # leave this loop and go on with the next script
          continue 2
        fi
      done
      # define requirement for submission
      # FIXME: currently only for sge at isdc
      echo "$(date '+%F %T') Checking for node $currentnode. " >> "$jmscriptlog" 2>&1
      noderequirementsub=" -hard -l hostname=compute-0-${currentnode}"
      noderequirementstat=" -l hostname=compute-0-${currentnode}"
      getstatus "$currentnode" >> "$jmscriptlog" 2>&1
    else
      noderequirementsub=""
      noderequirementstat=""
      getstatus >> "$jmscriptlog" 2>&1
    fi

    # check number of processes to be done
    echo "$(date '+%F %T') Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/step $db/$step]" >> "$jmscriptlog" 2>&1
    if [ "$numproc" = "0" ]
    then
      prev=0
      nothingtodocount=$(( nothingtodocount + 1 ))
      if [ "$nothingtodocount" -lt "${#scripts[@]}" ]
      then
        # not all scripts have been found idle yet: go on without sleeping
        nextscript 0 0
      else
        # every script was idle: sleep, increasing the sleep time until
        # $sleeptimelimit is reached
        if [ "$nothingtodosleeptime" -lt "$sleeptimelimit" ]
        then
          nothingtodosleeptime=$(( nothingtodocount * sleeptime ))
        fi
        nextscript nothingtodosleeptime "$nothingtodosleeptime"
      fi
      continue
    else
      nothingtodocount=0
      nothingtodosleeptime=0
    fi

    # get processes in queue
    q=( $(checkqueue 2>&1) )
    # NOTE(review): only the first word of the queue listing is inspected
    # here (as before); confirm whether the whole output should be checked.
    if echo "${q[0]}" | grep -E '(Error|failed)'
    then
      echo "$(date '+%F %T') WARN checking query ($queuesys) failed" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN checking query ($queuesys) failed"
      echo "$(date '+%F %T') WARN checking query ($queuesys) failed" >> "$jmerrorlog"
      nextscript sleeptime "$sleeptime"
      continue
    fi
    # FIXME: sge cuts scriptname to 8 digits in qstat
    # get processes of user in queue
    q1=( $(echo "${q[@]}" | grep -E -o "Owner$user") )
    queued=${#q1[@]}
    # get scripts in queue
    q2=( $(echo "${q[@]}" | grep -E -o "${scripts[$i]}") )
    queuedscript=${#q2[@]}
    # get running scripts
    # condor: 2
    # sge: r
    # pbs: R
    q3=( $(echo "${q[@]}" | grep -E -o "(${scripts[$i]}Jobstatus2|${scripts[$i]}Jobstatusr|${scripts[$i]}JobstatusR)") )
    runningscript=${#q3[@]}
    stillinqueue=$(( queuedscript - runningscript ))

    # get total number of allowed process for current time
    # (%k prints the hour without leading zero but padded with a blank)
    hour=$(date +%k)
    hour=${hour# }
    if [ "${pnototal[$hour]}" -lt "$totalmax" ]
    then
      totalpno=${pnototal[$hour]}
    else
      totalpno=$totalmax
    fi

    # choose array according to the day of the week
    # (%u counts from 1 = Monday to 7 = Sunday, so the weekend is 6 and 7)
    dayofweek=$(date +%u)
    case "$dayofweek" in
      6 | 7) pnos=( ${pnoswe[@]} ) ;;
      *)     pnos=( ${pnosweek[@]} ) ;;
    esac
    # get number of allowed scripts for current time:
    # pnos holds 24 hourly values per script, one script after the other
    num=$(( i * 24 + hour ))
    pnoscript=${pnos[$num]}
    # if there was nothing to do for previous script, more scripts can be allowed
    if [ "$prev" -eq 0 ]
    then
      echo "$(date '+%F %T') \$prev=0 => resetting \$pnoscript from $pnoscript to $max [\$max]" >> "$jmscriptlog" 2>&1
      pnoscript=$max
    fi
    echo "$(date '+%F %T') queue for user '$user': total: $queued queued jobs [allowed \$totalpno = $totalpno]" >> "$jmscriptlog" 2>&1
    echo "$(date '+%F %T') queue for user '$user': ${scripts[$i]}: $queuedscript queued, $runningscript running, $stillinqueue idle [allowed \$pnoscript = $pnoscript]" >> "$jmscriptlog" 2>&1

    # continue if there are already enough processes or scripts in the queue
    if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
    then
      nextscript sleeptime "$sleeptime"
      continue
    fi
    # continue if the number of scripts in the queue is larger than (or equal to) the number which still has to be done
    if [ "$numproc" -le "$stillinqueue" ]
    then
      echo "$(date '+%F %T') \$numproc ($numproc) <= \$stillinqueue ($stillinqueue) " >> "$jmscriptlog" 2>&1
      nextscript sleeptime "$sleeptime"
      continue
    fi

    # reset prev
    prev=$max

    # submit 1 script to queuing system
    # ($date is used by the queuesubmit alias for the names of the log files)
    date=$(date +%Y-%m-%d)
    echo "$(date '+%F %T') committing 1 ${scripts[$i]} to $queuesys" >> "$jmscriptlog" 2>&1
    if ! queuesubmit 2>> "$jmerrorlog"
    then
      echo $(date)" WARN submitting job ($queuesys) failed" >> "$jmerrorlog"
      echo "$(date '+%F %T') WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
      # increase the sleep time with every failure until the limit is reached
      if [ "$errorsleeptime" -lt "$sleeptimelimit" ]
      then
        errorsleeptime=$(( errorsleeptime + errorsleeptimedefault ))
      fi
      nextscript errorsleeptime "$errorsleeptime"
      continue
    else
      errorsleeptime=$errorsleeptimedefault
    fi
    nextscript sleeptime "$sleeptime"
  done
done
251
Note: See TracBrowser for help on using the repository browser.