source: trunk/Mars/datacenter/scripts/jobmanager@ 10018

Last change on this file since 10018 was 10014, checked in by Daniela Dorner, 15 years ago
  • Property svn:executable set to *
File size: 10.3 KB
Line 
1#!/bin/sh
2#
3# ========================================================================
4#
5# *
6# * This file is part of MARS, the MAGIC Analysis and Reconstruction
7# * Software. It is distributed to you in the hope that it can be a useful
8# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
9# * It is distributed WITHOUT ANY WARRANTY.
10# *
11# * Permission to use, copy, modify and distribute this software and its
12# * documentation for any purpose is hereby granted without fee,
13# * provided that the above copyright notice appear in all copies and
14# * that both that copyright notice and this permission notice appear
15# * in supporting documentation. It is provided "as is" without express
16# * or implied warranty.
17# *
18#
19#
20# Author(s): Daniela Dorner 05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
21#
22# Copyright: MAGIC Software Development, 2000-2010
23#
24#
25# ========================================================================
26#
27# This is a script which launches other scripts (all scripts that are run
28# on a primary basis)
29#
30
# Load the shared setup from the file 'sourcefile' located next to this
# script (presumably it provides the configuration variables and helpers
# such as printprocesslog that are used below -- not visible from here).
# The path is quoted so that a script location containing whitespace works.
source "$(dirname -- "$0")/sourcefile"
printprocesslog "INFO starting $0"

# noclobber: '>' must not overwrite existing files; the logs written by this
# script are only ever appended to with '>>'
set -C
35
36# function to continue in loop and go to next script
# Write a "sleeping" notice to $jmscriptlog, sleep, and ask the calling loop
# to go on with its next iteration.
#   $1  name of the variable the sleep time was taken from (only printed)
#   $2  number of seconds to sleep
# Note: the final 'continue' is aimed at the loop of the caller. Whether it
# reaches that loop depends on the shell: bash 4.4 and later confine
# break/continue to the function they are executed in.
nextscript()
{
   local stamp
   stamp=$(date '+%F %T')
   printf '%s sleeping $%s = %s seconds... \n' "$stamp" "$1" "$2" >> $jmscriptlog 2>&1
   sleep $2
   echo >> $jmscriptlog 2>&1
   continue
}
44
echo "" >> "$jmscriptlog" 2>&1
echo "" >> "$jmscriptlog" 2>&1
# no trailing newline: the case statement at the end of this section
# completes the line with the name of the queueing system
printf '%s starting jobmanager for setup %s' "$(date '+%F %T')" "$AUTOMATIONSETUP" >> "$jmscriptlog" 2>&1

user=$(whoami)

# The commands for the queueing system (defined in the setup via $queuesys)
# are shell functions rather than aliases: functions do not depend on alias
# expansion being enabled, and a redirection given by the caller
# (e.g. 'checkqueue 2>&1') applies to the whole pipeline inside.

# Submit script ${scripts[$i]} to the queueing system $queuesys.
# Reads at call time: AUTOMATIONSETUP, runlogpath, date, scriptspath,
# noderequirementsub, scripts, i (plus walltime, pmem, SOURCEFILEPATH and
# pbspath for pbs, condorpath for condor).
# Returns the exit status of the submit command (1 for an unknown $queuesys).
queuesubmit()
{
   # $noderequirementsub and ${scripts[$i]} are deliberately left unquoted:
   # they may hold several words which have to become separate arguments
   # FIXME: get complete scriptname (including command line option), needed for runstereo
   case "$queuesys" in
      sge)
         /opt/gridengine/bin/lx26-amd64/qsub -b y \
            -v AUTOMATIONSETUP="$AUTOMATIONSETUP" \
            -e "$runlogpath/error-$date.log" \
            -o "$runlogpath/log-$date.log" \
            $noderequirementsub \
            "$scriptspath"/${scripts[$i]}
         ;;
      pbs)
         "$pbspath/qsub" -l walltime="$walltime" -l pmem="$pmem" \
            -v AUTOMATIONSETUP="$AUTOMATIONSETUP",SOURCEFILEPATH="$SOURCEFILEPATH" \
            -e "$runlogpath/error-$date.log" \
            -o "$runlogpath/log-$date.log" \
            $noderequirementsub \
            "$scriptspath"/${scripts[$i]}
         ;;
      condor)
         "$condorpath/condor_submit" \
            -a path="$scriptspath" \
            -a prog=${scripts[$i]} \
            -a date="$date" \
            -a dir="$runlogpath" \
            -a automationsetup="$AUTOMATIONSETUP" \
            "$scriptspath/run.condor"
         ;;
      *)
         echo "queuesubmit: unknown queueing system '$queuesys'" >&2
         return 1
         ;;
   esac
}

# List the jobs known to the queueing system, one job per line, in the form
#   Owner<user> <jobname>Jobstatus<status>
# (condor prints the job command directly behind the owner field).
# Returns the exit status of the last command of the pipeline
# (1 for an unknown $queuesys).
checkqueue()
{
   case "$queuesys" in
      sge)
         # (-hard) -l hostname=compute-*
         # for qstat this returns the jobs running on that node + all jobs in the queue
         /opt/gridengine/bin/lx26-amd64/qstat $noderequirementstat \
            | awk ' { print "Owner"$4" " $3"Jobstatus"$5 } '
         ;;
      pbs)
         # check queue (restricted to current user only)
         "$pbspath/qstat" -a -u "$user" \
            | awk ' { print "Owner"$2" " $4"Jobstatus"$10 } '
         ;;
      condor)
         "$condorpath/condor_q" -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus
         ;;
      *)
         echo "checkqueue: unknown queueing system '$queuesys'" >&2
         return 1
         ;;
   esac
}

# complete the start-up log line; give up for an unknown queueing system
case "$queuesys" in
   sge)
      echo " on queuing system 'sun grid engine'" >> "$jmscriptlog" 2>&1
      ;;
   pbs)
      echo " on queuing system 'pbs'" >> "$jmscriptlog" 2>&1
      ;;
   condor)
      echo " on queuing system 'condor'" >> "$jmscriptlog" 2>&1
      ;;
   *)
      echo "" >> "$jmscriptlog" 2>&1
      finish >> "$jmscriptlog" 2>&1
      ;;
esac
78
echo "" >> "$jmscriptlog" 2>&1

# Append one time-stamped line to the jobmanager log ($jmscriptlog).
#   $1  message text
logjm()
{
   echo "$(date '+%F %T') $1" >> "$jmscriptlog" 2>&1
}

# state kept across iterations
prev=$max                  # 0 if there was nothing to do for the previous script
currentnode=$minnode       # node currently used for node-restricted scripts
numevaluated=0             # number of restricted scripts evaluated for $currentnode
notcount=0
nothingtodocount=0         # consecutive evaluations with nothing to do
nothingtodosleeptime=0
errorsleeptime=$errorsleeptimedefault

# endless loop ($notcount is never changed, so the condition always holds)
#
# Each call of nextscript is followed by an explicit 'continue': bash 4.4 and
# later do not let a 'continue' executed inside a function affect the loop of
# the caller, so the function alone cannot skip the rest of the iteration.
while (( notcount < 100 ))
do
   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
   do
      # re-read the setup in every iteration
      source "$(dirname -- "$0")/sourcefile"
      logjm "Evaluating processing status for script '${scripts[$i]}'"

      # check if there's something to do
      step=${scriptscolname[$i]}
      getstepinfo
      # check if walltime has to be set
      if [ "$setwalltime" = "yes" ]
      then
         walltime=${walltimes[$i]}
      fi
      # check if memory has to be set
      if [ "$setpmem" = "yes" ]
      then
         pmem=${pmems[$i]}
      fi
      # check if the script is restricted to one node
      # (i.e. where output of previous step(s) is stored)
      # this information is taken from the steps.rc file
      if [ "$noderestricted" = "yes" ]
      then
         # get number of next node
         if [ $numevaluated -ge $numrestrictedscripts ]
         then
            # bc is kept here because $currentnode originates from the setup
            currentnode=$(echo $currentnode + 1 | bc -l)
            numevaluated=1
         else
            numevaluated=$(( numevaluated + 1 ))
         fi
         if [ $currentnode -gt $maxnode ]
         then
            currentnode=$minnode
         fi
         # check if node is excluded
         for excludednode in "${excludednodes[@]}"
         do
            if [ $currentnode -eq $excludednode ]
            then
               logjm "Node compute-0-$currentnode is currently excluded."
               # go on with the next script
               continue 2
            fi
         done
         # define requirement for submission
         # FIXME: currently only for sge at isdc
         logjm "Checking for node $currentnode. "
         noderequirementsub=" -hard -l hostname=compute-0-${currentnode}"
         noderequirementstat=" -l hostname=compute-0-${currentnode}"
         getstatus $currentnode >> "$jmscriptlog" 2>&1
      else
         noderequirementsub=""
         noderequirementstat=""
         getstatus >> "$jmscriptlog" 2>&1
      fi

      # check number of processes to be done
      logjm "Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/step $db/$step]"
      if [ "$numproc" = "0" ]
      then
         prev=0
         nothingtodocount=$(( nothingtodocount + 1 ))
         if [ $nothingtodocount -lt ${#scripts[@]} ]
         then
            # not all scripts have been found idle yet: try the next one at once
            nextscript 0 0
            continue
         fi
         # nothing to do for any script: sleep, the longer the more often
         # this happened (until the sleep time reached $sleeptimelimit)
         if [ $nothingtodosleeptime -lt $sleeptimelimit ]
         then
            nothingtodosleeptime=$(echo " $nothingtodocount * $sleeptime " | bc)
         fi
         nextscript nothingtodosleeptime $nothingtodosleeptime
         continue
      fi
      nothingtodocount=0
      nothingtodosleeptime=0

      # get processes in queue
      q=( $(checkqueue 2>&1) )
      # look at the complete output (not only its first word) for errors
      if echo "${q[*]}" | grep -Eq '(Error|failed)'
      then
         logjm "WARN checking query ($queuesys) failed"
         printprocesslog "WARN checking query ($queuesys) failed"
         echo "$(date '+%F %T') WARN checking query ($queuesys) failed" >> "$jmerrorlog"
         nextscript sleeptime $sleeptime
         continue
      fi
      # FIXME: sge cuts scriptname to 8 digits in qstat
      # get processes of user in queue
      q1=( $(echo "${q[*]}" | grep -Eo "Owner$user") )
      queued=${#q1[@]}
      # get number of scripts in queue
      q2=( $(echo "${q[*]}" | grep -Eo "${scripts[$i]}") )
      queuedscript=${#q2[@]}
      # get running scripts
      # condor: 2
      # sge:    r
      # pbs:    R
      q3=( $(echo "${q[*]}" | grep -Eo "${scripts[$i]}Jobstatus(2|r|R)") )
      runningscript=${#q3[@]}
      stillinqueue=$(( queuedscript - runningscript ))

      # get total number of allowed process for current time
      hour=$(date +%k)
      if [ ${pnototal[$hour]} -lt $totalmax ]
      then
         totalpno=${pnototal[$hour]}
      else
         totalpno=$totalmax
      fi

      # choose array according to the day of the week
      # (date +%u: 1 = Monday ... 6 = Saturday, 7 = Sunday)
      dayofweek=$(date +%u)
      case $dayofweek in
         # deliberately unquoted: the values are re-split into single numbers
         6 | 7) pnos=( ${pnoswe[@]} ) ;;
             *) pnos=( ${pnosweek[@]} ) ;;
      esac
      # get number of allowed scripts for current time: 24 values (one per
      # hour) per script, i.e. ((i+1)*24 + (hour+1)) - 24 - 1 = i*24 + hour
      num=$(( i * 24 + $hour ))
      pnoscript=${pnos[$num]}
      # if there was nothing to do for previous script, more scripts can be allowed
      if [ $prev -eq 0 ]
      then
         logjm "\$prev=0 => resetting \$pnoscript from $pnoscript to $max [\$max]"
         pnoscript=$max
      fi
      logjm "queue for user '$user': total: $queued queued jobs [allowed \$totalpno = $totalpno]"
      logjm "queue for user '$user': ${scripts[$i]}: $queuedscript queued, $runningscript running, $stillinqueue idle [allowed \$pnoscript = $pnoscript]"

      # continue if there are already enough processes or scripts in the queue
      if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
      then
         nextscript sleeptime $sleeptime
         continue
      fi
      # continue if the number of scripts in the queue is larger than (or
      # equal to) the number which still has to be done
      if [ $numproc -le $stillinqueue ]
      then
         logjm "\$numproc ($numproc) <= \$stillinqueue ($stillinqueue) "
         nextscript sleeptime $sleeptime
         continue
      fi

      # reset prev
      prev=$max

      # submit 1 script to queuing system
      date=$(date +%Y-%m-%d)
      logjm "committing 1 ${scripts[$i]} to $queuesys"
      if ! queuesubmit 2>> "$jmerrorlog"
      then
         echo $(date)" WARN submitting job ($queuesys) failed" >> "$jmerrorlog"
         logjm "WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]"
         printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
         # back off: sleep longer after every failed submission (up to the limit)
         if [ $errorsleeptime -lt $sleeptimelimit ]
         then
            errorsleeptime=$(echo " $errorsleeptime + $errorsleeptimedefault " | bc)
         fi
         nextscript errorsleeptime $errorsleeptime
         continue
      fi
      errorsleeptime=$errorsleeptimedefault
      nextscript sleeptime $sleeptime
   done
done
253
Note: See TracBrowser for help on using the repository browser.