source: trunk/MagicSoft/Mars/datacenter/scripts/jobmanager@ 9504

Last change on this file since 9504 was 9493, checked in by Daniela Dorner, 15 years ago
*** empty log message ***
  • Property svn:executable set to *
File size: 9.2 KB
Line 
1#!/bin/sh
2#
3# ========================================================================
4#
5# *
6# * This file is part of MARS, the MAGIC Analysis and Reconstruction
7# * Software. It is distributed to you in the hope that it can be a useful
8# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
9# * It is distributed WITHOUT ANY WARRANTY.
10# *
11# * Permission to use, copy, modify and distribute this software and its
12# * documentation for any purpose is hereby granted without fee,
13# * provided that the above copyright notice appear in all copies and
14# * that both that copyright notice and this permission notice appear
15# * in supporting documentation. It is provided "as is" without express
16# * or implied warranty.
17# *
18#
19#
20# Author(s): Daniela Dorner 05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
21#
22# Copyright: MAGIC Software Development, 2000-2009
23#
24#
25# ========================================================================
26#
27# This is a script which launches other scripts (all scripts that are run
28# on primary basis).
29#
30
# Load the setup file that lives next to this script. printprocesslog (used
# right below) is not defined in this file, so it is presumably provided by
# that file. The path is quoted so that a script location containing
# whitespace does not break the lookup.
source "$(dirname "$0")/sourcefile"
printprocesslog "INFO starting $0"

# noclobber: a plain '>' redirection must not truncate an existing file;
# the log writes in this script append with '>>'.
set -C
35
# Log a pause, sleep, and go on to the next script of the calling loop.
#
# Arguments:
#   $1 - name of the variable that holds the delay (only used in the log line)
#   $2 - number of seconds to sleep
# Globals:
#   jmscriptlog (read) - log file the messages are appended to
function nextscript()
{
  # report the delay that is really slept ($2), not the global $sleeptime
  echo "$(date '+%F %T') sleeping \$$1 = $2 seconds... " >> "$jmscriptlog" 2>&1
  sleep "$2"
  echo "" >> "$jmscriptlog" 2>&1
  # NOTE: bash >= 4.4 resets the loop state when a function is entered, so
  # this 'continue' only reaches the caller's loop on older versions; callers
  # that must skip the rest of their loop body should issue their own
  # 'continue' after calling this function.
  continue
}
44
# Start a new section in the jobmanager log. The last line is written without
# a newline; it is completed with the name of the queueing system by the case
# statement below.
echo "" >> "$jmscriptlog" 2>&1
echo "" >> "$jmscriptlog" 2>&1
echo -n "$(date '+%F %T') starting jobmanager for setup $AUTOMATIONSETUP" >> "$jmscriptlog" 2>&1

# choose commands according to queueing system (defined in setup)
#
# Two commands are provided for the rest of the script:
#   queuesubmit - submit ${scripts[$i]} to the queueing system
#   checkqueue  - print the queue as "Owner<user> <job>Jobstatus<state>" records
# They are shell functions rather than aliases: aliases are only expanded by a
# non-interactive bash in POSIX mode or with 'shopt -s expand_aliases', whereas
# functions always work. All variables are read when the function is called,
# so the current values of $i, $date, $noderequirementsub, ... are used.
case $queuesys in
  sge)
    echo " on queuing system 'sun grid engine'" >> "$jmscriptlog" 2>&1
    # (-hard) -l hostname=compute-*
    # for qstat this returns the jobs running on that node + all jobs in the queue
    # FIXME: get complete scriptname (including command line option), needed for runstereo
    #
    # former alternatives, kept for reference:
#   alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
#   alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
    queuesubmit()
    {
      # $noderequirementsub holds several options and ${scripts[$i]} is left
      # unquoted on purpose: both were word-split by the former alias as well.
      /opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP="$AUTOMATIONSETUP" \
        -e "$runlogpath/error-$date.log" -o "$runlogpath/log-$date.log" \
        $noderequirementsub $scriptspath/${scripts[$i]} "$@"
    }
    checkqueue()
    {
      # stderr of qstat belongs to the function's output, so 'checkqueue 2>&1'
      # captures it (with the former alias the redirection only reached awk)
      /opt/gridengine/bin/lx26-amd64/qstat $noderequirementstat \
        | awk ' { print "Owner"$4" " $3"Jobstatus"$5 } '
    }
    ;;
  condor)
    echo " on queuing system 'condor'" >> "$jmscriptlog" 2>&1
    queuesubmit()
    {
      # ${scripts[$i]} is left unquoted on purpose (same word splitting as the
      # former alias)
      /usr/local/bin/condor_submit -a "path=$scriptspath" -a prog=${scripts[$i]} \
        -a "date=$date" -a "dir=$runlogpath" -a "automationsetup=$AUTOMATIONSETUP" \
        "$scriptspath/run.condor" "$@"
    }
    checkqueue()
    {
      /usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus
    }
    ;;
  *)
    # unknown queueing system: terminate the started log line and call finish
    # (defined outside this file; presumably it ends the script - TODO confirm)
    echo "" >> "$jmscriptlog" 2>&1
    finish >> "$jmscriptlog" 2>&1
    ;;
esac
70
echo "" >> "$jmscriptlog" 2>&1

# state kept across iterations of the main loop
prev=$max                 # set to 0 when the previously evaluated script had nothing to do
user=$(whoami)
currentnode=$minnode      # node currently looked at (node-restricted mode only)
numevaluated=0            # scripts evaluated for $currentnode so far
notcount=0
nothingtodocount=0        # consecutive scripts for which nothing was to be done
nothingtodosleeptime=0
errorsleeptime=$errorsleeptimedefault

# endless loop ($notcount is never changed)
# Each pass evaluates every entry of ${scripts[@]} once and submits at most
# one job per script, if the database reports work and the queue limits allow.
#
# nextscript logs and sleeps. Every call that must skip the rest of the loop
# body is followed by an explicit 'continue': bash >= 4.4 no longer lets a
# 'continue' executed inside a function act on the caller's loop.
while (( notcount < 100 ))
do
  for (( i=0 ; i < ${#scripts[@]} ; i++ ))
  do
    # the setup is re-read for every script
    source "$(dirname "$0")/sourcefile"
    echo "$(date '+%F %T') Evaluating processing status for script '${scripts[$i]}'" >> "$jmscriptlog" 2>&1

    # check if there's something to do
    column=${scriptscolname[$i]}
    getstepinfo
    if [ "$noderestricted" = "yes" ]
    then
      # get number of next node
      if [ $numevaluated -ge $numrestrictedscripts ]
      then
        currentnode=$(( currentnode + 1 ))
        numevaluated=1
      else
        numevaluated=$(( numevaluated + 1 ))
      fi
      if [ $currentnode -gt $maxnode ]
      then
        currentnode=$minnode
      fi
      # check if node is excluded
      for excludednode in ${excludednodes[@]}
      do
        if [ $currentnode -eq $excludednode ]
        then
          echo "$(date '+%F %T') Node compute-0-$currentnode is currently excluded." >> "$jmscriptlog" 2>&1
          # go on with the next script (outer loop)
          continue 2
        fi
      done
      # define requirement for submission
      # FIXME: currently only for sge at isdc
      echo "$(date '+%F %T') Checking for node $currentnode. " >> "$jmscriptlog" 2>&1
      noderequirementsub=" -hard -l hostname=compute-0-${currentnode}"
      noderequirementstat=" -l hostname=compute-0-${currentnode}"
      getstatus $currentnode >> "$jmscriptlog" 2>&1
    else
      noderequirementsub=""
      noderequirementstat=""
      getstatus >> "$jmscriptlog" 2>&1
    fi

    # check number of processes to be done
    echo "$(date '+%F %T') Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/table/column $db/$table/$column]" >> "$jmscriptlog" 2>&1
    if [ "$numproc" = "" ]
    then
      prev=0
      nothingtodocount=$(( nothingtodocount + 1 ))
      if [ $nothingtodocount -lt ${#scripts[@]} ]
      then
        # not all scripts have been found idle yet: go on without sleeping
        nextscript 0 0
      else
        # nothing to do for any script: sleep longer and longer, up to the limit
        if [ $nothingtodosleeptime -lt $sleeptimelimit ]
        then
          nothingtodosleeptime=$(( nothingtodocount * sleeptime ))
        fi
        nextscript nothingtodosleeptime $nothingtodosleeptime
      fi
      continue
    else
      nothingtodocount=0
      nothingtodosleeptime=0
    fi

    # get processes in queue
    q=( $(checkqueue 2>&1) )
    # NOTE(review): as before, only the first word of the queue output is
    # inspected (the original used $q, i.e. ${q[0]}); a match is echoed on
    # stdout. Confirm whether the whole output should be searched.
    if echo "${q[0]}" | grep -E '(Error|failed)'
    then
      echo "$(date '+%F %T') WARN checking query ($queuesys) failed" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN checking query ($queuesys) failed"
      echo "$(date '+%F %T') WARN checking query ($queuesys) failed" >> "$jmerrorlog"
      nextscript sleeptime $sleeptime
      continue
    fi
    # FIXME: sge cuts scriptname to 8 digits in qstat
    # get processes of user in queue
    q1=( $(echo "${q[@]}" | grep -E -o "Owner$user") )
    queued=${#q1[@]}
    # get scripts in queue
    q2=( $(echo "${q[@]}" | grep -E -o "${scripts[$i]}") )
    queuedscript=${#q2[@]}
    # get running scripts (job status '2' or 'r')
    q3=( $(echo "${q[@]}" | grep -E -o "(${scripts[$i]}Jobstatus2|${scripts[$i]}Jobstatusr)") )
    runningscript=${#q3[@]}
    stillinqueue=$(( queuedscript - runningscript ))

    #get total number of allowed process for current time
    hour=$(date +%k)
    if [ ${pnototal[$hour]} -lt $totalmax ]
    then
      totalpno=${pnototal[$hour]}
    else
      totalpno=$totalmax
    fi

    #choose array according to the day of the week
    # %u counts Monday=1 ... Sunday=7, so the weekend is 6 and 7
    dayofweek=$(date +%u)
    case $dayofweek in
      6 | 7) pnos=( ${pnoswe[@]} ) ;;
          *) pnos=( ${pnosweek[@]} ) ;;
    esac
    # get number of allowed scripts for current time
    # (index = script number * 24 + hour)
    num=$(( i * 24 + hour ))
    pnoscript=${pnos[$num]}
    # if there was nothing to do for previous script, more scripts can be allowed
    if [ $prev -eq 0 ]
    then
      echo "$(date '+%F %T') \$prev=0 => resetting \$pnoscript from $pnoscript to $max [\$max]" >> "$jmscriptlog" 2>&1
      pnoscript=$max
    fi
    echo "$(date '+%F %T') queue for user '$user': total: $queued queued jobs [allowed \$totalpno = $totalpno]" >> "$jmscriptlog" 2>&1
    echo "$(date '+%F %T') queue for user '$user': ${scripts[$i]}: $queuedscript queued, $runningscript running, $stillinqueue idle [allowed \$pnoscript = $pnoscript]" >> "$jmscriptlog" 2>&1

    # continue if there are already enough processes or scripts in the queue
    if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
    then
      nextscript sleeptime $sleeptime
      continue
    fi
    # continue if the number of scripts in the queue is larger than (or equal to) the number which still has to be done
    if [ $numproc -le $stillinqueue ]
    then
      echo "$(date '+%F %T') \$numproc ($numproc) <= \$stillinqueue ($stillinqueue) " >> "$jmscriptlog" 2>&1
      nextscript sleeptime $sleeptime
      continue
    fi

    # reset prev
    prev=$max

    # submit 1 script to queuing system
    date=$(date +%Y-%m-%d)
    echo "$(date '+%F %T') committing 1 ${scripts[$i]} to $queuesys" >> "$jmscriptlog" 2>&1
    if ! queuesubmit 2>> "$jmerrorlog"
    then
      # same timestamp format as the other entry written to $jmerrorlog
      echo "$(date '+%F %T') WARN submitting job ($queuesys) failed" >> "$jmerrorlog"
      echo "$(date '+%F %T') WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
      # back off a bit more after every failed submission, up to the limit
      if [ $errorsleeptime -lt $sleeptimelimit ]
      then
        errorsleeptime=$(( errorsleeptime + errorsleeptimedefault ))
      fi
      nextscript errorsleeptime $errorsleeptime
      continue
    else
      errorsleeptime=$errorsleeptimedefault
    fi
    nextscript sleeptime $sleeptime
  done
done
230
Note: See TracBrowser for help on using the repository browser.