source: trunk/MagicSoft/Mars/datacenter/scripts/jobmanager@ 9474

Last change on this file since 9474 was 9474, checked in by Daniela Dorner, 15 years ago
*** empty log message ***
  • Property svn:executable set to *
File size: 9.0 KB
Line 
1#!/bin/sh
2#
3# ========================================================================
4#
5# *
6# * This file is part of MARS, the MAGIC Analysis and Reconstruction
7# * Software. It is distributed to you in the hope that it can be a useful
8# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
9# * It is distributed WITHOUT ANY WARRANTY.
10# *
11# * Permission to use, copy, modify and distribute this software and its
12# * documentation for any purpose is hereby granted without fee,
13# * provided that the above copyright notice appear in all copies and
14# * that both that copyright notice and this permission notice appear
15# * in supporting documentation. It is provided "as is" without express
16# * or implied warranty.
17# *
18#
19#
20# Author(s): Daniela Dorner 05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
21#
22# Copyright: MAGIC Software Development, 2000-2007
23#
24#
25# ========================================================================
26#
27# This is a script which launches other scripts (all scripts that are run
28# on a primary basis).
29#
30
# Pull in the file 'sourcefile' that lives next to this script; the helpers
# and variables used below (printprocesslog, $jmscriptlog, ...) are not
# defined in this file and presumably come from there.
source $(dirname $0)/sourcefile
printprocesslog "INFO starting $0"

# noclobber: a plain '>' redirection must not overwrite an existing file
set -o noclobber

# start a new section in the jobmanager log
# ($(date) is left unquoted on purpose: its output is word-split and
# re-joined by echo with single blanks)
{
  echo ""
  echo "starting jobmanager ("$(date)")"
} >> $jmscriptlog 2>&1

# Decide which jobmanager to run, based on the first command line argument.
# Each jobmanager is described by four arrays:
#   scripts        - names of the scripts this jobmanager submits
#   scriptscolname - per script, the status column handed to getstatus
#   pnosweek       - allowed numbers of processes on weekdays
#   pnoswe         - allowed numbers of processes on weekends
# pnosweek/pnoswe are the per-script arrays concatenated in the same order
# as 'scripts'; the numbers themselves are defined in the file setup.
# (No 'break' at the end of the arms: 'case' is not a loop, ';;' already
# terminates an arm.)
case $1 in
  data)
    echo "running jobmanager for data" >> $jmscriptlog 2>&1
    scripts=( "runganymed" "runstar" "runcallisto" ) # not used: "dodatacheck" "cutslices"
    scriptscolname=( "fGanymed" "fStar" "fCallisto" ) # not used: "fDataCheckDone" "fCompmux"
    pnosweek=( ${pnoganymed[@]} ${pnostar[@]} ${pnocallisto[@]} ) # not used: ${pnodatacheck[@]} ${pnocutslices[@]}
    pnoswe=( ${pnoganymedwe[@]} ${pnostarwe[@]} ${pnocallistowe[@]} ) # not used: ${pnodatacheckwe[@]} ${pnocutsliceswe[@]}
    ;;
  mc)
    echo "running jobmanager for mc" >> $jmscriptlog 2>&1
    scripts=( "runcorsika" "runreflector" "runcamera" )
    scriptscolname=( "fCorsikaFileAvail" "fReflectorFileAvail" "fCameraFileAvail" )
    pnosweek=( ${pnocorsika[@]} ${pnoreflector[@]} ${pnocamera[@]} )
    pnoswe=( ${pnocorsikawe[@]} ${pnoreflectorwe[@]} ${pnocamerawe[@]} )
    ;;
  ctamc)
    echo "running jobmanager for cta mc" >> $jmscriptlog 2>&1
    scripts=( "runsimtel" )
    scriptscolname=( "fCorsikaSimTelarray" )
    pnosweek=( ${pnosimtel[@]} )
    pnoswe=( ${pnosimtelwe[@]} )
    ;;
  *)
    # unknown option: report it to the log, to stdout and to the process log
    echo "'$1' is a wrong commandline option for jobmanager -> exit" >> $jmscriptlog 2>&1
    echo "'$1' is a wrong commandline option for jobmanager -> exit"
    printprocesslog "WARN '$1' is a wrong commandline option for jobmanager"
    # NOTE(review): finish is defined outside this file; it is expected to
    # terminate the script - confirm.
    finish >> $jmscriptlog 2>&1
    ;;
esac
71
# Choose the commands according to the queueing system (defined in setup).
# Two commands are provided for the main loop:
#   queuesubmit - submit ${scripts[$i]} to the queue
#   checkqueue  - print the queue content, one job per line, in the form
#                 "Owner<user> <command>Jobstatus<status>"
# Both read $runlogpath, $scriptspath, $date, $i and 'scripts' at call time.
# They are shell functions rather than aliases: functions do not depend on
# alias expansion being enabled, and a redirection at the call site applies
# to the whole command, including every stage of a pipeline.
case $queuesys in
  sge)
    echo "setting commands for sun grid engine" >> $jmscriptlog 2>&1
    queuesubmit() {
      /opt/gridengine/bin/lx26-amd64/qsub -b y \
        -e "$runlogpath/error-$date.log" \
        -o "$runlogpath/log-$date.log" \
        "$scriptspath/${scripts[$i]}"
    }
#    alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
    checkqueue() {
      /opt/gridengine/bin/lx26-amd64/qstat \
        | awk ' { print "Owner"$4" " $3"Jobstatus"$5 } '
    }
    ;;
  condor)
    echo "setting commands for condor" >> $jmscriptlog 2>&1
    queuesubmit() {
      /usr/local/bin/condor_submit \
        -a "path=$scriptspath" \
        -a "prog=${scripts[$i]}" \
        -a "date=$date" \
        -a "dir=$runlogpath" \
        "$scriptspath/run.condor"
    }
    checkqueue() {
      /usr/local/bin/condor_q -global \
        -format "Owner%s " Owner \
        -format "%s" CMD \
        -format "Jobstatus%s\n" Jobstatus
    }
    ;;
  *)
    # unknown queueing system
    # NOTE(review): finish is defined outside this file; it is expected to
    # terminate the script - confirm.
    finish >> $jmscriptlog 2>&1
    ;;
esac
88
# ---------------------------------------------------------------------------
# Scheduling loop.
#
# For each script of the selected jobmanager, over and over again:
#   1. ask getstatus how much is still to do for the script's status column
#   2. look at the queue (checkqueue) and count the user's jobs, the jobs of
#      this script and how many of those are running
#   3. compare with the numbers allowed for the current hour and day
#   4. if there is room, submit exactly one job (queuesubmit)
#
# getstatus, cont, printprocesslog and the variables max, sleeptime,
# sleeptimelimit, errorsleeptimedefault, totalmax, pnototal, jmscriptlog and
# jmerrorlog are not defined in this file.
# NOTE(review): from its use below, cont is expected to skip to the next
# script of the for loop, and getstatus to set numproc for $column - confirm.
# ---------------------------------------------------------------------------

# prev = 0 means: the previously checked script had nothing to do
prev=$max
user=$(whoami)
# endless loop: notcount is never modified, so the loop condition stays true
notcount=0
# number of consecutive scripts for which nothing was to do
nothingtodocount=0
nothingtodosleeptime=0
# back-off time after a failed submission
errorsleeptime=$errorsleeptimedefault
while (( $notcount < 100 ))
do
  for (( i=0 ; i < ${#scripts[@]} ; i++ ))
  do
    date >> $jmscriptlog 2>&1
    # sourcefile is read again on every pass
    source $(dirname $0)/sourcefile
    echo "script: ${scripts[$i]}" >> $jmscriptlog 2>&1

    # check if there's something to do
    column=${scriptscolname[$i]}
    getstatus >> $jmscriptlog 2>&1
    echo " $numproc ${scripts[$i]} still to do" >> $jmscriptlog 2>&1
    if [ "$numproc" = "" ]
    then
      prev=0
      nothingtodocount=$(( nothingtodocount + 1 ))
      if [ $nothingtodocount -lt ${#scripts[@]} ]
      then
        cont >> $jmscriptlog 2>&1
      else
        # nothing to do for any script: sleep, the longer the more often
        # this happened in a row (growth stops at sleeptimelimit)
        if [ $nothingtodosleeptime -lt $sleeptimelimit ]
        then
          nothingtodosleeptime=$(echo " $nothingtodocount * $sleeptime " | bc)
        fi
        echo "sleeping $nothingtodosleeptime" >> $jmscriptlog 2>&1
        sleep $nothingtodosleeptime
        cont >> $jmscriptlog 2>&1
      fi
    else
      nothingtodocount=0
      nothingtodosleeptime=0
    fi

    echo "sleeping $sleeptime..." >> $jmscriptlog 2>&1
    sleep $sleeptime

    # get processes in queue
#    q=(`/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus 2>&1 `)
#    q=(`/opt/gridengine/bin/lx26-amd64/qstat | awk ' { print "Owner"$4" " $3"Jobstatus"$5 } ' 2>&1 `)
    q=( $(checkqueue 2>&1) )
    # look for an error message anywhere in the output; a bare $q would
    # expand to the first word (${q[0]}) only
    if echo "${q[@]}" | grep -E '(Error|failed)'
    then
      echo $(date)" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1
      printprocesslog "WARN checking query ($queuesys) failed"
      echo $(date)" WARN checking query ($queuesys) failed" >> $jmerrorlog
      cont >> $jmscriptlog 2>&1
    fi
    # get processes of user in queue
    q1=( $(echo ${q[@]} | grep -E -o "Owner$user") )
    queued=${#q1[@]}
    # get scripts in queue
    q2=( $(echo ${q[@]} | grep -E -o "${scripts[$i]}") )
    queuedscript=${#q2[@]}
    # get running scripts (job status '2' or 'r')
    q3=( $(echo ${q[@]} | grep -E -o "(${scripts[$i]}Jobstatus2|${scripts[$i]}Jobstatusr)") )
    runningscript=${#q3[@]}
    stillinqueue=$(( queuedscript - runningscript ))

    # get total number of allowed processes for current time
    # (%k: hour 0..23, blank padded)
    hour=$(date +%k)
    #totalpno=${pnototal[$hour]}
    if [ ${pnototal[$hour]} -lt $totalmax ]
    then
      totalpno=${pnototal[$hour]}
    else
      totalpno=$totalmax
    fi

    # choose array according to the day of the week
    # (%u: 1 = Monday ... 6 = Saturday, 7 = Sunday)
    dayofweek=$(date +%u)
    case $dayofweek in
      6 | 7) pnos=( ${pnoswe[@]} ) ;;
      *) pnos=( ${pnosweek[@]} ) ;;
    esac
    # get number of allowed scripts for current time:
    # pnos holds 24 hourly values per script, script after script
    num=$(( i * 24 + hour ))
    pnoscript=${pnos[$num]}
    # if there was nothing to do for previous script, more scripts can be allowed
    if [ $prev -eq 0 ]
    then
      echo " prev=0 => resetting pnoscript [$pnoscript] to max [$max]" >> $jmscriptlog 2>&1
      pnoscript=$max
    fi
    echo " found $queued jobs in the queue (incl. running jobs) [allowed $totalpno]" >> $jmscriptlog 2>&1
    echo " found $queuedscript ${scripts[$i]} in the queue (incl. running jobs [$runningscript]) [allowed $pnoscript] - not running: $stillinqueue" >> $jmscriptlog 2>&1

    # continue if there are already enough processes or scripts in the queue
    if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
    then
      cont >> $jmscriptlog 2>&1
    fi
    # continue if the number of scripts waiting in the queue is larger than
    # (or equal to) the number which still has to be done
    if [ $numproc -le $stillinqueue ]
    then
      echo " numproc($numproc) -le stillinqueue($stillinqueue)" >> $jmscriptlog 2>&1
      cont >> $jmscriptlog 2>&1
    fi

    # reset prev
    prev=$max

    # submit 1 script to queuing system
    # ($date is also read by queuesubmit)
    date=$(date +%Y-%m-%d)
    echo " committing 1 ${scripts[$i]} to $queuesys" >> $jmscriptlog 2>&1
#    if ! /usr/local/bin/condor_submit -a path=$scriptspath -a prog=${scripts[$i]} -a date=$date -a dir=$runlogpath $scriptspath/run.condor 2>> $jmerrorlog
#    if ! /opt/gridengine/bin/lx26-amd64/qsub -e $runlogpath/error-$date.log -o $runlogpath/log-$(date).log 2>> $jmerrorlog
    if ! queuesubmit 2>> $jmerrorlog
    then
      echo $(date)" WARN submitting job ($queuesys) failed" >> $jmerrorlog
      echo "$queuesys is not working -> sleeping $errorsleeptime" >> $jmscriptlog 2>&1
      printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
      # back off a bit longer after every failure (growth stops at sleeptimelimit)
      if [ $errorsleeptime -lt $sleeptimelimit ]
      then
        errorsleeptime=$(echo " $errorsleeptime + $errorsleeptimedefault " | bc)
      fi
      sleep $errorsleeptime
    else
      errorsleeptime=$errorsleeptimedefault
    fi
    date >> $jmscriptlog 2>&1
    echo "" >> $jmscriptlog 2>&1
  done
done
219
Note: See TracBrowser for help on using the repository browser.