source: trunk/MagicSoft/Mars/datacenter/scripts/jobmanager@ 9457

Last change on this file since 9457 was 9457, checked in by Daniela Dorner, 16 years ago
*** empty log message ***
  • Property svn:executable set to *
File size: 8.9 KB
Line 
1#!/bin/sh
2#
3# ========================================================================
4#
5# *
6# * This file is part of MARS, the MAGIC Analysis and Reconstruction
7# * Software. It is distributed to you in the hope that it can be a useful
8# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
9# * It is distributed WITHOUT ANY WARRANTY.
10# *
11# * Permission to use, copy, modify and distribute this software and its
12# * documentation for any purpose is hereby granted without fee,
13# * provided that the above copyright notice appear in all copies and
14# * that both that copyright notice and this permission notice appear
15# * in supporting documentation. It is provided "as is" without express
16# * or implied warranty.
17# *
18#
19#
20# Author(s): Daniela Dorner 05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
21#
22# Copyright: MAGIC Software Development, 2000-2007
23#
24#
25# ========================================================================
26#
27# This is a script which launches other scripts (all scripts that are run
28# on a primary basis).
29#
30
# Load the common definitions from the file 'sourcefile' which is located
# in the same directory as this script. The helpers and log file variables
# used further down (printprocesslog, jmscriptlog, ...) are not defined in
# this script, so they are presumably provided by it.
# The path is quoted so that a directory name with blanks does not break it.
source "$(dirname "$0")/sourcefile"
printprocesslog "INFO starting $0"

# noclobber: a plain '>' redirection must not overwrite an existing file
set -C

echo "" >> "$jmscriptlog" 2>&1
echo "starting jobmanager ("$(date)")" >> "$jmscriptlog" 2>&1
38
# decide which jobmanager you want to run
# setup of the different jobmanagers (which scripts they start)
# the number of jobs are defined in the file setup
#
# selectjobmanager <data|mc|ctamc>
#   Sets the global arrays
#     scripts        - names of the scripts this jobmanager launches
#     scriptscolname - column name belonging to each script
#     pnosweek       - concatenation of the pno<script> arrays
#     pnoswe         - concatenation of the pno<script>we arrays
#   For any other argument a warning is written to the logs and finish is
#   called.
#   The pno* arrays are expanded unquoted on purpose, exactly as before, so
#   that the resulting word lists stay unchanged.
#   Note: there is no 'break' in the branches; this case statement is not
#   inside a loop, and 'break' there only makes the shell print a warning.
selectjobmanager()
{
  case "$1" in
    data)
      echo "running jobmanager for data" >> "$jmscriptlog" 2>&1
      scripts=( "runganymed" "runstar" "runcallisto" ) # not used: "dodatacheck" "cutslices"
      scriptscolname=( "fGanymed" "fStar" "fCallisto" ) # not used: "fDataCheckDone" "fCompmux"
      pnosweek=( ${pnoganymed[@]} ${pnostar[@]} ${pnocallisto[@]} ) # not used: ${pnodatacheck[@]} ${pnocutslices[@]}
      pnoswe=( ${pnoganymedwe[@]} ${pnostarwe[@]} ${pnocallistowe[@]} ) # not used: ${pnodatacheckwe[@]} ${pnocutsliceswe[@]}
      ;;
    mc)
      echo "running jobmanager for mc" >> "$jmscriptlog" 2>&1
      scripts=( "runcorsika" "runreflector" "runcamera" )
      scriptscolname=( "fCorsikaFileAvail" "fReflectorFileAvail" "fCameraFileAvail" )
      pnosweek=( ${pnocorsika[@]} ${pnoreflector[@]} ${pnocamera[@]} )
      pnoswe=( ${pnocorsikawe[@]} ${pnoreflectorwe[@]} ${pnocamerawe[@]} )
      ;;
    ctamc)
      echo "running jobmanager for cta mc" >> "$jmscriptlog" 2>&1
      scripts=( "runsimtel" )
      scriptscolname=( "fCorsikaSimtelArray" )
      pnosweek=( ${pnosimtel[@]} )
      pnoswe=( ${pnosimtelwe[@]} )
      ;;
    *)
      echo "$1 is a wrong commandline option for jobmanager -> exit" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN $1 is wrong commandline option for jobmanager"
      finish >> "$jmscriptlog" 2>&1
      ;;
  esac
}

selectjobmanager "$1"
70
# choose commands according to queueing system (defined in setup)
#
# setqueuecommands defines, depending on $queuesys, the two commands
#   queuesubmit - submits the script ${scripts[$i]} to the queueing system;
#                 reads scriptspath, runlogpath, date, scripts and i at
#                 the time it is called
#   checkqueue  - prints for each job in the queue owner, command and job
#                 status in the form "Owner<owner> <cmd>Jobstatus<status>"
# For an unknown queueing system a warning is logged and finish is called.
#
# The commands are shell functions instead of aliases: aliases are only
# expanded in scripts when bash runs in POSIX mode or expand_aliases is
# set, and with an alias a redirection given by the caller only applies to
# the last command of the aliased pipeline (the stderr of qstat was lost).
# There is no 'break' in the branches; this case statement is not inside
# a loop, and 'break' there only makes the shell print a warning.

# remove aliases of the same name (if any), so that the function
# definitions below are parsed as such and are the ones being called
unalias queuesubmit checkqueue 2>/dev/null

setqueuecommands()
{
  case "$queuesys" in
    sge)
      echo "setting commands for sun grid engine" >> "$jmscriptlog" 2>&1
      # alternative (not used): submit $scriptspath/job.sge with
      #   -sc runlogpath=... -sc date=... -sc scriptspath=... -sc script=...
      queuesubmit()
      {
        /opt/gridengine/bin/lx26-amd64/qsub -b y \
          -e "$runlogpath/error-$date.log" \
          -o "$runlogpath/log-$date.log" \
          "$scriptspath/${scripts[$i]}"
      }
      checkqueue()
      {
        /opt/gridengine/bin/lx26-amd64/qstat \
          | awk ' { print "Owner"$4" " $3"Jobstatus"$5 } '
      }
      ;;
    condor)
      echo "setting commands for condor" >> "$jmscriptlog" 2>&1
      queuesubmit()
      {
        /usr/local/bin/condor_submit \
          -a "path=$scriptspath" \
          -a "prog=${scripts[$i]}" \
          -a "date=$date" \
          -a "dir=$runlogpath" \
          "$scriptspath/run.condor"
      }
      checkqueue()
      {
        /usr/local/bin/condor_q -global \
          -format "Owner%s " Owner \
          -format "%s" CMD \
          -format "Jobstatus%s\n" Jobstatus
      }
      ;;
    *)
      # same kind of diagnostics as for a wrong commandline option
      echo "$queuesys is an unknown queueing system for jobmanager -> exit" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN $queuesys is an unknown queueing system for jobmanager"
      finish >> "$jmscriptlog" 2>&1
      ;;
  esac
}

setqueuecommands
87
# Main scheduling loop of the jobmanager.
#
# For each entry of the array scripts one pass of the inner loop
#   1. calls getstatus and looks at numproc (empty: nothing to do),
#   2. asks the queueing system (checkqueue) what is queued and running,
#   3. looks up the limits for the current hour and day of the week,
#   4. submits one job (queuesubmit) if the limits allow it.
# getstatus, cont and printprocesslog as well as the settings (max,
# sleeptime, sleeptimelimit, errorsleeptimedefault, pnototal, totalmax,
# jmscriptlog, jmerrorlog, ...) are defined outside of this loop.
#
# Every call of cont is followed by an explicit 'continue'. The comments
# of the individual checks say that the loop has to continue there; cont
# itself is not visible here (presumably it does the continue). Since a
# function cannot continue the loop of its caller in bash >= 4.4, the
# explicit statement makes sure the rest of the pass is really skipped.
prev=$max
user=$(whoami)
# endless loop (notcount is not modified anywhere in this loop)
notcount=0
nothingtodocount=0
nothingtodosleeptime=0
errorsleeptime=$errorsleeptimedefault
while (( notcount < 100 ))
do
  for (( i=0 ; i < ${#scripts[@]} ; i++ ))
  do
    date >> "$jmscriptlog" 2>&1
    # sourcefile is read again in every pass
    source "$(dirname "$0")/sourcefile"
    echo "script: ${scripts[$i]}" >> "$jmscriptlog" 2>&1

    # check if there's something to do
    column=${scriptscolname[$i]}
    getstatus >> "$jmscriptlog" 2>&1
    echo " $numproc ${scripts[$i]} still to do" >> "$jmscriptlog" 2>&1
    if [ -z "$numproc" ]
    then
      prev=0
      nothingtodocount=$(( nothingtodocount + 1 ))
      if [ "$nothingtodocount" -ge "${#scripts[@]}" ]
      then
        # nothing to do for any of the scripts: sleep, with a sleep time
        # growing with the number of idle passes up to sleeptimelimit
        # (bc is kept in case the sleep times are not plain integers)
        if [ "$nothingtodosleeptime" -lt "$sleeptimelimit" ]
        then
          nothingtodosleeptime=$(echo " $nothingtodocount * $sleeptime " | bc)
        fi
        echo "sleeping $nothingtodosleeptime" >> "$jmscriptlog" 2>&1
        sleep "$nothingtodosleeptime"
      fi
      cont >> "$jmscriptlog" 2>&1
      continue
    else
      nothingtodocount=0
      nothingtodosleeptime=0
    fi

    echo "sleeping $sleeptime..." >> "$jmscriptlog" 2>&1
    sleep "$sleeptime"

    # get processes in queue (split into words on purpose)
    q=( $(checkqueue 2>&1) )
    # look for an error message in the whole output, not only in the
    # first word, and do not print the matching text to stdout
    if echo "${q[*]}" | grep -Eq '(Error|failed)'
    then
      echo $(date)" WARN checking query ($queuesys) failed" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN checking query ($queuesys) failed"
      echo $(date)" WARN checking query ($queuesys) failed" >> "$jmerrorlog"
      cont >> "$jmscriptlog" 2>&1
      continue
    fi
    # get processes of user in queue
    q1=( $(echo "${q[@]}" | grep -Eo "Owner$user") )
    queued=${#q1[@]}
    # get scripts in queue
    q2=( $(echo "${q[@]}" | grep -Eo "${scripts[$i]}") )
    queuedscript=${#q2[@]}
    # get running scripts (job status 2 or r)
    q3=( $(echo "${q[@]}" | grep -Eo "(${scripts[$i]}Jobstatus2|${scripts[$i]}Jobstatusr)") )
    runningscript=${#q3[@]}
    stillinqueue=$(( queuedscript - runningscript ))

    # get total number of allowed process for current time
    # (%k is the hour without leading zero, blank padded; the arithmetic
    # expansion removes the blank)
    hour=$(( $(date +%k) ))
    if [ "${pnototal[$hour]}" -lt "$totalmax" ]
    then
      totalpno=${pnototal[$hour]}
    else
      totalpno=$totalmax
    fi

    # choose array according to the day of the week
    # date +%u yields 1 (Monday) ... 7 (Sunday), i.e. the weekend is 6 and 7
    dayofweek=$(date +%u)
    case $dayofweek in
      6 | 7) pnos=( ${pnoswe[@]} ) ;;
      *)     pnos=( ${pnosweek[@]} ) ;;
    esac
    # get number of allowed scripts for current time
    # (24 consecutive entries per script, one for each hour)
    num=$(( i * 24 + hour ))
    pnoscript=${pnos[$num]}
    # if there was nothing to do for previous script, more scripts can be allowed
    if [ "$prev" -eq 0 ]
    then
      echo " prev=0 => resetting pnoscript [$pnoscript] to max [$max]" >> "$jmscriptlog" 2>&1
      pnoscript=$max
    fi
    echo " found $queued jobs in the queue (incl. running jobs) [allowed $totalpno]" >> "$jmscriptlog" 2>&1
    echo " found $queuedscript ${scripts[$i]} in the queue (incl. running jobs [$runningscript]) [allowed $pnoscript] - not running: $stillinqueue" >> "$jmscriptlog" 2>&1

    # continue if there are already enough processes or scripts in the queue
    if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
    then
      cont >> "$jmscriptlog" 2>&1
      continue
    fi
    # continue if the number of scripts in the queue is larger than (or equal to)
    # the number which still has to be done
    if [ "$numproc" -le "$stillinqueue" ]
    then
      echo " numproc($numproc) -le stillinqueue($stillinqueue)" >> "$jmscriptlog" 2>&1
      cont >> "$jmscriptlog" 2>&1
      continue
    fi

    # reset prev
    prev=$max

    # submit 1 script to queuing system
    # (the variable date is read by queuesubmit)
    date=$(date +%Y-%m-%d)
    echo " committing 1 ${scripts[$i]} to $queuesys" >> "$jmscriptlog" 2>&1
    if ! queuesubmit 2>> "$jmerrorlog"
    then
      echo $(date)" WARN submitting job ($queuesys) failed" >> "$jmerrorlog"
      echo "$queuesys is not working -> sleeping $errorsleeptime" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
      # increase the sleep time with every failure up to sleeptimelimit
      if [ "$errorsleeptime" -lt "$sleeptimelimit" ]
      then
        errorsleeptime=$(echo " $errorsleeptime + $errorsleeptimedefault " | bc)
      fi
      sleep "$errorsleeptime"
    else
      errorsleeptime=$errorsleeptimedefault
    fi
    date >> "$jmscriptlog" 2>&1
    echo "" >> "$jmscriptlog" 2>&1
  done
done
218
Note: See TracBrowser for help on using the repository browser.