source: trunk/MagicSoft/Mars/datacenter/scripts/jobmanager@ 9397

Last change on this file since 9397 was 9355, checked in by Daniela Dorner, 16 years ago
*** empty log message ***
  • Property svn:executable set to *
File size: 8.6 KB
Line 
1#!/bin/sh
2#
3# ========================================================================
4#
5# *
6# * This file is part of MARS, the MAGIC Analysis and Reconstruction
7# * Software. It is distributed to you in the hope that it can be a useful
8# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
9# * It is distributed WITHOUT ANY WARRANTY.
10# *
11# * Permission to use, copy, modify and distribute this software and its
12# * documentation for any purpose is hereby granted without fee,
13# * provided that the above copyright notice appear in all copies and
14# * that both that copyright notice and this permission notice appear
15# * in supporting documentation. It is provided "as is" without express
16# * or implied warranty.
17# *
18#
19#
20# Author(s): Daniela Dorner 05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
21#
22# Copyright: MAGIC Software Development, 2000-2007
23#
24#
25# ========================================================================
26#
27# This is a script which launches other scripts (all scripts that are run
28# on a primary basis)
29#
30
# read the file 'sourcefile' located in the same directory as this script
# (printprocesslog and $jmscriptlog used below are not defined in this
# script - presumably they are provided by sourcefile; TODO confirm)
source $(dirname $0)/sourcefile
printprocesslog "INFO starting $0"

# noclobber: a plain '>' redirection refuses to overwrite an existing file
set -o noclobber

# mark the start of this jobmanager run in the jobmanager logfile
echo >> $jmscriptlog 2>&1
echo "starting jobmanager ("$(date)")" >> $jmscriptlog 2>&1
38
# decide which jobmanager you want to run
# setup of the different jobmanagers (which scripts they start)
# the number of jobs are defined in the file setup
#
# selectjobmanager MODE
#   MODE is the first commandline argument of jobmanager: 'data' or 'mc'.
#   Sets the global arrays
#     scripts         names of the scripts this jobmanager submits
#     scriptscolname  one entry per script, in the same order (presumably
#                     the name of the status column of that script)
#     pnosweek        the pno<script> arrays of all scripts, concatenated
#                     in the order of 'scripts'
#     pnoswe          same for the pno<script>we arrays
#   For any other MODE a message is written to $jmscriptlog, a warning is
#   passed to printprocesslog and finish is called.
#
# The case arms deliberately contain no 'break': a case arm is terminated
# by ';;', and 'break' outside of a loop only produces a warning in bash.
selectjobmanager ()
{
   case "$1" in
      data)
         echo "running jobmanager for data" >> "$jmscriptlog" 2>&1
         scripts=( "runganymed" "runstar" "runcallisto" ) # not used: "dodatacheck" "cutslices"
         scriptscolname=( "fGanymed" "fStar" "fCallisto" ) # not used: "fDataCheckDone" "fCompmux"
         pnosweek=( ${pnoganymed[@]} ${pnostar[@]} ${pnocallisto[@]} ) # not used: ${pnodatacheck[@]} ${pnocutslices[@]}
         pnoswe=( ${pnoganymedwe[@]} ${pnostarwe[@]} ${pnocallistowe[@]} ) # not used: ${pnodatacheckwe[@]} ${pnocutsliceswe[@]}
         ;;
      mc)
         echo "running jobmanager for mc" >> "$jmscriptlog" 2>&1
         scripts=( "runcorsika" "runreflector" "runcamera" )
         scriptscolname=( "fCorsikaFileAvail" "fReflectorFileAvail" "fCameraFileAvail" )
         pnosweek=( ${pnocorsika[@]} ${pnoreflector[@]} ${pnocamera[@]} )
         pnoswe=( ${pnocorsikawe[@]} ${pnoreflectorwe[@]} ${pnocamerawe[@]} )
         ;;
      *)
         echo "$1 is a wrong commandline option for jobmanager -> exit" >> "$jmscriptlog" 2>&1
         printprocesslog "WARN $1 is wrong commandline option for jobmanager"
         finish >> "$jmscriptlog" 2>&1
         ;;
   esac
}
selectjobmanager "$1"
63
# choose commands according to queueing system (defined in setup)
#
# setqueuecommands QUEUESYS
#   QUEUESYS is 'sge' or 'condor'.
#   Defines two aliases which are used further down in this script:
#     queuesubmit  submits one job to the queueing system
#     checkqueue   lists the jobs in the queue, formatted as
#                  Owner<owner> <command>Jobstatus<status>
#   The single-quoted alias texts contain `echo $var` substitutions on
#   purpose: they are evaluated when the alias is used, not when it is
#   defined, so they pick up the then current values of $date,
#   ${scripts[$i]}, $runlogpath and $scriptspath.
#   For any other QUEUESYS a message is written to $jmscriptlog, a warning
#   is passed to printprocesslog and finish is called.
#
# NOTE: in a non-interactive bash, aliases are only expanded in POSIX mode
# (e.g. when bash is started as sh) or after 'shopt -s expand_aliases'.
#
# The case arms deliberately contain no 'break': a case arm is terminated
# by ';;', and 'break' outside of a loop only produces a warning in bash.
setqueuecommands ()
{
   case "$1" in
      sge)
         echo "setting commands for sun grid engine" >> "$jmscriptlog" 2>&1
         alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
#        alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
         alias 'checkqueue'="/opt/gridengine/bin/lx26-amd64/qstat | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '"
         ;;
      condor)
         echo "setting commands for condor" >> "$jmscriptlog" 2>&1
         alias 'queuesubmit'='/usr/local/bin/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` `echo $scriptspath`/run.condor'
         alias 'checkqueue'='/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus '
         ;;
      *)
         # say why the jobmanager stops, like it is done for a wrong
         # commandline option
         echo "$1 is a wrong queueing system for jobmanager -> exit" >> "$jmscriptlog" 2>&1
         printprocesslog "WARN $1 is wrong queueing system for jobmanager"
         finish >> "$jmscriptlog" 2>&1
         ;;
   esac
}
setqueuecommands "$queuesys"
80
# ------------------------------------------------------------------------
# main loop
#
# Goes round and round over the scripts of the chosen jobmanager and
# submits at most one job per script and round. For each script:
#   1. sourcefile is read again
#   2. getstatus is called with 'column' set to the entry of scriptscolname
#      belonging to the script; afterwards $numproc is used as the number
#      of processes still to do (empty = nothing to do)
#   3. the queue is listed (checkqueue) and the jobs of this user, the
#      jobs of this script and the running jobs of this script are counted
#   4. the allowed numbers of jobs for the current hour are looked up:
#      totalpno from pnototal, pnoscript from the weekend (pnoswe) or
#      weekday (pnosweek) array
#   5. if the limits allow it and there is more to do than is already
#      waiting in the queue, one job is submitted (queuesubmit)
#
# getstatus, cont, printprocesslog and the variables max, sleeptime,
# sleeptimelimit, errorsleeptimedefault, pnototal, jmscriptlog, jmerrorlog,
# runlogpath and scriptspath are not defined in this script.
# NOTE(review): cont is presumably a function that continues with the next
# script of the for loop - confirm in sourcefile.
# ------------------------------------------------------------------------
prev=$max
user=$(whoami)
notcount=0
nothingtodocount=0
nothingtodosleeptime=0
errorsleeptime=$errorsleeptimedefault
# endless loop (notcount is never changed)
while (( notcount < 100 ))
do
   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
   do
      date >> "$jmscriptlog" 2>&1
      source $(dirname $0)/sourcefile
      echo "script: ${scripts[$i]}" >> "$jmscriptlog" 2>&1

      # check if there's something to do
      column=${scriptscolname[$i]}
      getstatus >> "$jmscriptlog" 2>&1
      echo " $numproc ${scripts[$i]} still to do" >> "$jmscriptlog" 2>&1
      if [ "$numproc" = "" ]
      then
         # remember that this script had nothing to do
         prev=0
         nothingtodocount=$(( nothingtodocount + 1 ))
         if [ $nothingtodocount -lt ${#scripts[@]} ]
         then
            cont >> "$jmscriptlog" 2>&1
         else
            # none of the scripts had something to do for a whole round:
            # sleep, and sleep longer each time until sleeptimelimit is reached
            if [ $nothingtodosleeptime -lt $sleeptimelimit ]
            then
               nothingtodosleeptime=$(echo " $nothingtodocount * $sleeptime " | bc)
            fi
            echo "sleeping $nothingtodosleeptime" >> "$jmscriptlog" 2>&1
            sleep $nothingtodosleeptime
            cont >> "$jmscriptlog" 2>&1
         fi
      else
         nothingtodocount=0
         nothingtodosleeptime=0
      fi

      echo "sleeping $sleeptime..." >> "$jmscriptlog" 2>&1
      sleep $sleeptime

      # get processes in queue
      # (the result is split into words: Owner<owner> and <command>Jobstatus<status>)
#     q=(`/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus 2>&1 `)
#     q=(`/opt/gridengine/bin/lx26-amd64/qstat | awk ' { print "Owner"$4" " $3"Jobstatus"$5 } ' 2>&1 `)
      q=($(checkqueue 2>&1 ))
      # NOTE(review): $q is only the first word of the listing, so only an
      # error message starting with a matching word is detected; the match
      # itself is printed to stdout - confirm that both are intended
      if echo $q | grep -E '(Error|failed)'
      then
         echo $(date)" WARN checking query ($queuesys) failed" >> "$jmscriptlog" 2>&1
         printprocesslog "WARN checking query ($queuesys) failed"
         echo $(date)" WARN checking query ($queuesys) failed" >> "$jmerrorlog"
         cont >> "$jmscriptlog" 2>&1
      fi
      # get processes of user in queue
      # NOTE(review): this also counts owners whose name only starts with $user
      q1=($(echo ${q[@]} | grep -E -o Owner$user))
      queued=${#q1[@]}
      # get scripts in queue
      q2=($(echo ${q[@]} | grep -E -o ${scripts[$i]}))
      queuedscript=${#q2[@]}
      # get running scripts (jobstatus 2 or r)
      q3=( $(echo ${q[@]} | grep -E -o "(${scripts[$i]}Jobstatus2|${scripts[$i]}Jobstatusr)") )
      runningscript=${#q3[@]}
      stillinqueue=$(( queuedscript - runningscript ))

      # get total number of allowed processes for current time
      # (%k is the hour without leading zero, so it is never taken as octal)
      hour=$(date +%k)
      totalpno=${pnototal[$hour]}
      # choose array according to the day of the week
      # date +%u counts from 1 (Monday) to 7 (Sunday): weekend is 6 and 7
      dayofweek=$(date +%u)
      case $dayofweek in
         6 | 7) pnos=( ${pnoswe[@]} ) ;;
         *) pnos=( ${pnosweek[@]} ) ;;
      esac
      # get number of allowed scripts for current time
      # (pnos holds 24 values per script, one for each hour)
      num=$(( i * 24 + hour ))
      pnoscript=${pnos[$num]}
      # if there was nothing to do for previous script, more scripts can be allowed
      if [ $prev -eq 0 ]
      then
         echo " prev=0 => resetting pnoscript [$pnoscript] to max [$max]" >> "$jmscriptlog" 2>&1
         pnoscript=$max
      fi
      echo " found $queued jobs in the queue (incl. running jobs) [allowed $totalpno]" >> "$jmscriptlog" 2>&1
      echo " found $queuedscript ${scripts[$i]} in the queue (incl. running jobs [$runningscript]) [allowed $pnoscript] - not running: $stillinqueue" >> "$jmscriptlog" 2>&1

      # continue if there are already enough processes or scripts in the queue
      if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
      then
         cont >> "$jmscriptlog" 2>&1
      fi
      # continue if the number of scripts in the queue is larger than (or equal to) the number which still has to be done
      if [ $numproc -le $stillinqueue ]
      then
         echo " numproc($numproc) -le stillinqueue($stillinqueue)" >> "$jmscriptlog" 2>&1
         cont >> "$jmscriptlog" 2>&1
      fi

      # reset prev
      prev=$max

      # submit 1 script to queuing system
      # ($date is used by the queuesubmit alias)
      date=$(date +%Y-%m-%d)
      echo " committing 1 ${scripts[$i]} to $queuesys" >> "$jmscriptlog" 2>&1
#     if ! /usr/local/bin/condor_submit -a path=$scriptspath -a prog=${scripts[$i]} -a date=$date -a dir=$runlogpath $scriptspath/run.condor 2>> $jmerrorlog
#     if ! /opt/gridengine/bin/lx26-amd64/qsub -e $runlogpath/error-$date.log -o $runlogpath/log-$(date).log 2>> $jmerrorlog
      if ! queuesubmit 2>> "$jmerrorlog"
      then
         echo $(date)" WARN submitting job ($queuesys) failed" >> "$jmerrorlog"
         echo "$queuesys is not working -> sleeping $errorsleeptime" >> "$jmscriptlog" 2>&1
         printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
         # sleep longer after each failed submission until sleeptimelimit is reached
         if [ $errorsleeptime -lt $sleeptimelimit ]
         then
            errorsleeptime=$(echo " $errorsleeptime + $errorsleeptimedefault " | bc)
         fi
         sleep $errorsleeptime
      else
         errorsleeptime=$errorsleeptimedefault
      fi
      date >> "$jmscriptlog" 2>&1
      echo "" >> "$jmscriptlog" 2>&1
   done
done
204
Note: See TracBrowser for help on using the repository browser.