source: trunk/MagicSoft/Mars/datacenter/scripts/jobmanager@ 9122

Last change on this file since 9122 was 9122, checked in by Daniela Dorner, 16 years ago
*** empty log message ***
  • Property svn:executable set to *
File size: 6.1 KB
Line 
#!/bin/bash
# bash is required: the script uses arrays, (( )) arithmetic, C-style for
# loops and the 'source' builtin, none of which a plain POSIX sh guarantees.
2#
3# ========================================================================
4#
5# *
6# * This file is part of MARS, the MAGIC Analysis and Reconstruction
7# * Software. It is distributed to you in the hope that it can be a useful
8# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
9# * It is distributed WITHOUT ANY WARRANTY.
10# *
11# * Permission to use, copy, modify and distribute this software and its
12# * documentation for any purpose is hereby granted without fee,
13# * provided that the above copyright notice appear in all copies and
14# * that both that copyright notice and this permission notice appear
15# * in supporting documentation. It is provided "as is" without express
16# * or implied warranty.
17# *
18#
19#
20# Author(s): Daniela Dorner 05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
21#
22# Copyright: MAGIC Software Development, 2000-2007
23#
24#
25# ========================================================================
26#
# This is a script which launches other scripts (all scripts that are run
# on a primary basis).
29#
30
# Load the shared settings and helper functions that live next to this
# script. Everything used below but not defined in this file (e.g.
# printprocesslog, jmscriptlog) is expected to come from sourcefile.
# The path is quoted so that an installation directory containing
# whitespace does not break the lookup.
source "$(dirname "$0")/sourcefile"
printprocesslog "INFO starting $0"

# noclobber: a plain '>' redirection must not overwrite an existing file;
# all log output in this script is appended with '>>'.
set -C

echo "" >> "$jmscriptlog" 2>&1
# $(date) is left unquoted on purpose: word splitting collapses the padding
# blanks in the timestamp, which keeps the format of existing log entries.
echo "starting jobmanager ("$(date)")" >> "$jmscriptlog" 2>&1
38
#######################################
# Choose which set of scripts the jobmanager has to handle.
# Globals:
#   datascripts, datascriptscolname, mcscripts, mcscriptscolname (read)
#   jmscriptlog (read)
#   scripts, scriptscolname (written)
# Arguments:
#   $1 - empty for data processing, "mc" for Monte Carlo processing;
#        anything else is logged as a wrong option and handed to 'finish'
# Outputs:
#   appends a status line to $jmscriptlog
#######################################
jm_select_scripts() {
  case "$1" in
    "")
      echo "running jobmanager for data" >> "$jmscriptlog" 2>&1
      # The copies are intentionally unquoted: the lists are split on
      # whitespace exactly as before, whether the configuration defines
      # them as arrays or as plain strings.
      scripts=( ${datascripts[@]} )
      scriptscolname=( ${datascriptscolname[@]} )
      ;;
    mc)
      echo "running jobmanager for mc" >> "$jmscriptlog" 2>&1
      scripts=( ${mcscripts[@]} )
      scriptscolname=( ${mcscriptscolname[@]} )
      ;;
    *)
      echo "$1 is a wrong commandline option for jobmanager -> exit" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN $1 is wrong commandline option for jobmanager"
      # 'finish' is not defined in this file; presumably it terminates the
      # script - confirm against sourcefile.
      finish >> "$jmscriptlog" 2>&1
      ;;
  esac
}

jm_select_scripts "$1"
54
# ------------------------------------------------------------------------
# Main scheduling loop
#
# For every entry of ${scripts[@]} one pass of the inner loop
#   1. re-reads sourcefile and calls getstatus, after which $numproc is
#      inspected (empty means: nothing to do for this script),
#   2. counts the jobs in the condor queue: all jobs of the current user,
#      all jobs of the current script and the running jobs of that script,
#   3. looks up the limits for the current hour / day of the week, and
#   4. submits one job of the script to condor if the limits allow it.
#
# Names that are used but not defined in this file (getstatus, cont,
# printprocesslog, numproc, max, sleeptime, sleeptimelimit, pnototal,
# pnoswe, pnosweek, scriptspath, runlogpath, jmerrorlog, errorsleeptime,
# errorsleeptimedefault) are expected to be provided by sourcefile.
#
# 'cont' is presumably a helper that skips to the next script of the inner
# loop - confirm against sourcefile; the code below relies on that.
# ------------------------------------------------------------------------

# prev == 0 marks that the previously handled script had nothing to do.
prev=$max
user=$(whoami)

# notcount is never changed in this file, so unless a sourced helper
# modifies it the outer loop is endless.
# NOTE(review): with an empty scripts list the inner loop never runs and
# the outer loop spins without sleeping.
notcount=0
nothingtodocount=0
nothingtodosleeptime=0

while (( notcount < 100 )); do
  for (( i = 0; i < ${#scripts[@]}; i++ )); do
    date >> "$jmscriptlog" 2>&1
    # re-read the settings on every pass
    source "$(dirname "$0")/sourcefile"
    echo "script: ${scripts[$i]}" >> "$jmscriptlog" 2>&1

    # check if there's something to do
    # ('column' is not read in this file; presumably getstatus uses it)
    column=${scriptscolname[$i]}
    getstatus >> "$jmscriptlog" 2>&1
    echo " $numproc ${scripts[$i]} still do to" >> "$jmscriptlog" 2>&1
    if [ -z "$numproc" ]; then
      prev=0
      nothingtodocount=$(( nothingtodocount + 1 ))
      # Once every script in a row reported "nothing to do", back off:
      # the sleep time grows with the number of idle passes until it
      # reaches sleeptimelimit.
      if (( nothingtodocount >= ${#scripts[@]} )); then
        if [ "$nothingtodosleeptime" -lt "$sleeptimelimit" ]; then
          # bc is kept here because sleeptime comes from the configuration
          nothingtodosleeptime=$(echo " $nothingtodocount * $sleeptime " | bc)
        fi
        echo "sleeping $nothingtodosleeptime" >> "$jmscriptlog" 2>&1
        sleep "$nothingtodosleeptime"
      fi
      cont >> "$jmscriptlog" 2>&1
    else
      nothingtodocount=0
      nothingtodosleeptime=0
    fi

    echo "sleeping $sleeptime..." >> "$jmscriptlog" 2>&1
    sleep "$sleeptime"

    # get processes in queue
    # (unquoted on purpose: the output is split into words; each job
    # yields "Owner<user>" and "<cmd>Jobstatus<n>")
    q=( $(/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus 2>&1) )
    queuetext="${q[*]}"
    # Inspect the complete output for error markers. A bare $q would only
    # expand to the first word, ${q[0]}.
    if printf '%s\n' "$queuetext" | grep -E -q '(Error|failed)'; then
      # $(date) unquoted on purpose: keeps the single-blank timestamp format
      echo $(date)" WARN condor_q failed" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN condor_q failed"
      echo $(date)" WARN condor_q failed" >> "$jmerrorlog"
      cont >> "$jmscriptlog" 2>&1
    fi
    # get processes of user in queue
    q1=( $(printf '%s\n' "$queuetext" | grep -E -o -- "Owner$user") )
    queued=${#q1[@]}
    # get scripts in queue
    q2=( $(printf '%s\n' "$queuetext" | grep -E -o -- "${scripts[$i]}") )
    queuedscript=${#q2[@]}
    # get running scripts (condor job status 2)
    q3=( $(printf '%s\n' "$queuetext" | grep -E -o -- "${scripts[$i]}Jobstatus2") )
    runningscript=${#q3[@]}
    stillinqueue=$(( queuedscript - runningscript ))

    # get total number of allowed processes for the current hour
    # (%k is blank padded, so there is no leading zero that arithmetic
    # evaluation could mistake for an octal number)
    hour=$(date +%k)
    totalpno=${pnototal[$hour]}
    # choose array according to the day of the week;
    # date +%u counts 1 (Monday) .. 7 (Sunday), so the weekend is 6 and 7
    dayofweek=$(date +%u)
    case "$dayofweek" in
      6 | 7) pnos=( ${pnoswe[@]} ) ;;
      *)     pnos=( ${pnosweek[@]} ) ;;
    esac
    # get number of allowed scripts for current time:
    # pnos holds 24 hourly values per script, one script after the other
    num=$(( i * 24 + hour ))
    pnoscript=${pnos[$num]}
    # if there was nothing to do for previous script, more scripts can be allowed
    if [ "$prev" -eq 0 ]; then
      echo " prev=0 => resetting pnoscript [$pnoscript] to max [$max]" >> "$jmscriptlog" 2>&1
      pnoscript=$max
    fi
    echo " found $queued jobs in the queue (incl. running jobs) [allowed $totalpno]" >> "$jmscriptlog" 2>&1
    echo " found $queuedscript ${scripts[$i]} in the queue (incl. running jobs [$runningscript]) [allowed $pnoscript] - not running: $stillinqueue" >> "$jmscriptlog" 2>&1

    # continue if there are already enough processes or scripts in the queue
    if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]; then
      cont >> "$jmscriptlog" 2>&1
    fi
    # continue if the number of scripts waiting in the queue is larger than
    # (or equal to) the number which still has to be done
    if [ "$numproc" -le "$stillinqueue" ]; then
      echo " numproc($numproc) -le stillinqueue($stillinqueue)" >> "$jmscriptlog" 2>&1
      cont >> "$jmscriptlog" 2>&1
    fi

    # reset prev
    prev=$max

    # submit 1 script to condor
    date=$(date +%Y-%m-%d)
    echo " committing 1 ${scripts[$i]} to condor" >> "$jmscriptlog" 2>&1
    if ! /usr/local/bin/condor_submit -a "path=$scriptspath" -a "prog=${scripts[$i]}" -a "date=$date" -a "dir=$runlogpath" "$scriptspath/run.condor" 2>> "$jmerrorlog"; then
      echo $(date)" WARN condor_submit failed" >> "$jmerrorlog"
      echo "condor is not working -> sleeping $errorsleeptime" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN submitting ${scripts[$i]} to condor failed"
      # back off a little longer after every failed submission, up to
      # sleeptimelimit
      if [ "$errorsleeptime" -lt "$sleeptimelimit" ]; then
        errorsleeptime=$(echo " $errorsleeptime + $errorsleeptimedefault " | bc)
      fi
      sleep "$errorsleeptime"
    else
      errorsleeptime=$errorsleeptimedefault
    fi
    date >> "$jmscriptlog" 2>&1
    echo "" >> "$jmscriptlog" 2>&1
  done
done
173
Note: See TracBrowser for help on using the repository browser.