source: trunk/MagicSoft/Mars/datacenter/scripts/jobmanager@ 9489

Last change on this file since 9489 was 9489, checked in by Daniela Dorner, 15 years ago
*** empty log message ***
  • Property svn:executable set to *
File size: 7.8 KB
Line 
1#!/bin/sh
2#
3# ========================================================================
4#
5# *
6# * This file is part of MARS, the MAGIC Analysis and Reconstruction
7# * Software. It is distributed to you in the hope that it can be a useful
8# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
9# * It is distributed WITHOUT ANY WARRANTY.
10# *
11# * Permission to use, copy, modify and distribute this software and its
12# * documentation for any purpose is hereby granted without fee,
13# * provided that the above copyright notice appear in all copies and
14# * that both that copyright notice and this permission notice appear
15# * in supporting documentation. It is provided "as is" without express
16# * or implied warranty.
17# *
18#
19#
20# Author(s): Daniela Dorner 05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
21#
22# Copyright: MAGIC Software Development, 2000-2009
23#
24#
25# ========================================================================
26#
27# This is a script which launches other scripts (all scripts that are run
28# on primary basis)
29#
30
# Load the shared setup that lives next to this script.
# NOTE(review): sourcefile presumably defines printprocesslog, finish,
# getstatus and the configuration variables used below - confirm.
# The path is quoted so that an installation directory containing
# whitespace does not break the lookup.
source "$(dirname "$0")/sourcefile"
printprocesslog "INFO starting $0"

# noclobber: a plain '>' redirection must not overwrite an existing file
# (all log output in this script is appended with '>>')
set -C
35
# Log the upcoming pause, sleep, and go on to the next script.
#
# Arguments:
#   $1 - name of the variable the sleep time was taken from (only used to
#        label the log message)
#   $2 - number of seconds to sleep
# Globals:
#   jmscriptlog (read) - log file the messages are appended to
#
# The log message reports $2, i.e. the time that is really slept, and not
# the global $sleeptime, which differs for the "nothing to do" and error
# back-off calls.
#
# The trailing 'continue' is meant to advance the loop of the caller.
# That only works in shells where 'continue' inside a function reaches
# the calling loop; bash 4.4 and later confine it to loops inside the
# function, where it has no effect here.
function nextscript()
{
   echo "$(date +%F\ %T) sleeping \$$1 = $2 seconds... " >> "$jmscriptlog" 2>&1
   sleep "$2"
   echo "" >> "$jmscriptlog" 2>&1
   continue
}
44
# Start a new section in the jobmanager log. The last line is written
# without a newline; it is completed below with the queueing system.
echo "" >> "$jmscriptlog" 2>&1
echo "" >> "$jmscriptlog" 2>&1
echo -n "$(date +%F\ %T) starting jobmanager for setup $AUTOMATIONSETUP" >> "$jmscriptlog" 2>&1

# choose commands according to queueing system (defined in setup)
#
# Two aliases are defined per queueing system:
#   queuesubmit - submits ${scripts[$i]} to the queue
#   checkqueue  - lists the queue as "Owner<user> <job>Jobstatus<status>"
# The queuesubmit aliases are single-quoted on purpose: the `echo ...`
# parts are evaluated when the alias is used, so they pick up the values
# $i, $date, ... have at that moment.
# No 'break' is needed in the arms: ';;' already ends an arm, and 'break'
# is only meaningful inside a loop.
case $queuesys in
   sge)
      echo " on queuing system 'sun grid engine'" >> "$jmscriptlog" 2>&1
      alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
#      alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
      alias 'checkqueue'="/opt/gridengine/bin/lx26-amd64/qstat | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '"
      ;;
   condor)
      echo " on queuing system 'condor'" >> "$jmscriptlog" 2>&1
      alias 'queuesubmit'='/usr/local/bin/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` `echo $scriptspath`/run.condor'
      alias 'checkqueue'='/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus '
      ;;
   *)
      # unknown queueing system
      # NOTE(review): finish is not defined in this file; presumably it
      # logs and terminates the script - confirm.
      echo "" >> "$jmscriptlog" 2>&1
      finish >> "$jmscriptlog" 2>&1
      ;;
esac
66
echo "" >> "$jmscriptlog" 2>&1

# Main scheduling loop.
#
# For every script in ${scripts[@]} the loop
#   1. asks getstatus how many processes are still to be done ($numproc),
#   2. counts the jobs currently in the queue (checkqueue alias),
#   3. compares these numbers with the limits allowed for the current
#      hour and day of the week,
#   4. submits at most one job (queuesubmit alias),
# and sleeps (nextscript) before moving on to the next script.
#
# Every call of nextscript is followed by an explicit 'continue':
# nextscript issues a 'continue' itself, but a 'continue' inside a
# function only reaches this loop in older shells (bash 4.4 and later
# ignore it). Without the explicit statement the rest of the loop body
# would be executed with incomplete data.

# prev=0 marks that there was nothing to do for the previous script
prev=$max
user=$(whoami)
# endless loop (notcount is never modified, the condition always holds)
notcount=0
nothingtodocount=0
nothingtodosleeptime=0
errorsleeptime=$errorsleeptimedefault
while (( $notcount < 100 ))
do
   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
   do
      # the setup is read again in every iteration
      source "$(dirname "$0")/sourcefile"
      echo "$(date +%F\ %T) Evaluating processing status for script '${scripts[$i]}'" >> "$jmscriptlog" 2>&1

      # check if there's something to do
      column=${scriptscolname[$i]}
      getstatus >> "$jmscriptlog" 2>&1
      echo "$(date +%F\ %T) Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/table/column $db/$table/$column]" >> "$jmscriptlog" 2>&1
      if [ "$numproc" = "" ]
      then
         prev=0
         nothingtodocount=$(( nothingtodocount + 1 ))
         if [ $nothingtodocount -lt ${#scripts[@]} ]
         then
            # other scripts may still have work: go on without sleeping
            nextscript 0 0
         else
            # nothing to do for any script: sleep longer and longer,
            # until the limit is reached
            if [ $nothingtodosleeptime -lt $sleeptimelimit ]
            then
               nothingtodosleeptime=$(echo " $nothingtodocount * $sleeptime " | bc)
            fi
            nextscript nothingtodosleeptime $nothingtodosleeptime
         fi
         continue
      else
         nothingtodocount=0
         nothingtodosleeptime=0
      fi

      # get processes in queue
      q=( $(checkqueue 2>&1) )
      # look at the complete output: a plain $q would expand to the
      # first word only
      if echo "${q[@]}" | grep -Eq 'Error|failed'
      then
         echo "$(date +%F\ %T) WARN checking query ($queuesys) failed" >> "$jmscriptlog" 2>&1
         printprocesslog "WARN checking query ($queuesys) failed"
         echo "$(date +%F\ %T) WARN checking query ($queuesys) failed" >> "$jmerrorlog"
         nextscript sleeptime $sleeptime
         continue
      fi
      # get processes of user in queue
      q1=( $(echo "${q[@]}" | grep -Eo "Owner$user") )
      queued=${#q1[@]}
      # get scripts in queue
      q2=( $(echo "${q[@]}" | grep -Eo "${scripts[$i]}") )
      queuedscript=${#q2[@]}
      # get running scripts (job status '2' or 'r')
      q3=( $(echo "${q[@]}" | grep -Eo "(${scripts[$i]}Jobstatus2|${scripts[$i]}Jobstatusr)") )
      runningscript=${#q3[@]}
      stillinqueue=$(( queuedscript - runningscript ))

      # get total number of allowed processes for current time
      # (%k yields the hour without a leading zero, so it is never
      # mistaken for an octal number when used as array index)
      hour=$(date +%k)
      if [ ${pnototal[$hour]} -lt $totalmax ]
      then
         totalpno=${pnototal[$hour]}
      else
         totalpno=$totalmax
      fi

      # choose array according to the day of the week
      # (%u counts 1..7 starting with Monday: 6 = Saturday, 7 = Sunday)
      dayofweek=$(date +%u)
      case $dayofweek in
         6 | 7) pnos=( "${pnoswe[@]}" ) ;;
             *) pnos=( "${pnosweek[@]}" ) ;;
      esac
      # get number of allowed scripts for current time:
      # the array holds 24 hourly values per script, one script after
      # the other, so the index is i * 24 + hour
      num=$(( i * 24 + hour ))
      pnoscript=${pnos[$num]}
      # if there was nothing to do for previous script, more scripts can be allowed
      if [ $prev -eq 0 ]
      then
         echo "$(date +%F\ %T) \$prev=0 => resetting \$pnoscript from $pnoscript to $max [\$max]" >> "$jmscriptlog" 2>&1
         pnoscript=$max
      fi
      echo "$(date +%F\ %T) queue for user '$user': total: $queued queued jobs [allowed \$totalpno = $totalpno]" >> "$jmscriptlog" 2>&1
      echo "$(date +%F\ %T) queue for user '$user': ${scripts[$i]}: $queuedscript queued, $runningscript running, $stillinqueue idle [allowed \$pnoscript = $pnoscript]" >> "$jmscriptlog" 2>&1

      # continue if there are already enough processes or scripts in the queue
      if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
      then
         nextscript sleeptime $sleeptime
         continue
      fi
      # continue if the number of scripts in the queue is larger than (or
      # equal to) the number which still has to be done
      if [ $numproc -le $stillinqueue ]
      then
         echo "$(date +%F\ %T) \$numproc ($numproc) < \$stillinqueue ($stillinqueue) " >> "$jmscriptlog" 2>&1
         nextscript sleeptime $sleeptime
         continue
      fi

      # reset prev
      prev=$max

      # submit 1 script to queuing system
      # ($date is read by the queuesubmit alias)
      date=$(date +%Y-%m-%d)
      echo "$(date +%F\ %T) committing 1 ${scripts[$i]} to $queuesys" >> "$jmscriptlog" 2>&1
      if ! queuesubmit 2>> "$jmerrorlog"
      then
         echo $(date)" WARN submitting job ($queuesys) failed" >> "$jmerrorlog"
         echo "$(date +%F\ %T) WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> "$jmscriptlog" 2>&1
         printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
         # back off a bit more after every failed submission, until the
         # limit is reached
         if [ $errorsleeptime -lt $sleeptimelimit ]
         then
            errorsleeptime=$(echo " $errorsleeptime + $errorsleeptimedefault " | bc)
         fi
         nextscript errorsleeptime $errorsleeptime
         continue
      else
         errorsleeptime=$errorsleeptimedefault
      fi
      nextscript sleeptime $sleeptime
   done
done
191
Note: See TracBrowser for help on using the repository browser.