source: trunk/MagicSoft/Mars/datacenter/scripts/jobmanager@ 8409

Last change on this file since 8409 was 8177, checked in by Daniela Dorner, 18 years ago
*** empty log message ***
  • Property svn:executable set to *
File size: 5.3 KB
Line 
#!/bin/bash
#
# ========================================================================
#
# *
# * This file is part of MARS, the MAGIC Analysis and Reconstruction
# * Software. It is distributed to you in the hope that it can be a useful
# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
# * It is distributed WITHOUT ANY WARRANTY.
# *
# * Permission to use, copy, modify and distribute this software and its
# * documentation for any purpose is hereby granted without fee,
# * provided that the above copyright notice appear in all copies and
# * that both that copyright notice and this permission notice appear
# * in supporting documentation. It is provided "as is" without express
# * or implied warranty.
# *
#
#
# Author(s): Daniela Dorner 05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
#
# Copyright: MAGIC Software Development, 2000-2006
#
#
# ========================================================================
#
# This is a script which launches other scripts (all scripts that are run
# on a primary basis).
#
# It loops forever over the entries of the 'scripts' array and, for each
# one, submits at most one job per pass to condor, provided that there is
# still work to do and that the per-hour limits on queued jobs are not
# reached.
#
# The shebang is bash because arrays, (( )), C-style for loops and 'source'
# are used below; none of these is available in a plain POSIX sh.
#
# The following names are used here but not defined in this file; they are
# expected to be provided by 'sourcefile':
#   functions: printprocesslog, getstatus, cont
#   variables: scripts, max, sleeptime, sleeptimelimit, errorsleeptime,
#              pnototal, pnosweek, pnoswe, jmscriptlog, jmerrorlog,
#              scriptspath, runlogpath, numproc
# NOTE(review): every call to 'cont' below is written as if it skips to the
# next script (i.e. performs a 'continue' of the inner loop) -- confirm
# against its definition.
#

source "$(dirname "$0")/sourcefile"
printprocesslog "INFO starting $0"

# noclobber: a plain '>' may not overwrite an existing file
# (all redirections below append with '>>')
set -C

echo "" >> "$jmscriptlog" 2>&1
echo "starting jobmanager ("$(date)")" >> "$jmscriptlog" 2>&1

# prev = 0 means: there was nothing to do for the previously checked script
prev=$max
user=$(whoami)
# endless loop (notcount is never changed)
notcount=0
nothingtodocount=0
nothingtodosleeptime=0
while (( notcount < 100 ))
do
  for (( i = 0; i < ${#scripts[@]}; i++ ))
  do
    date >> "$jmscriptlog" 2>&1
    # re-read the configuration in every pass
    source "$(dirname "$0")/sourcefile"
    echo "script: ${scripts[$i]}" >> "$jmscriptlog" 2>&1

    # check if there's something to do
    # (getstatus presumably sets numproc -- it is not assigned in this file)
    getstatus >> "$jmscriptlog" 2>&1
    echo " $numproc ${scripts[$i]} still do to" >> "$jmscriptlog" 2>&1
    if [ "$numproc" = "" ]
    then
      prev=0
      nothingtodocount=$(( nothingtodocount + 1 ))
      if [ "$nothingtodocount" -lt "${#scripts[@]}" ]
      then
        cont >> "$jmscriptlog" 2>&1
      else
        # nothing to do for any script: sleep longer with every idle pass
        # until the sleep time has reached sleeptimelimit
        if [ "$nothingtodosleeptime" -lt "$sleeptimelimit" ]
        then
          # bc is kept here in case sleeptime is not an integer
          nothingtodosleeptime=$(echo " $nothingtodocount * $sleeptime " | bc)
        fi
        echo "sleeping $nothingtodosleeptime" >> "$jmscriptlog" 2>&1
        sleep "$nothingtodosleeptime"
        cont >> "$jmscriptlog" 2>&1
      fi
    else
      nothingtodocount=0
      nothingtodosleeptime=0
    fi

    echo "sleeping $sleeptime..." >> "$jmscriptlog" 2>&1
    sleep "$sleeptime"

    # get processes in queue
    # (word splitting is intended: one array element per word of the output)
    q=( $(/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus 2>&1) )
    # Check the complete output for an error message ("$q" alone would be
    # only its first word); the matching text is written to the script log.
    if printf '%s\n' "${q[*]}" | grep -E '(Error|failed)' >> "$jmscriptlog" 2>&1
    then
      echo "WARN condor_q failed" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN condor_q failed"
      echo $(date)"ERROR condor_q failed" >> "$jmerrorlog"
      cont >> "$jmscriptlog" 2>&1
    fi
    # get processes of user in queue
    q1=( $(echo "${q[@]}" | grep -E -o "Owner$user") )
    queued=${#q1[@]}
    # get scripts in queue
    q2=( $(echo "${q[@]}" | grep -E -o "${scripts[$i]}") )
    queuedscript=${#q2[@]}
    # get running scripts (Jobstatus 2)
    q3=( $(echo "${q[@]}" | grep -E -o "${scripts[$i]}Jobstatus2") )
    runningscript=${#q3[@]}
    stillinqueue=$(( queuedscript - runningscript ))

    # get total number of allowed process for current time
    # (%k: hour 0..23 without leading zero, so it is safe in arithmetic)
    hour=$(date +%k)
    totalpno=${pnototal[$hour]}
    # choose array according to the day of the week
    # (%u: 1 = Monday ... 6 = Saturday, 7 = Sunday)
    dayofweek=$(date +%u)
    case $dayofweek in
      6 | 7) pnos=( ${pnoswe[@]} ) ;;
      *) pnos=( ${pnosweek[@]} ) ;;
    esac
    # get number of allowed scripts for current time
    # (the index used is: 24 entries per script, one per hour)
    num=$(( i * 24 + hour ))
    pnoscript=${pnos[$num]}
    # if there was nothing to do for previous script, more scripts can be allowed
    if [ "$prev" -eq 0 ]
    then
      echo " prev=0 => resetting pnoscript [$pnoscript] to max [$max]" >> "$jmscriptlog" 2>&1
      pnoscript=$max
    fi
    echo " found $queued jobs in the queue (incl. running jobs) [allowed $totalpno]" >> "$jmscriptlog" 2>&1
    echo " found $queuedscript ${scripts[$i]} in the queue (incl. running jobs [$runningscript]) [allowed $pnoscript] - not running: $stillinqueue" >> "$jmscriptlog" 2>&1

    # continue if there are already enough processes or scripts in the queue
    if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
    then
      cont >> "$jmscriptlog" 2>&1
    fi
    # continue if the number of scripts in the queue is larger than (or equal
    # to) the number which still has to be done
    if [ "$numproc" -le "$stillinqueue" ]
    then
      echo " numproc($numproc) -le stillinqueue($stillinqueue)" >> "$jmscriptlog" 2>&1
      cont >> "$jmscriptlog" 2>&1
    fi

    # reset prev
    prev=$max

    # submit 1 script to condor
    date=$(date +%Y-%m-%d)
    echo " committing 1 ${scripts[$i]} to condor" >> "$jmscriptlog" 2>&1
    if ! /usr/local/bin/condor_submit -a "path=$scriptspath" -a "prog=${scripts[$i]}" -a "date=$date" -a "dir=$runlogpath" "$scriptspath/run.condor" 2>> "$jmerrorlog"
    then
      echo $(date)"ERROR condor_submit failed" >> "$jmerrorlog"
      echo "condor is not working -> sleeping $errorsleeptime" >> "$jmscriptlog" 2>&1
      printprocesslog "WARN submitting ${scripts[$i]} to condor failed"
      sleep "$errorsleeptime"
    fi
    date >> "$jmscriptlog" 2>&1
    echo "" >> "$jmscriptlog" 2>&1
  done
done
150
Note: See TracBrowser for help on using the repository browser.