source: trunk/MagicSoft/Mars/datacenter/scripts/jobmanager @ 9539

Last change on this file since 9539 was 9539, checked in by snruegam, 11 years ago
*** empty log message ***
  • Property svn:executable set to *
File size: 9.2 KB
Line 
1#!/bin/sh
2#
3# ========================================================================
4#
5# *
6# * This file is part of MARS, the MAGIC Analysis and Reconstruction
7# * Software. It is distributed to you in the hope that it can be a useful
8# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
9# * It is distributed WITHOUT ANY WARRANTY.
10# *
11# * Permission to use, copy, modify and distribute this software and its
12# * documentation for any purpose is hereby granted without fee,
13# * provided that the above copyright notice appear in all copies and
14# * that both that copyright notice and this permission notice appear
15# * in supporting documentation. It is provided "as is" without express
16# * or implied warranty.
17# *
18#
19#
20#   Author(s): Daniela Dorner  05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
21#
22#   Copyright: MAGIC Software Development, 2000-2009
23#
24#
25# ========================================================================
26#
27# This is a script which launches other scripts (all scripts that are run
28# on a primary basis).
29#
30
# Load the setup from the file 'sourcefile' located next to this script.
# (presumably it provides printprocesslog and the log file variables used
# below - it is not part of this file, verify there)
# The path is quoted so that a script location containing blanks still works.
source "$(dirname "$0")/sourcefile"
printprocesslog "INFO starting $0"

# noclobber: a plain '>' redirection must not overwrite an existing file
set -C
35
# function to continue in loop and go to next script
#
# Writes a note about the pause to $jmscriptlog, sleeps, and then issues a
# 'continue' that is meant to resume the loop of the caller.
#
# Arguments:
#   $1 - name of the variable the sleep time was taken from; it only shows
#        up in the log line, printed as "$<name>"
#   $2 - number of seconds to sleep (this is also the value that is logged)
# Globals:
#   jmscriptlog (read) - log file the messages are appended to
#
# NOTE(review): according to the bash change log, bash 4.4 and later confine
# 'continue' to loops inside the function, so on those versions the caller
# has to 'continue' itself after calling nextscript.
function nextscript()
{
   echo "$(date +%F\ %T) sleeping \$$1 = $2 seconds... " >> "$jmscriptlog" 2>&1
   sleep "$2"
   echo "" >> "$jmscriptlog" 2>&1
   continue
}
44
# Start a new section in the job manager log. The last line is written
# without a newline; it is completed by the branch of the case statement
# below that matches the configured queueing system.
echo "" >> "$jmscriptlog" 2>&1
echo "" >> "$jmscriptlog" 2>&1
echo -n "$(date +%F\ %T) starting jobmanager for setup $AUTOMATIONSETUP" >> "$jmscriptlog" 2>&1

# choose commands according to queueing system (defined in setup)
# Each branch defines the two aliases used further down in this script:
#   queuesubmit - submits ${scripts[$i]} to the queueing system
#   checkqueue  - lists the queue as "Owner<user> <job>Jobstatus<state>"
# The variables inside the alias texts are not expanded here but each time
# the alias is used, so $date, $runlogpath, $scriptspath, ${scripts[$i]} and
# the node requirements must be set at that point.
# (No 'break' in the branches: a case statement is not a loop.)
case "$queuesys" in
      sge)  echo " on queuing system 'sun grid engine'" >> "$jmscriptlog" 2>&1
            # (-hard) -l hostname=compute-*
            #   for qstat this returns the jobs running on that node + all jobs in the queue
            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
#            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
#            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
            # FIXME: get complete scriptname (including command line option), needed for runstereo
            alias 'checkqueue'="/opt/gridengine/bin/lx26-amd64/qstat \`echo \$noderequirementstat\`  | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '"
            ;;
   condor)  echo " on queuing system 'condor'" >> "$jmscriptlog" 2>&1
            alias 'queuesubmit'='/usr/local/bin/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` -a automationsetup=$AUTOMATIONSETUP `echo $scriptspath`/run.condor'
            alias 'checkqueue'='/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus '
            ;;
        *)  # no known queueing system configured: say so in the log and
            # hand over to finish (defined outside this file; presumably it
            # ends the script - verify there)
            echo " on unknown queuing system '$queuesys'" >> "$jmscriptlog" 2>&1
            finish >> "$jmscriptlog" 2>&1
            ;;
esac
70
# blank line in the job manager log before the output of the loop starts
printf '\n' >> $jmscriptlog 2>&1

# --- state carried through the scheduling loop below ---
# owner of the jobs that are looked up in the queue
user=$(whoami)
# $prev starts at $max; the loop sets it to 0 when a script had nothing to do
prev=$max
# node handling for node-restricted scripts starts at the first node
currentnode=$minnode
numevaluated=0
# back-off bookkeeping for "nothing to do" and for failed submissions
nothingtodocount=0
nothingtodosleeptime=0
errorsleeptime=$errorsleeptimedefault
# endless loop: the loop condition tests $notcount, which stays 0
notcount=0
# Scheduling loop.
# For every entry of the scripts array: re-read the setup, find out how much
# is still to be done, compare the content of the queue with the limits that
# apply to the current hour and day, and submit at most one job.
#
# $notcount is not modified anywhere in the loop, so the condition stays true
# and the loop runs until the process is stopped from outside.
#
# Every call of nextscript is followed by an explicit 'continue'. nextscript
# ends with a 'continue' of its own, but according to the bash change log
# bash 4.4 and later no longer let that act on the loop of the caller; the
# explicit statement makes the script behave the same on old and new versions.
#
# Names such as i, date, column, noderequirementsub and noderequirementstat
# must not be renamed: they are read by the queuesubmit/checkqueue aliases
# and (presumably) by the helpers from sourcefile.
while (( notcount < 100 ))
do
   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
   do
      # re-read the setup on every pass
      # (presumably so that changes are picked up without a restart)
      source "$(dirname "$0")/sourcefile"
      echo "$(date +%F\ %T) Evaluating processing status for script '${scripts[$i]}'" >> "$jmscriptlog" 2>&1

      # check if there's something to do
      # (getstepinfo/getstatus are defined outside this file; $numproc, $db
      # and $table used below are presumably set by them)
      column=${scriptscolname[$i]}
      getstepinfo
      if [ "$noderestricted" = "yes" ]
      then
         # get number of next node: move on once $numrestrictedscripts
         # scripts have been evaluated for the current node
         if [ "$numevaluated" -ge "$numrestrictedscripts" ]
         then
            currentnode=$(( currentnode + 1 ))
            numevaluated=1
         else
            numevaluated=$(( numevaluated + 1 ))
         fi
         # wrap around after the last node
         if [ "$currentnode" -gt "$maxnode" ]
         then
            currentnode=$minnode
         fi
         # check if node is excluded
         # (array left unquoted as in the setup-dependent original)
         for excludednode in ${excludednodes[@]}
         do
            if [ "$currentnode" -eq "$excludednode" ]
            then
               echo "$(date +%F\ %T) Node compute-0-$currentnode is currently excluded." >> "$jmscriptlog" 2>&1
               # go on with the next script, not with the next excluded node
               continue 2
            fi
         done
         # define requirement for submission
         # FIXME: currently only for sge at isdc
         echo "$(date +%F\ %T) Checking for node $currentnode. " >> "$jmscriptlog" 2>&1
         noderequirementsub=" -hard -l hostname=compute-0-${currentnode}"
         noderequirementstat=" -l hostname=compute-0-${currentnode}"
         getstatus $currentnode >> "$jmscriptlog" 2>&1
      else
         noderequirementsub=""
         noderequirementstat=""
         getstatus >> "$jmscriptlog" 2>&1
      fi

      # check number of processes to be done
      echo "$(date +%F\ %T) Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/table/column $db/$table/$column]" >> "$jmscriptlog" 2>&1
      if [ "$numproc" = "0" ]
      then
         prev=0
         nothingtodocount=$(( nothingtodocount + 1 ))
         if [ "$nothingtodocount" -lt "${#scripts[@]}" ]
         then
            # not all scripts have been found idle in a row yet: no pause
            nextscript 0 0
            continue
         else
            # all scripts idle: sleep longer and longer, up to the limit
            if [ "$nothingtodosleeptime" -lt "$sleeptimelimit" ]
            then
               nothingtodosleeptime=$(( nothingtodocount * sleeptime ))
            fi
            nextscript nothingtodosleeptime $nothingtodosleeptime
            continue
         fi
      else
         nothingtodocount=0
         nothingtodosleeptime=0
      fi

      # get processes in queue
      # (backticks kept on purpose so that the checkqueue alias is expanded
      # exactly as before, when the substitution is executed)
      q=(`checkqueue 2>&1 `)
      # look at the complete output, not only at its first word
      if echo "${q[@]}" | grep -Eq '(Error|failed)'
      then
         echo "$(date +%F\ %T) WARN checking query ($queuesys) failed" >> "$jmscriptlog" 2>&1
         printprocesslog "WARN checking query ($queuesys) failed"
         echo "$(date +%F\ %T) WARN checking query ($queuesys) failed" >> "$jmerrorlog"
         nextscript sleeptime $sleeptime
         continue
      fi
      # FIXME: sge cuts scriptname to 8 digits in qstat
      # get processes of user in queue
      q1=( $(echo "${q[@]}" | grep -Eo "Owner$user") )
      queued=${#q1[@]}
      # get scripts in queue
      q2=( $(echo "${q[@]}" | grep -Eo "${scripts[$i]}") )
      queuedscript=${#q2[@]}
      # get running scripts (Jobstatus 2 or r)
      q3=( $(echo "${q[@]}" | grep -Eo "(${scripts[$i]}Jobstatus2|${scripts[$i]}Jobstatusr)") )
      runningscript=${#q3[@]}
      stillinqueue=$(( queuedscript - runningscript ))

      #get total number of allowed process for current time
      # (%k: hour 0..23 without leading zero, usable as array index)
      hour=$(date +%k)
      if [ "${pnototal[$hour]}" -lt "$totalmax" ]
      then
         totalpno=${pnototal[$hour]}
      else
         totalpno=$totalmax
      fi

      #choose array according to the day of the week
      # date +%u counts Monday=1 ... Sunday=7, so the weekend is 6 and 7
      dayofweek=$(date +%u)
      case $dayofweek in
         6 | 7)  pnos=( ${pnoswe[@]} ) ;;
             *)  pnos=( ${pnosweek[@]} ) ;;
      esac
      # get number of allowed scripts for current time:
      # 24 entries (one per hour) for each script, in the order of the scripts
      num=$(( i * 24 + hour ))
      pnoscript=${pnos[$num]}
      # if there was nothing to do for previous script, more scripts can be allowed
      if [ "$prev" -eq 0 ]
      then
         echo "$(date +%F\ %T) \$prev=0 => resetting \$pnoscript from $pnoscript to $max [\$max]" >> "$jmscriptlog" 2>&1
         pnoscript=$max
      fi
      echo "$(date +%F\ %T) queue for user '$user': total: $queued queued jobs [allowed \$totalpno = $totalpno]" >> "$jmscriptlog" 2>&1
      echo "$(date +%F\ %T) queue for user '$user': ${scripts[$i]}: $queuedscript queued, $runningscript running, $stillinqueue idle [allowed \$pnoscript = $pnoscript]" >> "$jmscriptlog" 2>&1

      # continue if there are already enough processes or scripts in the queue
      if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
      then
         nextscript sleeptime $sleeptime
         continue
      fi
      # continue if the number of scripts in the queue is larger than (or equal to) the number which still has to be done
      if [ "$numproc" -le "$stillinqueue" ]
      then
         echo "$(date +%F\ %T) \$numproc ($numproc) <  \$stillinqueue ($stillinqueue) " >> "$jmscriptlog" 2>&1
         nextscript sleeptime $sleeptime
         continue
      fi

      # reset prev
      prev=$max

      # submit 1 script to queuing system
      # ($date is read by the queuesubmit alias for the log file names)
      date=$(date +%Y-%m-%d)
      echo "$(date +%F\ %T) committing 1 ${scripts[$i]} to $queuesys" >> "$jmscriptlog" 2>&1
      if ! queuesubmit 2>> "$jmerrorlog"
      then
         echo $(date)" WARN submitting job ($queuesys) failed" >> "$jmerrorlog"
         echo "$(date +%F\ %T) WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> "$jmscriptlog" 2>&1
         printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
         # back off a bit more after every failed submission, up to the limit
         if [ "$errorsleeptime" -lt "$sleeptimelimit" ]
         then
            errorsleeptime=$(( errorsleeptime + errorsleeptimedefault ))
         fi
         nextscript errorsleeptime $errorsleeptime
         continue
      else
         # submission worked: start again with the default back-off
         errorsleeptime=$errorsleeptimedefault
      fi
      nextscript sleeptime $sleeptime
      continue
   done
done
230
Note: See TracBrowser for help on using the repository browser.