Changeset 9129 for trunk/MagicSoft/Mars/datacenter/scripts/jobmanager
- Timestamp:
- 08/21/08 18:49:35 (16 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/MagicSoft/Mars/datacenter/scripts/jobmanager
r9122 r9129 37 37 echo "starting jobmanager ("`date`")" >> $jmscriptlog 2>&1 38 38 39 if [ "$1" == "" ] 40 then 41 echo "running jobmanager for data" >> $jmscriptlog 2>&1 42 scripts=( ${datascripts[@]} ) 43 scriptscolname=( ${datascriptscolname[@]} ) 44 elif [ "$1" = "mc" ] 45 then 46 echo "running jobmanager for mc" >> $jmscriptlog 2>&1 47 scripts=( ${mcscripts[@]} ) 48 scriptscolname=( ${mcscriptscolname[@]} ) 49 else 50 echo "$1 is awrong commandline option for jobmanager -> exit" >> $jmscriptlog 2>&1 51 printprocesslog "WARN $1 is wrong commandline option for jobmanager" 52 finish >> $jmscriptlog 2>&1 53 fi 39 # decide which jobmanager you want to run 40 # setup of the different jobmanagers (which scripts they start) 41 # the number of jobs are defined in the file setup 42 case $1 in 43 data) echo "running jobmanager for data" >> $jmscriptlog 2>&1 44 scripts=( "runganymed" "runstar" "runcallisto" ) # not used: "dodatacheck" "cutslices" 45 scriptscolname=( "fGanymed" "fStar" "fCallisto" ) # not used: "fDataCheckDone" "fCompmux" 46 pnosweek=( ${pnoganymed[@]} ${pnostar[@]} ${pnocallisto[@]} ) # not used: ${pnodatacheck[@]} ${pnocutslices[@]} 47 pnoswe=( ${pnoganymedwe[@]} ${pnostarwe[@]} ${pnocallistowe[@]} ) # not used: ${pnodatacheckwe[@]} ${pnocutsliceswe[@]} 48 break 49 ;; 50 mc) echo "running jobmanager for mc" >> $jmscriptlog 2>&1 51 scripts=( "runcorsika" "runreflector" "runcamera" ) 52 scriptscolname=( "fCorsikaFileAvail" "fReflectorFileAvail" "fCameraFileAvail" ) 53 pnosweek=( ${pnocorsika[@]} ${pnoreflector[@]} ${pnocamera[@]} ) 54 pnoswe=( ${pnocorsikawe[@]} ${pnoreflectorwe[@]} ${pnocamerawe[@]} ) 55 break 56 ;; 57 *) echo "$1 is a wrong commandline option for jobmanager -> exit" >> $jmscriptlog 2>&1 58 printprocesslog "WARN $1 is wrong commandline option for jobmanager" 59 finish >> $jmscriptlog 2>&1 60 break 61 ;; 62 esac 63 64 # choose commands according to queueing system (defined in setup) 65 case $queuesys in 66 sge) echo "setting commands for 
sun grid engine" >> $jmscriptlog 2>&1 67 alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` ' 68 # alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge ' 69 alias 'checkqueue'="/opt/gridengine/bin/lx26-amd64/qstat | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '" 70 break 71 ;; 72 condor) echo "setting commands for condor" >> $jmscriptlog 2>&1 73 alias 'queuesubmit'='/usr/local/bin/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` `echo $scriptspath`/run.condor' 74 alias 'checkqueue'='/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus ' 75 break 76 ;; 77 *) finish >> $jmscriptlog 2>&1 78 ;; 79 esac 54 80 55 81 prev=$max … … 59 85 nothingtodocount=0 60 86 nothingtodosleeptime=0 87 errorsleeptime=$errorsleeptimedefault 61 88 while (( $notcount < 100 )) 62 89 do … … 96 123 97 124 # get processes in queue 98 q=(`/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus 2>&1 `) 125 # q=(`/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus 2>&1 `) 126 # q=(`/opt/gridengine/bin/lx26-amd64/qstat | awk ' { print "Owner"$4" " $3"Jobstatus"$5 } ' 2>&1 `) 127 q=(`checkqueue 2>&1 `) 99 128 if echo $q | egrep \(Error\|failed\) 100 129 then 101 echo `date`" WARN condor_q failed" >> $jmscriptlog 2>&1 102 printprocesslog "WARN condor_q failed" 103 echo `date`" WARN condor_q failed" >> $jmerrorlog 130 echo `date`" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1 131 printprocesslog "WARN checking query ($queuesys) failed" 132 echo `date`" WARN 
checking query ($queuesys) failed" >> $jmerrorlog 104 133 cont >> $jmscriptlog 2>&1 105 134 fi … … 111 140 queuedscript=${#q2[@]} 112 141 # get running scripts 113 q3=( `echo ${q[@]} | egrep -o ${scripts[$i]}Jobstatus2`) 142 q3=( `echo ${q[@]} | egrep -o \(${scripts[$i]}Jobstatus2\|${scripts[$i]}Jobstatusr\)` ) 114 143 runningscript=${#q3[@]} 115 144 stillinqueue=`echo $queuedscript - $runningscript | bc ` … … 151 180 prev=$max 152 181 153 # submit 1 script to condor 182 # submit 1 script to queuing system 154 183 date=`date +%Y-%m-%d` 155 echo " committing 1 ${scripts[$i]} to condor" >> $jmscriptlog 2>&1 156 if ! /usr/local/bin/condor_submit -a path=$scriptspath -a prog=${scripts[$i]} -a date=$date -a dir=$runlogpath $scriptspath/run.condor 2>> $jmerrorlog 157 then 158 echo `date`" WARN condor_submit failed" >> $jmerrorlog 159 echo "condor is not working -> sleeping $errorsleeptime" >> $jmscriptlog 2>&1 160 printprocesslog "WARN submitting ${scripts[$i]} to condor failed" 184 echo " committing 1 ${scripts[$i]} to $queuesys" >> $jmscriptlog 2>&1 185 # if ! /usr/local/bin/condor_submit -a path=$scriptspath -a prog=${scripts[$i]} -a date=$date -a dir=$runlogpath $scriptspath/run.condor 2>> $jmerrorlog 186 # if ! /opt/gridengine/bin/lx26-amd64/qsub -e $runlogpath/error-$date.log -o $runlogpath/log-$(date).log 2>> $jmerrorlog 187 if ! queuesubmit 2>> $jmerrorlog 188 then 189 echo `date`" WARN submitting job ($queuesys) failed" >> $jmerrorlog 190 echo "$queuesys is not working -> sleeping $errorsleeptime" >> $jmscriptlog 2>&1 191 printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed" 161 192 if [ $errorsleeptime -lt $sleeptimelimit ] 162 193 then
Note:
See TracChangeset
for help on using the changeset viewer.