- Timestamp: 08/14/09 17:14:50 (15 years ago)
- Location: trunk/MagicSoft/Mars
- Files: 4 edited
Legend:
- Unmodified
- Added
- Removed
trunk/MagicSoft/Mars/Changelog
r9491 r9492 18 18 19 19 -*-*- END OF LINE -*-*- 20 21 2009/08/14 Daniela Dorner 22 23 * datacenter/scripts/sourcefile: 24 - function getstatus: return 0, if variable $numproc is empty 25 - moved call of getdbsetup to function getstepinfo where possible 26 - enhanced function getstepinfo: implemented retrieving variable 27 NodeRestricted from steps.rc 28 In steps.rc a line 'Table.Column.NodeRestricted: yes' has to be 29 added when a step has to be executed on a certain node (stored 30 in the database in MCRunProcessStatus.fProductionHostKEY) 31 32 * datacenter/scripts/jobmanager: 33 - implemented possibility to send certain jobs to certain nodes 34 of the cluster (currently implemented only for sun grid engine) 35 - fix for grepping the scriptname from the qstat output (needed 36 for scripts with commandline option) 37 - bugfix: pass environment variable AUTOMATIONSETUP to sun grid 38 engine 39 40 * datacenter/scripts/setup.isdc.cta: 41 - added analysis scripts for jobmanager 42 - added variables concerning the cluster: number of nodes and 43 excluded nodes (needed in the jobmanager for the changes 44 above) 45 46 20 47 21 48 2009/08/14 Daniel Hoehne-Moench -
trunk/MagicSoft/Mars/datacenter/scripts/jobmanager
r9489 r9492 50 50 case $queuesys in 51 51 sge) echo " on queuing system 'sun grid engine'" >> $jmscriptlog 2>&1 52 alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` ' 52 # (-hard) -l hostname=compute-* 53 # for qstat this returns the jobs running on that node + all jobs in the queue 54 alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` ' 55 # alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` ' 53 56 # alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge ' 54 alias 'checkqueue'="/opt/gridengine/bin/lx26-amd64/qstat | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '" 57 # FIXME: get complete scriptname (including command line option), needed for runstereo 58 alias 'checkqueue'="/opt/gridengine/bin/lx26-amd64/qstat \`echo \$noderequirementstat\` | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '" 55 59 break 56 60 ;; … … 69 73 prev=$max 70 74 user=`whoami` 75 currentnode=$minnode 76 numevaluated=0 71 77 # endless loop 72 78 notcount=0 … … 83 89 # check if there's something to do 84 90 column=${scriptscolname[$i]} 85 getstatus >> $jmscriptlog 2>&1 91 getstepinfo 92 if [ "$noderestricted" = "yes" ] 93 then 94 # get number of next node 95 if [ $numevaluated -ge $numrestrictedscripts ] 96 then 97 currentnode=`echo $currentnode + 1 | bc -l` 98 numevaluated=1 99 else 100 numevaluated=`echo 
$numevaluated + 1 | bc -l` 101 fi 102 if [ $currentnode -gt $maxnode ] 103 then 104 currentnode=$minnode 105 fi 106 # check if node is excluded 107 for excludednode in ${excludednodes[@]} 108 do 109 if [ $currentnode -eq $excludednode ] 110 then 111 echo `date +%F\ %T`" Node compute-0-$currentnode is currently excluded." >> $jmscriptlog 2>&1 112 continue 2 113 fi 114 done 115 # define requirement for submission 116 # FIXME: currently only for sge at isdc 117 echo `date +%F\ %T`" Checking for node $currentnode. " >> $jmscriptlog 2>&1 118 noderequirementsub=" -hard -l hostname=compute-0-${currentnode}" 119 noderequirementstat=" -l hostname=compute-0-${currentnode}" 120 getstatus $currentnode >> $jmscriptlog 2>&1 121 else 122 noderequirementsub="" 123 noderequirementstat="" 124 getstatus >> $jmscriptlog 2>&1 125 fi 126 127 # check number of processes to be done 86 128 echo `date +%F\ %T`" Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/table/column $db/$table/$column]" >> $jmscriptlog 2>&1 87 129 if [ "$numproc" = "" ] … … 105 147 106 148 # get processes in queue 107 # q=(`/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus 2>&1 `)108 # q=(`/opt/gridengine/bin/lx26-amd64/qstat | awk ' { print "Owner"$4" " $3"Jobstatus"$5 } ' 2>&1 `)109 149 q=(`checkqueue 2>&1 `) 110 150 if echo $q | egrep \(Error\|failed\) … … 115 155 nextscript sleeptime $sleeptime 116 156 fi 157 # FIXME: sge cuts scriptname to 8 digits in qstat 117 158 # get processes of user in queue 118 q1=( `echo ${q[@]} | egrep -o Owner$user`)159 q1=( `echo ${q[@]} | egrep -o "Owner$user"`) 119 160 queued=${#q1[@]} 120 161 # get scripts in queue 121 q2=( `echo ${q[@]} | egrep -o ${scripts[$i]}`)162 q2=( `echo ${q[@]} | egrep -o "${scripts[$i]}"`) 122 163 queuedscript=${#q2[@]} 123 164 # get running scripts 124 q3=( `echo ${q[@]} | egrep -o \( ${scripts[$i]}Jobstatus2\|${scripts[$i]}Jobstatusr\)` )165 q3=( `echo ${q[@]} | egrep -o 
\("${scripts[$i]}"Jobstatus2\|"${scripts[$i]}"Jobstatusr\)` ) 125 166 runningscript=${#q3[@]} 126 167 stillinqueue=`echo $queuedscript - $runningscript | bc ` … … 171 212 date=`date +%Y-%m-%d` 172 213 echo `date +%F\ %T`" committing 1 ${scripts[$i]} to $queuesys" >> $jmscriptlog 2>&1 173 # if ! /usr/local/bin/condor_submit -a path=$scriptspath -a prog=${scripts[$i]} -a date=$date -a dir=$runlogpath $scriptspath/run.condor 2>> $jmerrorlog174 # if ! /opt/gridengine/bin/lx26-amd64/qsub -e $runlogpath/error-$date.log -o $runlogpath/log-$(date).log 2>> $jmerrorlog175 214 if ! queuesubmit 2>> $jmerrorlog 176 215 then -
trunk/MagicSoft/Mars/datacenter/scripts/setup.isdc.cta
r9489 r9492 89 89 sleeptimelimit=360 #360 90 90 errorsleeptimedefault=60 #60 91 max=500 #maximum number of processes 92 max=25 #maximum number of processes for one script in case there are more than one and the others do not have anything to do 91 max=50 #maximum number of processes for one script in case there are more than one and the others do not have anything to do 93 92 totalmax=1600 #maximum number of processes (total) overwrites pnototal(we) in case it is smaller 94 93 … … 106 105 pnototalwe=( 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 ) 107 106 108 pnosimtel=( 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 ) 109 pnosimtelwe=( 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 ) 107 pnosimtel=( 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 ) 108 pnosimtelwe=( 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 ) 109 110 # restricted to nodes => numbers per node 111 pnochimp=( 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ) 112 pnochimpwe=( 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ) 113 pnoctastar=( 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ) 114 pnoctastarwe=( 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ) 115 pnostereob=( 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ) 116 pnostereobwe=( 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ) 117 pnostereoc=( 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ) 118 pnostereocwe=( 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ) 119 pnostereog=( 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ) 120 pnostereogwe=( 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 ) 110 121 111 122 # set variables for jobmanager 112 scripts=( "runsimtel" ) 113 scriptscolname=( "fCorsikaSimTelarray" ) 114 pnosweek=( ${pnosimtel[@]} ) 115 pnoswe=( ${pnosimtelwe[@]} ) 123 #scripts=( "runsimtel" "runchimp" "runctastar" "runstereo 2" "runstereo 3" 
"runstereo 7" ) 124 #scriptscolname=( "fCorsikaSimTelarray" "fChimp" "fCTAStar" "fStereoB" "fStereoC" "fStereoG" ) 125 #pnosweek=( ${pnosimtel[@]} ${pnochimp[@]} ${pnoctastar[@]} ${pnostereob[@]} ${pnostereoc[@]} ${pnostereog[@]} ) 126 #pnoswe=( ${pnosimtelwe[@]} ${pnochimpwe[@]} ${pnoctastarwe[@]} ${pnostereobwe[@]} ${pnostereocwe[@]} ${pnostereogwe[@]} ) 127 scripts=( "runsimtel" "runchimp" "runctastar" ) 128 scriptscolname=( "fCorsikaSimTelarray" "fChimp" "fCTAStar" ) 129 pnosweek=( ${pnosimtel[@]} ${pnochimp[@]} ${pnoctastar[@]} ) 130 pnoswe=( ${pnosimtelwe[@]} ${pnochimpwe[@]} ${pnoctastarwe[@]} ) 116 131 132 # number of script that are limited to a certain node 133 #numrestrictedscripts=5 134 numrestrictedscripts=2 135 136 # set up for nodes 137 # for processes that can run only on certain node 138 # db: fProductionHostKEY 139 minnode=0 140 maxnode=26 141 excludednodes=( 7 14 23 ) -
trunk/MagicSoft/Mars/datacenter/scripts/sourcefile
r9489 r9492 274 274 function getstepinfo() 275 275 { 276 getdbsetup 276 277 table=`grep "$column:" $steps | sed -e "s/[.]$column://" -e 's/#//' -e 's/ //g'` 277 278 coltab=`grep "$column:" $steps | sed -e 's/://' -e 's/#//' -e 's/ //g'` 278 279 needs=`grep "$coltab[.]Needs:" $steps | sed -e "s/$coltab[.]Needs://"` 280 noderestricted=`grep "$coltab[.]NodeRestricted:" $steps | sed -e "s/$coltab[.]NodeRestricted://" -e 's/ //g'` 279 281 influences=`grep "$coltab[.]Influences:" $steps | sed -e "s/$coltab[.]Influences://"` 280 282 prims=( `grep "$table[.]Primary:" $steps | sed -e "s/$table[.]Primary://"` ) … … 282 284 # echo " needs: $needs" 283 285 # echo " influences: $influences" 286 # echo " noderestricted: $noderestricted" 284 287 # echo " prims: ${prims[@]}" 285 288 } … … 290 293 process= 291 294 printprocesslog "INFO getting todo..." 292 getdbsetup293 295 getstepinfo 294 296 # get query … … 340 342 { 341 343 numproc= 342 getdbsetup343 344 getstepinfo 344 345 # get query … … 366 367 continue 367 368 fi 369 if [ "$numproc" = "" ] 370 then 371 numproc=0 372 fi 368 373 } 369 374 … … 377 382 resetstatusvalues 378 383 evalstatus $@ 379 getdbsetup380 384 getstepinfo 381 385 # get query
Note: See TracChangeset for help on using the changeset viewer.