Changeset 9492 for trunk


Ignore:
Timestamp:
08/14/09 17:14:50 (15 years ago)
Author:
Daniela Dorner
Message:
*** empty log message ***
Location:
trunk/MagicSoft/Mars
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/MagicSoft/Mars/Changelog

    r9491 r9492  
    1818
    1919                                                 -*-*- END OF LINE -*-*-
     20
     21 2009/08/14 Daniela Dorner
     22
     23   * datacenter/scripts/sourcefile:
     24     - function getstatus: return 0 if variable $numproc is empty
     25     - moved call of getdbsetup to function getstepinfo where possible
     26     - enhanced function getstepinfo: implemented retrieving variable
     27       NodeRestricted from steps.rc
     28       In steps.rc a line 'Table.Column.NodeRestricted: yes' has to be
     29       added when a step has to be executed on a certain node (stored
     30       in the database in MCRunProcessStatus.fProductionHostKEY)
     31
     32   * datacenter/scripts/jobmanager:
     33     - implemented possibility to send certain jobs to certain nodes
     34       of the cluster (currently implemented only for sun grid engine)
     35     - fix for grepping the scriptname from the qstat output (needed
     36       for scripts with command-line options)
     37     - bugfix: pass environment variable AUTOMATIONSETUP to sun grid
     38       engine
     39
     40   * datacenter/scripts/setup.isdc.cta:
     41     - added analysis scripts for jobmanager
     42     - added variables concerning the cluster: number of nodes and
     43       excluded nodes (needed in the jobmanager for the changes
     44       above)
     45
     46
    2047
    2148 2009/08/14 Daniel Hoehne-Moench
  • trunk/MagicSoft/Mars/datacenter/scripts/jobmanager

    r9489 r9492  
    5050case $queuesys in
    5151      sge)  echo " on queuing system 'sun grid engine'" >> $jmscriptlog 2>&1
    52             alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
     52            # (-hard) -l hostname=compute-*
     53            #   for qstat this returns the jobs running on that node + all jobs in the queue
     54            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
     55#            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
    5356#            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
    54             alias 'checkqueue'="/opt/gridengine/bin/lx26-amd64/qstat | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '"
     57            # FIXME: get complete scriptname (including command line option), needed for runstereo
     58            alias 'checkqueue'="/opt/gridengine/bin/lx26-amd64/qstat \`echo \$noderequirementstat\`  | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '"
    5559            break
    5660            ;;
     
    6973prev=$max
    7074user=`whoami`
     75currentnode=$minnode
     76numevaluated=0
    7177# endless loop
    7278notcount=0
     
    8389      # check if there's something to do
    8490      column=${scriptscolname[$i]}
    85       getstatus >> $jmscriptlog 2>&1
     91      getstepinfo
     92      if [ "$noderestricted" = "yes" ]
     93      then
     94         # get number of next node
     95         if [ $numevaluated -ge $numrestrictedscripts ]
     96         then
     97            currentnode=`echo $currentnode + 1 | bc -l`
     98            numevaluated=1
     99         else
     100            numevaluated=`echo $numevaluated + 1 | bc -l`
     101         fi
     102         if [ $currentnode -gt $maxnode ]
     103         then
     104            currentnode=$minnode
     105         fi
     106         # check if node is excluded
     107         for excludednode in ${excludednodes[@]}
     108         do
     109            if [ $currentnode -eq $excludednode ]
     110            then
     111               echo `date +%F\ %T`" Node compute-0-$currentnode is currently excluded." >> $jmscriptlog 2>&1
     112               continue 2
     113            fi
     114         done
     115         # define requirement for submission
     116         # FIXME: currently only for sge at isdc
     117         echo `date +%F\ %T`" Checking for node $currentnode. " >> $jmscriptlog 2>&1
     118         noderequirementsub=" -hard -l hostname=compute-0-${currentnode}"
     119         noderequirementstat=" -l hostname=compute-0-${currentnode}"
     120         getstatus $currentnode >> $jmscriptlog 2>&1
     121      else
     122         noderequirementsub=""
     123         noderequirementstat=""
     124         getstatus >> $jmscriptlog 2>&1
     125      fi
     126     
     127      # check number of processes to be done
    86128      echo `date +%F\ %T`" Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/table/column $db/$table/$column]" >> $jmscriptlog 2>&1
    87129      if [ "$numproc" = "" ]
     
    105147
    106148      # get processes in queue
    107 #      q=(`/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus 2>&1 `)
    108 #      q=(`/opt/gridengine/bin/lx26-amd64/qstat | awk ' { print "Owner"$4" " $3"Jobstatus"$5 } ' 2>&1 `)
    109149      q=(`checkqueue 2>&1 `)
    110150      if echo $q | egrep \(Error\|failed\)
     
    115155         nextscript sleeptime $sleeptime
    116156      fi
     157      # FIXME: sge cuts scriptname to 8 digits in qstat
    117158      # get processes of user in queue
    118       q1=(`echo ${q[@]} | egrep -o Owner$user`)
     159      q1=( `echo ${q[@]} | egrep -o "Owner$user"`)
    119160      queued=${#q1[@]}
    120161      # get scripts in queue
    121       q2=(`echo ${q[@]} | egrep -o ${scripts[$i]}`)
     162      q2=( `echo ${q[@]} | egrep -o "${scripts[$i]}"`)
    122163      queuedscript=${#q2[@]}
    123164      # get running scripts
    124       q3=( `echo ${q[@]} | egrep -o \(${scripts[$i]}Jobstatus2\|${scripts[$i]}Jobstatusr\)` )
     165      q3=( `echo ${q[@]} | egrep -o \("${scripts[$i]}"Jobstatus2\|"${scripts[$i]}"Jobstatusr\)` )
    125166      runningscript=${#q3[@]}
    126167      stillinqueue=`echo $queuedscript - $runningscript | bc `
     
    171212      date=`date +%Y-%m-%d`
    172213      echo `date +%F\ %T`" committing 1 ${scripts[$i]} to $queuesys" >> $jmscriptlog 2>&1
    173 #      if ! /usr/local/bin/condor_submit -a path=$scriptspath -a prog=${scripts[$i]} -a date=$date -a dir=$runlogpath $scriptspath/run.condor 2>> $jmerrorlog
    174 #      if ! /opt/gridengine/bin/lx26-amd64/qsub -e $runlogpath/error-$date.log -o $runlogpath/log-$(date).log 2>> $jmerrorlog
    175214      if ! queuesubmit 2>> $jmerrorlog
    176215      then
  • trunk/MagicSoft/Mars/datacenter/scripts/setup.isdc.cta

    r9489 r9492  
    8989sleeptimelimit=360 #360
    9090errorsleeptimedefault=60 #60
    91 max=500 #maximum number of processes
    92 max=25 #maximum number of processes for one script in case there are more than one and the others do not have anything to do
     91max=50 #maximum number of processes for one script in case there are more than one and the others do not have anything to do
    9392totalmax=1600 #maximum number of processes (total) overwrites pnototal(we) in case it is smaller
    9493
     
    106105pnototalwe=(     1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 1500 )
    107106
    108 pnosimtel=(       65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 )
    109 pnosimtelwe=(     65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 65 )
     107pnosimtel=(       50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 )
     108pnosimtelwe=(     50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 50 )
     109
     110# restricted to nodes => numbers per node
     111pnochimp=(         2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 )
     112pnochimpwe=(       2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 )
     113pnoctastar=(       2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 )
     114pnoctastarwe=(     2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 )
     115pnostereob=(       2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 )
     116pnostereobwe=(     2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 )
     117pnostereoc=(       2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 )
     118pnostereocwe=(     2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 )
     119pnostereog=(       2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 )
     120pnostereogwe=(     2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 )
    110121
    111122# set variables for jobmanager
    112 scripts=( "runsimtel" )
    113 scriptscolname=( "fCorsikaSimTelarray" )
    114 pnosweek=( ${pnosimtel[@]} )
    115 pnoswe=( ${pnosimtelwe[@]} )
     123#scripts=( "runsimtel" "runchimp" "runctastar" "runstereo 2" "runstereo 3" "runstereo 7" )
     124#scriptscolname=( "fCorsikaSimTelarray" "fChimp" "fCTAStar" "fStereoB" "fStereoC" "fStereoG" )
     125#pnosweek=( ${pnosimtel[@]}   ${pnochimp[@]}   ${pnoctastar[@]}   ${pnostereob[@]}   ${pnostereoc[@]}   ${pnostereog[@]} )
     126#pnoswe=( ${pnosimtelwe[@]} ${pnochimpwe[@]} ${pnoctastarwe[@]} ${pnostereobwe[@]} ${pnostereocwe[@]} ${pnostereogwe[@]} )
     127scripts=( "runsimtel" "runchimp" "runctastar" )
     128scriptscolname=( "fCorsikaSimTelarray" "fChimp" "fCTAStar" )
     129pnosweek=( ${pnosimtel[@]}   ${pnochimp[@]}   ${pnoctastar[@]} )
     130pnoswe=( ${pnosimtelwe[@]} ${pnochimpwe[@]} ${pnoctastarwe[@]} )
    116131
     132# number of scripts that are limited to a certain node
     133#numrestrictedscripts=5
     134numrestrictedscripts=2
     135
     136# set up for nodes
     137#  for processes that can run only on certain node
     138#  db: fProductionHostKEY
     139minnode=0
     140maxnode=26
     141excludednodes=( 7 14 23 )
  • trunk/MagicSoft/Mars/datacenter/scripts/sourcefile

    r9489 r9492  
    274274function getstepinfo()
    275275{
     276   getdbsetup
    276277   table=`grep "$column:" $steps | sed -e "s/[.]$column://" -e 's/#//' -e 's/ //g'`
    277278   coltab=`grep "$column:" $steps | sed -e 's/://' -e 's/#//' -e 's/ //g'`
    278279   needs=`grep "$coltab[.]Needs:" $steps | sed -e "s/$coltab[.]Needs://"`
     280   noderestricted=`grep "$coltab[.]NodeRestricted:" $steps | sed -e "s/$coltab[.]NodeRestricted://" -e 's/ //g'`
    279281   influences=`grep "$coltab[.]Influences:" $steps | sed -e "s/$coltab[.]Influences://"`
    280282   prims=( `grep "$table[.]Primary:" $steps | sed -e "s/$table[.]Primary://"` )
     
    282284#   echo " needs: $needs"
    283285#   echo " influences: $influences"
     286#   echo " noderestricted: $noderestricted"
    284287#   echo " prims: ${prims[@]}"
    285288}
     
    290293   process=
    291294   printprocesslog "INFO getting todo..."
    292    getdbsetup
    293295   getstepinfo
    294296   # get query
     
    340342{
    341343   numproc=
    342    getdbsetup
    343344   getstepinfo
    344345   # get query
     
    366367      continue
    367368   fi
     369   if [ "$numproc" = "" ]
     370   then
     371      numproc=0
     372   fi
    368373}
    369374
     
    377382   resetstatusvalues
    378383   evalstatus $@
    379    getdbsetup
    380384   getstepinfo
    381385   # get query
Note: See TracChangeset for help on using the changeset viewer.