Changeset 9488 for trunk/MagicSoft


Ignore:
Timestamp:
08/12/09 18:56:02 (15 years ago)
Author:
Daniela Dorner
Message:
*** empty log message ***
Location:
trunk/MagicSoft/Mars
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/MagicSoft/Mars/Changelog

    r9487 r9488  
    1818
    1919                                                 -*-*- END OF LINE -*-*-
     20
     21 2009/08/12 Daniela Dorner
     22
     23   * datacenter/scripts/jobmanager:
     24     - improved logging (new layout and more information)
     25     - moved sleep to end of loop (function nextscript())
     26
     27
    2028
    2129 2009/08/12 Daniel Hoehne-Moench
  • trunk/MagicSoft/Mars/datacenter/scripts/jobmanager

    r9474 r9488  
    3434set -C
    3535
     36# function to continue in loop and go to next script
     37function nextscript()
     38{
     39   echo `date +%F\ %T`" sleeping \$$1 = $sleeptime seconds... " >> $jmscriptlog 2>&1
     40   sleep $2
     41   echo "" >> $jmscriptlog 2>&1
     42   continue
     43}
     44
    3645echo "" >> $jmscriptlog 2>&1
    37 echo "starting jobmanager ("`date`")" >> $jmscriptlog 2>&1
     46echo "" >> $jmscriptlog 2>&1
     47echo -n `date +%F\ %T`" starting jobmanager for setup " >> $jmscriptlog 2>&1
    3848
    3949# decide which jobmanager you want to run
     
    4151#  the number of jobs is defined in the file setup
    4252case $1 in
    43    data) echo "running jobmanager for data" >> $jmscriptlog 2>&1
     53   data) echo -n "'data'" >> $jmscriptlog 2>&1
    4454         scripts=( "runganymed" "runstar" "runcallisto" ) # not used: "dodatacheck" "cutslices"
    4555         scriptscolname=( "fGanymed" "fStar" "fCallisto" ) # not used: "fDataCheckDone" "fCompmux"
     
    4858         break
    4959         ;;
    50      mc) echo "running jobmanager for mc" >> $jmscriptlog 2>&1
     60     mc) echo -n "'mc'" >> $jmscriptlog 2>&1
    5161         scripts=( "runcorsika" "runreflector" "runcamera" )
    5262         scriptscolname=( "fCorsikaFileAvail" "fReflectorFileAvail" "fCameraFileAvail" )
     
    5565         break
    5666         ;;
    57   ctamc) echo "running jobmanager for cta mc" >> $jmscriptlog 2>&1
     67  ctamc) echo -n "'cta mc'" >> $jmscriptlog 2>&1
    5868         scripts=( "runsimtel" )
    5969         scriptscolname=( "fCorsikaSimTelarray" )
     
    6272         break
    6373         ;;
    64       *) echo "'$1' is a wrong commandline option for jobmanager -> exit" >> $jmscriptlog 2>&1
     74      *) echo ""
     75         echo "'$1' is a wrong commandline option for jobmanager -> exit" >> $jmscriptlog 2>&1
    6576         echo "'$1' is a wrong commandline option for jobmanager -> exit"
    6677         printprocesslog "WARN '$1' is a wrong commandline option for jobmanager"
     
    7283# choose commands according to queueing system (defined in setup)
    7384case $queuesys in
    74       sge)  echo "setting commands for sun grid engine" >> $jmscriptlog 2>&1
     85      sge)  echo " on queuing system 'sun grid engine'" >> $jmscriptlog 2>&1
    7586            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
    7687#            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
     
    7889            break
    7990            ;;
    80    condor)  echo "setting commands for condor" >> $jmscriptlog 2>&1
     91   condor)  echo " on queuing system 'condor'" >> $jmscriptlog 2>&1
    8192            alias 'queuesubmit'='/usr/local/bin/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` `echo $scriptspath`/run.condor'
    8293            alias 'checkqueue'='/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus '
    8394            break
    8495            ;;
    85         *)  finish >> $jmscriptlog 2>&1
     96        *)  echo "" >> $jmscriptlog 2>&1
     97            finish >> $jmscriptlog 2>&1
    8698            ;;
    8799esac
     100
     101echo "" >> $jmscriptlog 2>&1
    88102
    89103prev=$max
     
    98112   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
    99113   do
    100       date  >> $jmscriptlog 2>&1
    101114      source `dirname $0`/sourcefile
    102       echo "script: ${scripts[$i]}" >> $jmscriptlog 2>&1
     115      echo `date +%F\ %T`" Evaluating processing status for script '${scripts[$i]}'" >> $jmscriptlog 2>&1
    103116
    104117      # check if there's something to do
    105118      column=${scriptscolname[$i]}
    106119      getstatus >> $jmscriptlog 2>&1
    107       echo " $numproc ${scripts[$i]} still to do" >> $jmscriptlog 2>&1
     120      echo `date +%F\ %T`" Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/table/column $db/$table/$column]" >> $jmscriptlog 2>&1
    108121      if [ "$numproc" = "" ]
    109122      then
     
    112125         if [ $nothingtodocount -lt ${#scripts[@]} ]
    113126         then
    114             cont >> $jmscriptlog 2>&1
     127            nextscript 0 0
    115128         else
    116129            if [ $nothingtodosleeptime -lt $sleeptimelimit ]
     
    118131               nothingtodosleeptime=`echo " $nothingtodocount * $sleeptime " | bc`
    119132            fi
    120             echo "sleeping $nothingtodosleeptime" >> $jmscriptlog 2>&1
    121             sleep $nothingtodosleeptime
    122             cont >> $jmscriptlog 2>&1
     133            nextscript nothingtodosleeptime $nothingtodosleeptime
    123134         fi
    124135      else
     
    126137         nothingtodosleeptime=0
    127138      fi
    128 
    129       echo "sleeping $sleeptime..." >> $jmscriptlog 2>&1
    130       sleep $sleeptime
    131139
    132140      # get processes in queue
     
    136144      if echo $q | egrep \(Error\|failed\)
    137145      then
    138          echo `date`" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1
     146         echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1
    139147         printprocesslog "WARN checking query ($queuesys) failed"
    140          echo `date`" WARN checking query ($queuesys) failed" >> $jmerrorlog
    141          cont >> $jmscriptlog 2>&1
     148         echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmerrorlog
     149         nextscript sleeptime $sleeptime
    142150      fi
    143151      # get processes of user in queue
     
    154162      #get total number of allowed processes for current time
    155163      hour=`date +%k`
    156       #totalpno=${pnototal[$hour]}
    157164      if [ ${pnototal[$hour]} -lt $totalmax ]
    158165      then
     
    174181      if [ $prev -eq 0 ]
    175182      then
    176          echo " prev=0 => resetting pnoscript [$pnoscript] to max [$max]" >> $jmscriptlog 2>&1
     183         echo `date +%F\ %T`" \$prev=0 => resetting \$pnoscript from $pnoscript to $max [\$max]" >> $jmscriptlog 2>&1
    177184         pnoscript=$max
    178185      fi
    179       echo " found $queued jobs in the queue (incl. running jobs) [allowed $totalpno]" >> $jmscriptlog 2>&1
    180       echo " found $queuedscript ${scripts[$i]} in the queue (incl. running jobs [$runningscript]) [allowed $pnoscript] - not running: $stillinqueue" >> $jmscriptlog 2>&1
     186      echo `date +%F\ %T`" queue for user '$user': total: $queued queued jobs [allowed \$totalpno = $totalpno]" >> $jmscriptlog 2>&1
     187      echo `date +%F\ %T`" queue for user '$user': ${scripts[$i]}: $queuedscript queued, $runningscript running, $stillinqueue idle [allowed \$pnoscript = $pnoscript]" >> $jmscriptlog 2>&1
    181188     
    182189      # continue if there are already enough processes or scripts in the queue
    183190      if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
    184191      then
    185          cont >> $jmscriptlog 2>&1
     192         nextscript sleeptime $sleeptime
    186193      fi
    187194      # continue if the number of scripts in the queue is larger than (or equal to) the number which still has to be done
    188195      if [ $numproc -le $stillinqueue ]
    189196      then
    190          echo " numproc($numproc) -le stillinqueue($stillinqueue)" >> $jmscriptlog 2>&1
    191          cont >> $jmscriptlog 2>&1
     197         echo `date +%F\ %T`" \$numproc ($numproc) <  \$stillinqueue ($stillinqueue) " >> $jmscriptlog 2>&1
     198         nextscript sleeptime $sleeptime
    192199      fi
    193200     
     
    197204      # submit 1 script to queuing system
    198205      date=`date +%Y-%m-%d`
    199       echo " committing 1 ${scripts[$i]} to $queuesys" >> $jmscriptlog 2>&1
     206      echo `date +%F\ %T`" committing 1 ${scripts[$i]} to $queuesys" >> $jmscriptlog 2>&1
    200207#      if ! /usr/local/bin/condor_submit -a path=$scriptspath -a prog=${scripts[$i]} -a date=$date -a dir=$runlogpath $scriptspath/run.condor 2>> $jmerrorlog
    201208#      if ! /opt/gridengine/bin/lx26-amd64/qsub -e $runlogpath/error-$date.log -o $runlogpath/log-$(date).log 2>> $jmerrorlog
     
    203210      then
    204211         echo `date`" WARN submitting job ($queuesys) failed" >> $jmerrorlog
    205          echo "$queuesys is not working -> sleeping $errorsleeptime" >> $jmscriptlog 2>&1
     212         echo `date +%F\ %T`" WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> $jmscriptlog 2>&1
    206213         printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
    207214         if [ $errorsleeptime -lt $sleeptimelimit ]
     
    209216            errorsleeptime=`echo " $errorsleeptime + $errorsleeptimedefault " | bc`
    210217         fi
    211          sleep $errorsleeptime
     218         nextscript errorsleeptime $errorsleeptime
    212219      else
    213220         errorsleeptime=$errorsleeptimedefault
    214221      fi
    215       date >> $jmscriptlog 2>&1
    216       echo "" >> $jmscriptlog 2>&1
     222      nextscript sleeptime $sleeptime
    217223   done
    218224done
Note: See TracChangeset for help on using the changeset viewer.