Ignore:
Timestamp:
10/27/10 17:46:20 (14 years ago)
Author:
Daniela Dorner
Message:
new algorithm
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/Mars/datacenter/scripts/jobmanager

    r10014 r10038  
    3535
    3636# function to continue in loop and go to next script
    37 function nextscript()
     37function sleepawhile()
    3838{
    39    echo `date +%F\ %T`" sleeping \$$1 = $2 seconds... " >> $jmscriptlog 2>&1
    40    sleep $2
     39   usedsleeptime=$sleeptime
     40   case $1 in
     41      "error") if ! [ "$errorsleeptime" = "" ]
     42               then
     43                  if [ $errorsleeptime -lt $sleeptimelimit ]
     44                  then
     45                     errorsleeptime=`echo " $errorsleeptime + $errorsleeptimedefault " | bc`
     46                  fi
     47                  usedsleeptime=$errorsleeptime
     48               fi
     49               ;;
     50         "ok") errorsleeptime=$errorsleeptimedefault
     51               ;;
     52   esac
     53   echo `date +%F\ %T`" sleeping "$usedsleeptime" seconds... (status: "$1")" >> $jmscriptlog 2>&1
    4154   echo "" >> $jmscriptlog 2>&1
     55   sleep $usedsleeptime
    4256   continue
    4357}
     
    5468            # (-hard) -l hostname=compute-*
    5569            #   for qstat this returns the jobs running on that node + all jobs in the queue
    56             alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
     70            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -t 1-`echo $tosubmit` -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
    5771#            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
    5872#            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
     
    6276            ;;
    6377      pbs)  echo " on queuing system 'pbs'" >> $jmscriptlog 2>&1
    64             alias 'queuesubmit'='$pbspath/qsub -l walltime=$walltime -l pmem=$pmem -v AUTOMATIONSETUP=$AUTOMATIONSETUP,SOURCEFILEPATH=$SOURCEFILEPATH -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
     78            alias 'queuesubmit'='$pbspath/qsub -t 1-`echo $tosubmit` -l walltime=`echo $walltime` -l pmem=`echo $pmem` -v AUTOMATIONSETUP=$AUTOMATIONSETUP,SOURCEFILEPATH=$SOURCEFILEPATH,SCRIPTNAME=`echo ${scripts[$i]}` -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
    6579            # check queue (restricted to current user only)
    6680            alias 'checkqueue'="$pbspath/qstat -a -u $user | awk ' { print \"Owner\"\$2\" \" \$4\"Jobstatus\"\$10 } '"
     
    6882            ;;
    6983   condor)  echo " on queuing system 'condor'" >> $jmscriptlog 2>&1
    70             alias 'queuesubmit'='$condorpath/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` -a automationsetup=$AUTOMATIONSETUP `echo $scriptspath`/run.condor'
     84            alias 'queuesubmit'='$condorpath/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` -a num=`echo $tosubmit` -a automationsetup=$AUTOMATIONSETUP `echo $scriptspath`/run.condor'
    7185            alias 'checkqueue'='$condorpath/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus '
    72             break
     86            break 
    7387            ;;
    7488        *)  echo "" >> $jmscriptlog 2>&1
     
    7993echo "" >> $jmscriptlog 2>&1
    8094
    81 prev=$max
     95# for processing with local storage on different nodes
    8296currentnode=$minnode
    8397numevaluated=0
     98
    8499# endless loop
    85100notcount=0
    86 nothingtodocount=0
    87 nothingtodosleeptime=0
    88101errorsleeptime=$errorsleeptimedefault
    89102while (( $notcount < 100 ))
    90103do
     104   # get and set some information for the processing
     105   source `dirname $0`/sourcefile
     106   # reset some values
     107   tosubmit=0
     108   idleratio=0
     109   addtoscript=
     110   
     111   # get processes in queue
     112   q=(`checkqueue 2>&1 `)
     113   if echo $q | egrep \(Error\|failed\)
     114   then
     115      echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1
     116      printprocesslog "WARN checking query ($queuesys) failed"
     117      echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmerrorlog
     118      sleepawhile "error"
     119   fi
     120
     121   # general check whether one should submit something depending on chosen algorithm
     122   # algorithm 1:
     123   #   submit new jobs as long as there are no more than $limitidle idle jobs in the queue
     124   # algorithm 2:
     125   #   submit new jobs in case the total number of jobs in the queue has fallen below $totalpno
     126   case $algorithm in
     127      1) # algorithm 1
     128         # get number of idle jobs in the queue
     129         q5=( `echo ${q[@]} | egrep -o \(Jobstatus1\|Jobstatusq\|JobstatusQ\)` )
     130         idle=${#q5[@]}
     131         if [ $idle -gt $limitidle ]
     132         then
     133            echo `date +%F\ %T`" more than "$limitidle" jobs waiting ("$idle")" >> $jmscriptlog 2>&1
     134            sleepawhile "ok"
     135         fi
     136         ;;
     137      2) # algorithm 2
     138         # get processes of user in queue
     139         q1=( `echo ${q[@]} | egrep -o "Owner$user"`)
     140         queued=${#q1[@]}
     141         hour=`date +%k`
     142         # choose array of total number of jobs to be done
     143         #   according to the day of the week
     144         dayofweek=`date +%u`
     145         case $dayofweek in
     146            0 | 6)  totalpno=${pnototal[$hour]} ;;
     147                *)  totalpno=${pnototalwe[$hour]} ;;
     148         esac
     149         # get total number of jobs to be submitted
     150         if [ $queued -gt $totalpno ]
     151         then
     152            echo `date +%F\ %T`" more than "$totalpno" jobs waiting ("$queued")" >> $jmscriptlog 2>&1
     153            sleepawhile "ok"
     154         else
     155            tosubmittotal=`echo "$totalpno - $queued" | bc -l`
     156         fi
     157         ;;
     158      *) echo "Please give an algorithm to calculate the number of allowed jobs."
     159         exit
     160         ;;
     161   esac
     162   echo `date +%F\ %T`" Total number of jobs to be submitted: "$tosubmittotal >> $jmscriptlog 2>&1
     163
     164
     165   # first loop to determine
     166   # a) how many jobs of this script have to be done
     167   # b) how many jobs of this script are running or queued
    91168   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
    92169   do
    93       source `dirname $0`/sourcefile
    94       echo `date +%F\ %T`" Evaluating processing status for script '${scripts[$i]}'" >> $jmscriptlog 2>&1
    95 
    96       # check if there's something to do
     170      # set the step to be evaluated
    97171      step=${scriptscolname[$i]}
    98172      getstepinfo
    99       # check if walltime has to be set
    100       if [ "$setwalltime" = "yes" ]
    101       then
    102          walltime=${walltimes[$i]}
    103       fi
    104       # check if memory has to be set
    105       if [ "$setpmem" = "yes" ]
    106       then
    107          pmem=${pmems[$i]}
    108       fi
     173
    109174      # check if the script is restricted to one node
    110175      #   (i.e. where output of previous step(s) is stored)
    111       # this information is taken from the steps.rc file
     176      #   this information is taken from the steps.rc file
     177      #   currently this is implemented for sge only
     178      # then get number of jobs to be done
    112179      if [ "$noderestricted" = "yes" ]
    113180      then
     
    115182         if [ $numevaluated -ge $numrestrictedscripts ]
    116183         then
    117             currentnode=`echo $currentnode + 1 | bc -l`
     184            currentnode=`echo " $currentnode + 1 " | bc -l`
    118185            numevaluated=1
    119186         else
    120             numevaluated=`echo $numevaluated + 1 | bc -l`
     187            numevaluated=`echo " $numevaluated + 1 " | bc -l`
    121188         fi
    122189         if [ $currentnode -gt $maxnode ]
     
    138205         noderequirementsub=" -hard -l hostname=compute-0-${currentnode}"
    139206         noderequirementstat=" -l hostname=compute-0-${currentnode}"
     207         # get number of jobs to be done from the DB
    140208         getstatus $currentnode >> $jmscriptlog 2>&1
    141209      else
    142210         noderequirementsub=""
    143211         noderequirementstat=""
     212         # get number of jobs to be done from the DB
    144213         getstatus >> $jmscriptlog 2>&1
    145214      fi
     215      # store the number of processes to be done for this script
     216      todo[$i]=$numproc
     217      tododb[$i]=$numproc
    146218     
    147       # check number of processes to be done
    148       echo `date +%F\ %T`" Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/step $db/$step]" >> $jmscriptlog 2>&1
    149       if [ "$numproc" = "0" ]
     219      # FIXME: sge cuts scriptname to 8 characters in qstat
     220      # number of idle jobs, i.e. jobs waiting in the queue to run
     221      #   condor: 1
     222      #   sge: q
     223      #   pbs: Q
     224      q4=( `echo ${q[@]} | egrep -o \("${scripts[$i]}"Jobstatus1\|"${scripts[$i]}"Jobstatusq\|"${scripts[$i]}"JobstatusQ\)` )
     225      idlescript[$i]=${#q4[@]}
     226
     227      q2=( `echo ${q[@]} | egrep -o "${scripts[$i]}"`)
     228      queuedscript[$i]=${#q2[@]}
     229
     230      stillfree[$i]=`echo "${maxjobs[$i]} - ${queuedscript[$i]} " | bc -l`
     231
     232      if [ $numproc -eq 0 ] || [ ${todo[$i]} -le ${idlescript[$i]} ] || [ ${maxjobs[$i]} -le ${queuedscript[$i]} ]
    150233      then
    151          prev=0
    152          nothingtodocount=`expr $nothingtodocount + 1`
    153          if [ $nothingtodocount -lt ${#scripts[@]} ]
    154          then
    155             nextscript 0 0
    156          else
    157             if [ $nothingtodosleeptime -lt $sleeptimelimit ]
     234         # store the fraction of cpus to add it to another process
     235         idleratio=`echo " ${ratio[$i]} + $idleratio " | bc -l`
     236         ratio[$i]=0
     237         todo[$i]=0
     238         idlenum=$i
     239         continue
     240      fi
     241   done
     242   echo `date +%F\ %T`" Evaluated scripts: "${scripts[@]} >> $jmscriptlog 2>&1
     243   echo `date +%F\ %T`" Running scripts: "${queuedscript[@]}" (max: "${maxjobs[@]}")" >> $jmscriptlog 2>&1
     244   echo `date +%F\ %T`" Number of jobs to be done (from DB): "${tododb[@]} >> $jmscriptlog 2>&1
     245   echo `date +%F\ %T`" Number of jobs to be done (updated): "${todo[@]} >> $jmscriptlog 2>&1
     246   echo `date +%F\ %T`" Ratio: "${ratio[@]}" (idle: "$idleratio")" >> $jmscriptlog 2>&1
     247   
     248   # loop to update the ratio taking into account the ratio of
     249   #   a) steps where nothing has to be done
     250   #   b) steps where already enough jobs are in the queue
     251   # sum up this idle ratio
     252   # determine the step for which the most jobs still have to be done
     253   if ! [ "$idleratio" = "0" ]
     254   then
     255      addtoscript=
     256      for (( i=0 ; i < ${#scripts[@]} ; i++ ))
     257      do
     258         if [ ${todo[$i]} -gt ${todo[$idlenum]} ] && [ ${todo[$i]} -gt 0 ]
     259         then
     260            if ! [ "$addtoscript" = "" ]
    158261            then
    159                nothingtodosleeptime=`echo " $nothingtodocount * $sleeptime " | bc`
     262               if [ ${todo[$i]} -lt ${todo[$addtoscript]} ]
     263               then
     264                  continue
     265               fi
    160266            fi
    161             nextscript nothingtodosleeptime $nothingtodosleeptime
    162          fi
     267            addtoscript=$i
     268         fi
     269      done
     270     
     271      # continue in case nothing has to be done for all steps
     272      # else: update the ratio for the step where most jobs have to be done
     273      #   by adding the idle ratio
     274      if [ "$addtoscript" = "" ]
     275      then
     276         echo `date +%F\ %T`" No jobs to be done for any step." >> $jmscriptlog 2>&1
     277         sleepawhile "ok"
    163278      else
    164          nothingtodocount=0
    165          nothingtodosleeptime=0
    166       fi
    167 
    168       # get processes in queue
    169       q=(`checkqueue 2>&1 `)
    170       if echo $q | egrep \(Error\|failed\)
    171       then
    172          echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1
    173          printprocesslog "WARN checking query ($queuesys) failed"
    174          echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmerrorlog
    175          nextscript sleeptime $sleeptime
    176       fi
    177       # FIXME: sge cuts scriptname to 8 digits in qstat
    178       # get processes of user in queue
    179       q1=( `echo ${q[@]} | egrep -o "Owner$user"`)
    180       queued=${#q1[@]}
    181       # get number of scripts in queue
    182       q2=( `echo ${q[@]} | egrep -o "${scripts[$i]}"`)
    183       queuedscript=${#q2[@]}
    184       # get running scripts
    185       #   condor: 2
    186       #   sge: r
    187       #   pbs: R
    188       q3=( `echo ${q[@]} | egrep -o \("${scripts[$i]}"Jobstatus2\|"${scripts[$i]}"Jobstatusr\|"${scripts[$i]}"JobstatusR\)` )
    189       runningscript=${#q3[@]}
    190       stillinqueue=`echo $queuedscript - $runningscript | bc `
    191 
    192       #get total number of allowed process for current time
    193       hour=`date +%k`
    194       if [ ${pnototal[$hour]} -lt $totalmax ]
    195       then
    196          totalpno=${pnototal[$hour]}
    197       else
    198          totalpno=$totalmax
     279         ratio[$addtoscript]=`echo " ${ratio[$addtoscript]} + $idleratio " | bc -l`
     280      fi
     281   fi
     282   echo `date +%F\ %T`" Updated ratio: "${ratio[@]} >> $jmscriptlog 2>&1
     283   
     284   
     285   # loop to submit jobs to queueing system
     286   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
     287   do
     288      # calculate number of jobs to be submitted
     289      tosubmit=`echo "scale=0; $tosubmittotal * ${ratio[$i]} / 1 " | bc -l`
     290      if [ ${todo[$i]} -lt $tosubmit ]
     291      then
     292         echo `date +%F\ %T`" Updating tosubmit for "${scripts[$i]}" from "$tosubmit" to "${todo[$i]} >> $jmscriptlog 2>&1
     293         tosubmit=${todo[$i]}
     294      fi
     295      if [ $tosubmit -eq 0 ]
     296      then
     297         echo `date +%F\ %T`" No jobs to be submitted for script '"${scripts[$i]}"'" >> $jmscriptlog 2>&1
     298         continue
     299      fi
     300      if [ $tosubmit -gt ${stillfree[$i]} ]
     301      then
     302         echo `date +%F\ %T`" Updating tosubmit for "${scripts[$i]}" from "$tosubmit" to "${stillfree[$i]} >> $jmscriptlog 2>&1
     303         tosubmit=${stillfree[$i]}
    199304      fi
    200305     
    201       #choose array according to the day of the week
    202       dayofweek=`date +%u`
    203       case $dayofweek in
    204          0 | 6)  pnos=( ${pnoswe[@]} ) ;;
    205              *)  pnos=( ${pnosweek[@]} ) ;;
    206       esac
    207       # get number of allowed scripts for current time
    208       num=`echo "((( $i + 1 ) * 24 ) + ( $hour + 1 ) ) - 24 - 1 " | bc `
    209       pnoscript=${pnos[$num]}
    210       # if there was nothing to do for previous script, more scripts can be allowed
    211       if [ $prev -eq 0 ]
    212       then
    213          echo `date +%F\ %T`" \$prev=0 => resetting \$pnoscript from $pnoscript to $max [\$max]" >> $jmscriptlog 2>&1
    214          pnoscript=$max
    215       fi
    216       echo `date +%F\ %T`" queue for user '$user': total: $queued queued jobs [allowed \$totalpno = $totalpno]" >> $jmscriptlog 2>&1
    217       echo `date +%F\ %T`" queue for user '$user': ${scripts[$i]}: $queuedscript queued, $runningscript running, $stillinqueue idle [allowed \$pnoscript = $pnoscript]" >> $jmscriptlog 2>&1
    218      
    219       # continue if there are already enough processes or scripts in the queue
    220       if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
    221       then
    222          nextscript sleeptime $sleeptime
    223       fi
    224       # continue if the number of scripts in the queue is greater than or equal to the number that still has to be done
    225       if [ $numproc -le $stillinqueue ]
    226       then
    227          echo `date +%F\ %T`" \$numproc ($numproc) <  \$stillinqueue ($stillinqueue) " >> $jmscriptlog 2>&1
    228          nextscript sleeptime $sleeptime
    229       fi
    230      
    231       # reset prev
    232       prev=$max
    233 
    234       # submit 1 script to queuing system
     306      # set the step to be evaluated
     307      step=${scriptscolname[$i]}
     308      # check if walltime has to be set
     309      if [ "$setwalltime" = "yes" ]
     310      then
     311         walltime=${walltimes[$i]}
     312      fi
     313      # check if memory has to be set
     314      if [ "$setpmem" = "yes" ]
     315      then
     316         pmem=${pmems[$i]}
     317      fi
     318
     319      # submit $tosubmit scripts to queuing system
     320      echo `date +%F\ %T`" Submitting "$tosubmit" jobs for script '"${scripts[$i]}"' to "$queuesys >> $jmscriptlog 2>&1
    235321      date=`date +%Y-%m-%d`
    236       echo `date +%F\ %T`" committing 1 ${scripts[$i]} to $queuesys" >> $jmscriptlog 2>&1
    237322      if ! queuesubmit 2>> $jmerrorlog
    238323      then
     
    240325         echo `date +%F\ %T`" WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> $jmscriptlog 2>&1
    241326         printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
    242          if [ $errorsleeptime -lt $sleeptimelimit ]
    243          then
    244             errorsleeptime=`echo " $errorsleeptime + $errorsleeptimedefault " | bc`
    245          fi
    246          nextscript errorsleeptime $errorsleeptime
    247       else
    248          errorsleeptime=$errorsleeptimedefault
    249       fi
    250       nextscript sleeptime $sleeptime
     327         sleepawhile "error"
     328      fi
    251329   done
     330   sleepawhile "ok"
    252331done
    253332
Note: See TracChangeset for help on using the changeset viewer.