#!/bin/sh
#
# ========================================================================
#
# *
# * This file is part of MARS, the MAGIC Analysis and Reconstruction
# * Software. It is distributed to you in the hope that it can be a useful
# * and timesaving tool in analysing Data of imaging Cerenkov telescopes.
# * It is distributed WITHOUT ANY WARRANTY.
# *
# * Permission to use, copy, modify and distribute this software and its
# * documentation for any purpose is hereby granted without fee,
# * provided that the above copyright notice appear in all copies and
# * that both that copyright notice and this permission notice appear
# * in supporting documentation. It is provided "as is" without express
# * or implied warranty.
# *
#
#
#   Author(s): Daniela Dorner  05/2006 <mailto:dorner@astro.uni-wuerzburg.de>
#
#   Copyright: MAGIC Software Development, 2000-2009
#
#
# ========================================================================
#
# This is a script which launches other scripts (all scripts that are run
# on a primary basis)
#

source `dirname $0`/sourcefile
printprocesslog "INFO starting $0"

set -C

# function to continue in loop and go to next script
function nextscript()
{
   echo `date +%F\ %T`" sleeping \$$1 = $sleeptime seconds... " >> $jmscriptlog 2>&1
   sleep $2
   echo "" >> $jmscriptlog 2>&1
   continue
}

echo "" >> $jmscriptlog 2>&1
echo "" >> $jmscriptlog 2>&1
echo -n `date +%F\ %T`" starting jobmanager for setup "$AUTOMATIONSETUP >> $jmscriptlog 2>&1

# choose commands according to queueing system (defined in setup)
case $queuesys in
      sge)  echo " on queuing system 'sun grid engine'" >> $jmscriptlog 2>&1
            # (-hard) -l hostname=compute-*
            #   for qstat this returns the jobs running on that node + all jobs in the queue
            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $noderequirementsub` `echo $scriptspath`/`echo ${scripts[$i]}` '
#            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -b y -v AUTOMATIONSETUP=$AUTOMATIONSETUP -e `echo $runlogpath`/error-`echo $date`.log -o `echo $runlogpath`/log-`echo $date`.log `echo $scriptspath`/`echo ${scripts[$i]}` '
#            alias 'queuesubmit'='/opt/gridengine/bin/lx26-amd64/qsub -sc runlogpath=`echo $runlogpath` -sc date=`echo $date` -sc scriptspath=`echo $scriptspath` -sc script=`echo ${scripts[$i]}` `echo $scriptspath`/job.sge '
            # FIXME: get complete scriptname (including command line option), needed for runstereo
            alias 'checkqueue'="/opt/gridengine/bin/lx26-amd64/qstat \`echo \$noderequirementstat\`  | awk ' { print \"Owner\"\$4\" \" \$3\"Jobstatus\"\$5 } '"
            break
            ;;
   condor)  echo " on queuing system 'condor'" >> $jmscriptlog 2>&1
            alias 'queuesubmit'='/usr/local/bin/condor_submit -a path=`echo $scriptspath` -a prog=`echo ${scripts[$i]}` -a date=`echo $date` -a dir=`echo $runlogpath` -a automationsetup=$AUTOMATIONSETUP `echo $scriptspath`/run.condor'
            alias 'checkqueue'='/usr/local/bin/condor_q -global -format "Owner%s " Owner -format "%s" CMD -format "Jobstatus%s\n" Jobstatus '
            break
            ;;
        *)  echo "" >> $jmscriptlog 2>&1
            finish >> $jmscriptlog 2>&1
            ;;
esac

echo "" >> $jmscriptlog 2>&1

prev=$max
user=`whoami`
currentnode=$minnode
numevaluated=0
# endless loop
notcount=0
nothingtodocount=0
nothingtodosleeptime=0
errorsleeptime=$errorsleeptimedefault
while (( $notcount < 100 ))
do
   for (( i=0 ; i < ${#scripts[@]} ; i++ ))
   do 
      source `dirname $0`/sourcefile
      echo `date +%F\ %T`" Evaluating processing status for script '${scripts[$i]}'" >> $jmscriptlog 2>&1

      # check if there's something to do
      column=${scriptscolname[$i]}
      getstepinfo
      if [ "$noderestricted" = "yes" ]
      then
         # get number of next node 
         if [ $numevaluated -ge $numrestrictedscripts ]
         then 
            currentnode=`echo $currentnode + 1 | bc -l`
            numevaluated=1
         else
            numevaluated=`echo $numevaluated + 1 | bc -l`
         fi
         if [ $currentnode -gt $maxnode ]
         then 
            currentnode=$minnode
         fi
         # check if node is excluded
         for excludednode in ${excludednodes[@]}
         do
            if [ $currentnode -eq $excludednode ]
            then
               echo `date +%F\ %T`" Node compute-0-$currentnode is currently excluded." >> $jmscriptlog 2>&1
               continue 2
            fi
         done
         # define requirement for submission 
         # FIXME: currently only for sge at isdc
         echo `date +%F\ %T`" Checking for node $currentnode. " >> $jmscriptlog 2>&1
         noderequirementsub=" -hard -l hostname=compute-0-${currentnode}"
         noderequirementstat=" -l hostname=compute-0-${currentnode}"
         getstatus $currentnode >> $jmscriptlog 2>&1
      else
         noderequirementsub=""
         noderequirementstat=""
         getstatus >> $jmscriptlog 2>&1
      fi
      
      # check number of processes to be done
      echo `date +%F\ %T`" Database: $numproc ${scripts[$i]} still to be done (incl. idle jobs) [DB/table/column $db/$table/$column]" >> $jmscriptlog 2>&1
      if [ "$numproc" = "" ]
      then 
         prev=0
         nothingtodocount=`expr $nothingtodocount + 1`
         if [ $nothingtodocount -lt ${#scripts[@]} ]
         then 
            nextscript 0 0
         else
            if [ $nothingtodosleeptime -lt $sleeptimelimit ]
            then 
               nothingtodosleeptime=`echo " $nothingtodocount * $sleeptime " | bc`
            fi
            nextscript nothingtodosleeptime $nothingtodosleeptime
         fi
      else
         nothingtodocount=0
         nothingtodosleeptime=0
      fi

      # get processes in queue
      q=(`checkqueue 2>&1 `)
      if echo $q | egrep \(Error\|failed\)
      then 
         echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmscriptlog 2>&1
         printprocesslog "WARN checking query ($queuesys) failed"
         echo `date +%F\ %T`" WARN checking query ($queuesys) failed" >> $jmerrorlog
         nextscript sleeptime $sleeptime
      fi
      # FIXME: sge cuts scriptname to 8 digits in qstat
      # get processes of user in queue
      q1=( `echo ${q[@]} | egrep -o "Owner$user"`)
      queued=${#q1[@]}
      # get scripts in queue
      q2=( `echo ${q[@]} | egrep -o "${scripts[$i]}"`)
      queuedscript=${#q2[@]}
      # get running scripts
      q3=( `echo ${q[@]} | egrep -o \("${scripts[$i]}"Jobstatus2\|"${scripts[$i]}"Jobstatusr\)` )
      runningscript=${#q3[@]}
      stillinqueue=`echo $queuedscript - $runningscript | bc `

      #get total number of allowed process for current time
      hour=`date +%k`
      if [ ${pnototal[$hour]} -lt $totalmax ]
      then
         totalpno=${pnototal[$hour]}
      else
         totalpno=$totalmax
      fi
      
      #choose array according to the day of the week
      dayofweek=`date +%u`
      case $dayofweek in
         0 | 6)  pnos=( ${pnoswe[@]} ) ;;
             *)  pnos=( ${pnosweek[@]} ) ;;
      esac
      # get number of allowed scripts for current time
      num=`echo "((( $i + 1 ) * 24 ) + ( $hour + 1 ) ) - 24 - 1 " | bc `
      pnoscript=${pnos[$num]}
      # if there was nothing to do for previous script, more scripts can be allowed
      if [ $prev -eq 0 ]
      then
         echo `date +%F\ %T`" \$prev=0 => resetting \$pnoscript from $pnoscript to $max [\$max]" >> $jmscriptlog 2>&1
         pnoscript=$max
      fi
      echo `date +%F\ %T`" queue for user '$user': total: $queued queued jobs [allowed \$totalpno = $totalpno]" >> $jmscriptlog 2>&1
      echo `date +%F\ %T`" queue for user '$user': ${scripts[$i]}: $queuedscript queued, $runningscript running, $stillinqueue idle [allowed \$pnoscript = $pnoscript]" >> $jmscriptlog 2>&1
      
      # continue if there are already enough processes or scripts in the queue
      if [ "$queued" -ge "$totalpno" ] || [ "$queuedscript" -ge "$pnoscript" ]
      then
         nextscript sleeptime $sleeptime
      fi
      # continue if the number of script is the queue is larger (or equal) than the number which still has to be done
      if [ $numproc -le $stillinqueue ]
      then 
         echo `date +%F\ %T`" \$numproc ($numproc) <  \$stillinqueue ($stillinqueue) " >> $jmscriptlog 2>&1
         nextscript sleeptime $sleeptime
      fi
      
      # reset prev
      prev=$max

      # submit 1 script to queuing system
      date=`date +%Y-%m-%d`
      echo `date +%F\ %T`" committing 1 ${scripts[$i]} to $queuesys" >> $jmscriptlog 2>&1
      if ! queuesubmit 2>> $jmerrorlog
      then 
         echo `date`" WARN submitting job ($queuesys) failed" >> $jmerrorlog
         echo `date +%F\ %T`" WARN $queuesys is not working -> sleeping $errorsleeptime [\$errorsleeptime]" >> $jmscriptlog 2>&1
         printprocesslog "WARN submitting ${scripts[$i]} ($queuesys) failed"
         if [ $errorsleeptime -lt $sleeptimelimit ]
         then 
            errorsleeptime=`echo " $errorsleeptime + $errorsleeptimedefault " | bc`
         fi
         nextscript errorsleeptime $errorsleeptime
      else
         errorsleeptime=$errorsleeptimedefault
      fi
      nextscript sleeptime $sleeptime
   done
done

