#!/bin/bash
#
# This script checks the status tables in the DB for failed, crashed and
# alien jobs and mails a report for every table in which such jobs are found
#

# Load the shared definitions from the directory above this script.
# printprocesslog, getstepinfo, sendquery and erradrs are used below but not
# defined in this file, so they presumably come from Sourcefile.sh -- confirm.
# The path is quoted so that a script location containing blanks still works.
source "$(dirname "$0")/../Sourcefile.sh"
printprocesslog "INFO starting $0"

# Steps to check; for every entry the table "<entry>Status" is queried.
tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC DriveFileAvailISDC Callisto Star StarEventsFilled CalcSource )

# Print a report about the jobs selected by the current query, followed by
# the SQL statements to inspect and to reset them.
# Globals:   query, step, where (read); updquery (written)
# Arguments: $1 - kind of problem, used in the headline (e.g. failed)
#            $2 - number of affected jobs, used in the headline
# Outputs:   headline, the output of sendquery (presumably the result of
#            $query -- confirm in Sourcefile.sh), the query itself and the
#            UPDATE statement that clears the status columns of these rows
printjobs()
{
   printf '%s\n' "The following $2 jobs are $1: "
   sendquery
   printf '\n'
   # quoted on purpose: the statement has to be shown exactly as it is sent,
   # without pathname expansion of e.g. '*' and without collapsing blanks
   printf '%s\n' "to view: $query"
   printf '\n'
   updquery="UPDATE ${step}Status SET fStartTime=NULL, fStopTime=NULL, fReturnCode=NULL, fAvailable=NULL, fProcessingSiteKEY=NULL $where"
   printf '%s\n' "to reset: $updquery"
}

# Print the opening part of the SQL expression that identifies a job:
# " CONCAT(<first primary>" followed by " '_', LPAD(<primary>, 3, 0)" for
# every further primary. The expression is left open; the caller appends
# the closing part.
# NOTE(review): no comma is inserted after a primary here, so every entry
# but the last presumably carries its own trailing comma -- confirm against
# getstepinfo.
# Arguments: the primaries of the current step
# Outputs:   the expression on stdout (nothing if no primary is given)
buildselstart()
{
   local prim expr="" first=1
   for prim in "$@"
   do
      if [ "$first" -eq 1 ]
      then
         expr=" CONCAT($prim"
         first=0
      else
         expr="$expr '_', LPAD($prim, 3, 0)"
      fi
   done
   printf '%s' "$expr"
}

# Count the rows of the table "$step"Status matching a WHERE clause and, if
# there are any, mail the list of these jobs to $erradrs.
# Globals:   step, selstart, erradrs (read)
#            where, query, sel (written; where and query are read by
#            printjobs, query presumably also by sendquery)
# Arguments: $1 - kind of problem, used in log and mail texts (e.g. failed)
#            $2 - WHERE clause selecting the affected rows
#            $3 - end of the SELECT expression, appended to $selstart
#            $4 - wording of the log entry after the number of hits
checkjobs()
{
   local kind=$1 seltail=$3 found=$4
   local num
   local numre='^[[:space:]]*([0-9]+)[[:space:]]*$'

   printprocesslog "INFO checking $step for $kind jobs."
   where=$2
   query="SELECT Count(*) FROM ${step}Status $where"
   num=$(sendquery)
   if [ -z "$num" ]
   then
      printprocesslog "WARN could not get number of $kind jobs from the DB."
      return
   fi
   # anything but a plain number would make the comparison below fail and
   # the check would be skipped without any trace in the log
   if ! [[ $num =~ $numre ]]
   then
      printprocesslog "WARN got unexpected reply instead of the number of $kind jobs from the DB: $num"
      return
   fi
   num=${BASH_REMATCH[1]}
   if [ "$num" -gt 0 ]
   then
      printprocesslog "WARN found in $step $num $found"
      sel="$selstart$seltail"
      query="SELECT $sel FROM ${step}Status $where"
      # erradrs is deliberately unquoted: it may hold several recipients
      # shellcheck disable=SC2086
      printjobs "$kind" "$num" | mail -s "found $num $kind jobs in $step" $erradrs
      printprocesslog "INFO sent mail about $kind jobs in $step to $erradrs"
   fi
}

for step in "${tables[@]}"
do
   getstepinfo
   # rebuilt for every step, so nothing is carried over from the previous table
   selstart=$(buildselstart "${prims[@]}")

   # check table for failed jobs, i.e. rows with a return code set,
   # which stopped within the last 3 days
   #   (use INTERVAL -25 HOUR to check only the last 24h)
   cond=" WHERE NOT ISNULL(fReturnCode) "
   cond=$cond" AND fStopTime > ADDDATE(NOW(), INTERVAL -72 HOUR)"
   checkjobs "failed" "$cond" ", '(', fReturnCode, ')')" "rows with errors."

   # check table for crashed jobs
   #   i.e. jobs which were started more than 5 hours ago and have not stopped
   #   start times up to 1971-01-01 01:01:01 are excluded
   #   (presumably placeholder dates -- TODO confirm)
   #   to check only the crashed jobs of the last 24h use instead:
   #     fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR)
   #     fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR)
   cond=" WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime) "
   cond=$cond" AND fStartTime < ADDDATE(NOW(), INTERVAL -5 HOUR)"
   cond=$cond" AND fStartTime > '1971-01-01 01:01:01' "
   checkjobs "crashed" "$cond" ", '(', fStartTime, ')')" "crashed jobs."

   # check for jobs with strange status
   #  i.e. jobs with start time NULL and valid stop time
   #  (probably they have been reset while a job was running)
   cond=" WHERE ISNULL(fStartTime) AND NOT ISNULL(fStopTime) "
   checkjobs "alien" "$cond" ")" "alien jobs."
done

printprocesslog "INFO finished $0"

