Changeset 19433


Ignore:
Timestamp:
Feb 1, 2019, 4:27:16 PM (2 weeks ago)
Author:
dorner
Message:
check also for alien jobs and adapted time ranges to be checked
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/DataCheck/Monitoring/CheckStatus.sh

    r14825 r19433  
    77printprocesslog "INFO starting $0"
    88
    9 tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC DriveFileAvailISDC Callisto Star )
     9tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC DriveFileAvailISDC Callisto Star StarEventsFilledStatus )
     10
     11printjobs()
     12{
     13   echo "The following "$2" jobs are "$1": "
     14   sendquery
     15   echo ""
     16   echo "to view: "$query
     17   echo ""
     18   updquery="UPDATE "$step"Status SET fStartTime=NULL, fStopTime=NULL, fReturnCode=NULL, fAvailable=NULL, fProcessingSiteKEY=NULL "$where
     19   echo "to reset: "$updquery
     20}
    1021
    1122for step in ${tables[@]}
     
    3344   printprocesslog "INFO checking "$step" for failed jobs."
    3445   where=" WHERE NOT ISNULL(fReturnCode) "
    35    where=$where" AND fStopTime > ADDDATE(NOW(), INTERVAL -25 HOUR)"             
     46   # check only last 24h
     47   #where=$where" AND fStopTime > ADDDATE(NOW(), INTERVAL -25 HOUR)"             
     48   # last 3 days
     49   where=$where" AND fStopTime > ADDDATE(NOW(), INTERVAL -72 HOUR)"             
    3650   query="SELECT Count(*) FROM "$step"Status "$where
    3751   num=`sendquery`
     
    4559         sel=$selstart", '(', fReturnCode, ')')"
    4660         query="SELECT "$sel" FROM "$step"Status "$where
    47          sendquery | mail -s 'found failed jobs in '$step $erradrs
     61         printjobs "failed" $num | mail -s 'found '$num' failed jobs in '$step $erradrs
    4862         printprocesslog "INFO sent mail about failed jobs in "$step" to "$erradrs
    4963      fi
     
    5569   printprocesslog "INFO checking "$step" for crashed jobs."
    5670   where=" WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime) "
    57    where=$where" AND fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR)"
    58    where=$where" AND fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR)"
     71   # all crashed jobs
     72   where=$where" AND fStartTime < ADDDATE(NOW(), INTERVAL -5 HOUR)"
     73   where=$where" AND fStartTime > '1971-01-01 01:01:01' "
     74   # only the crashed jobs of the last 24h
     75   #where=$where" AND fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR)"
     76   #where=$where" AND fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR)"
    5977   query="SELECT Count(*) FROM "$step"Status "$where
    6078   num=`sendquery`
     
    6886         sel=$selstart", '(', fStartTime, ')')"
    6987         query="SELECT "$sel" FROM "$step"Status "$where
    70          sendquery | mail -s 'found crashed jobs in '$step $erradrs
     88         printjobs "crashed" $num | mail -s 'found '$num' crashed jobs in '$step $erradrs
    7189         printprocesslog "INFO sent mail about crashed jobs in "$step" to "$erradrs
     90      fi
     91   fi
     92   
     93   # check for jobs with strange status
     94   #  i.e. jobs with starttime NULL and valid stoptime
     95   #  (probably they have been reset while a job was running)
     96   printprocesslog "INFO checking "$step" for alien jobs."
     97   where=" WHERE ISNULL(fStartTime) AND NOT ISNULL(fStopTime) "
     98   query="SELECT Count(*) FROM "$step"Status "$where
     99   num=`sendquery`
     100   if [ "$num" == "" ]
     101   then
     102      printprocesslog "WARN could not get number of alien jobs from the DB."
     103   else
     104      if [ $num -gt 0 ]
     105      then
     106         printprocesslog "WARN found in "$step" "$num" alien jobs."
     107         sel=$selstart")"
     108         query="SELECT "$sel" FROM "$step"Status "$where
     109         printjobs "alien" $num | mail -s 'found '$num' alien jobs in '$step $erradrs
     110         printprocesslog "INFO sent mail about alien jobs in "$step" to "$erradrs
    72111      fi
    73112   fi
Note: See TracChangeset for help on using the changeset viewer.