Changeset 19433 for trunk/DataCheck/Monitoring
- Timestamp:
- 02/01/19 16:27:16 (6 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/DataCheck/Monitoring/CheckStatus.sh
r14825 r19433
  7   7  printprocesslog "INFO starting $0"
  8   8
  9      tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC DriveFileAvailISDC Callisto Star )
      9  tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC DriveFileAvailISDC Callisto Star StarEventsFilledStatus )
     10
     11  printjobs()
     12  {
     13  echo "The following "$2" jobs are "$1": "
     14  sendquery
     15  echo ""
     16  echo "to view: "$query
     17  echo ""
     18  updquery="UPDATE "$step"Status SET fStartTime=NULL, fStopTime=NULL, fReturnCode=NULL, fAvailable=NULL, fProcessingSiteKEY=NULL "$where
     19  echo "to reset: "$updquery
     20  }
 10  21
 11  22  for step in ${tables[@]}
  …   …
 33  44  printprocesslog "INFO checking "$step" for failed jobs."
 34  45  where=" WHERE NOT ISNULL(fReturnCode) "
 35      where=$where" AND fStopTime > ADDDATE(NOW(), INTERVAL -25 HOUR)"
     46  # check only last 24h
     47  #where=$where" AND fStopTime > ADDDATE(NOW(), INTERVAL -25 HOUR)"
     48  # last 3 days
     49  where=$where" AND fStopTime > ADDDATE(NOW(), INTERVAL -72 HOUR)"
 36  50  query="SELECT Count(*) FROM "$step"Status "$where
 37  51  num=`sendquery`
  …   …
 45  59  sel=$selstart", '(', fReturnCode, ')')"
 46  60  query="SELECT "$sel" FROM "$step"Status "$where
 47      sendquery | mail -s 'foundfailed jobs in '$step $erradrs
     61  printjobs "failed" $num | mail -s 'found '$num' failed jobs in '$step $erradrs
 48  62  printprocesslog "INFO sent mail about failed jobs in "$step" to "$erradrs
 49  63  fi
  …   …
 55  69  printprocesslog "INFO checking "$step" for crashed jobs."
 56  70  where=" WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime) "
 57      where=$where" AND fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR)"
 58      where=$where" AND fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR)"
     71  # all crashed jobs
     72  where=$where" AND fStartTime < ADDDATE(NOW(), INTERVAL -5 HOUR)"
     73  where=$where" AND fStartTime > '1971-01-01 01:01:01' "
     74  # only the crashed jobs of the last 24h
     75  #where=$where" AND fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR)"
     76  #where=$where" AND fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR)"
 59  77  query="SELECT Count(*) FROM "$step"Status "$where
 60  78  num=`sendquery`
  …   …
 68  86  sel=$selstart", '(', fStartTime, ')')"
 69  87  query="SELECT "$sel" FROM "$step"Status "$where
 70      sendquery | mail -s 'foundcrashed jobs in '$step $erradrs
     88  printjobs "crashed" $num | mail -s 'found '$num' crashed jobs in '$step $erradrs
 71  89  printprocesslog "INFO sent mail about crashed jobs in "$step" to "$erradrs
     90  fi
     91  fi
     92
     93  # check for jobs with strange status
     94  # i.e. jobs with startime NULL and valid stoptime
     95  # (probably they have been reset while a job was running)
     96  printprocesslog "INFO checking "$step" for alien jobs."
     97  where=" WHERE ISNULL(fStartTime) AND NOT ISNULL(fStopTime) "
     98  query="SELECT Count(*) FROM "$step"Status "$where
     99  num=`sendquery`
    100  if [ "$num" == "" ]
    101  then
    102  printprocesslog "WARN could not get number of alien jobs from the DB."
    103  else
    104  if [ $num -gt 0 ]
    105  then
    106  printprocesslog "WARN found in "$step" "$num" alien jobs."
    107  sel=$selstart")"
    108  query="SELECT "$sel" FROM "$step"Status "$where
    109  printjobs "alien" $num | mail -s 'found '$num' alien jobs in '$step $erradrs
    110  printprocesslog "INFO sent mail about alien jobs in "$step" to "$erradrs
 72 111  fi
 73 112  fi
Note:
See TracChangeset
for help on using the changeset viewer.