#!/bin/bash
#
# This script checks the status tables in the DB for failed, crashed and
# alien jobs and mails a report for every table in which such jobs are found.
#

# Sourcefile.sh is expected to provide the helpers used below
# (printprocesslog, sendquery, getstepinfo) and $erradrs; none of them is
# defined in this script.
# The path is quoted so that the script also works when it lives in a
# directory whose name contains whitespace or glob characters.
source "$(dirname -- "$0")/../Sourcefile.sh"
printprocesslog "INFO starting $0"

# Steps to check; for each entry the table <step>Status is queried.
tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC DriveFileAvailISDC Callisto Star StarEventsFilled CalcSource AnalysisResultsCutsLC )

#######################################
# Print the report text for one category of problematic jobs.
# Globals:
#   step     (read)    - step name; the table concerned is ${step}Status
#   query    (read)    - SELECT statement listing the affected jobs
#                        (sendquery is called without arguments and
#                        presumably executes $query - defined elsewhere)
#   where    (read)    - WHERE clause selecting the affected rows
#   updquery (written) - UPDATE statement that would reset those rows
# Arguments:
#   $1 - job category used in the headline (e.g. "failed")
#   $2 - number of affected jobs
# Outputs:
#   Writes the report to stdout.
#######################################
printjobs()
{
   printf '%s\n' "The following $2 jobs are $1: "
   sendquery
   printf '\n'
   # The statements are printed quoted: unquoted they would be word-split
   # and glob-expanded, so a '*' in a query would be replaced by file names.
   printf '%s\n' "to view: ${query}"
   printf '\n'
   updquery="UPDATE ${step}Status SET fStartTime=NULL, fStopTime=NULL, fReturnCode=NULL, fAvailable=NULL, fProcessingSiteKEY=NULL ${where}"
   printf '%s\n' "to reset: ${updquery}"
}

#######################################
# Build the common beginning of the SELECT expression that labels a job.
# Globals:
#   prims    (read)    - array of primary column names of the current step
#                        (presumably filled by getstepinfo - defined elsewhere)
#   selstart (written) - " CONCAT(<first prim>" followed by
#                        " '_', LPAD(<prim>, 3, 0)" for every further prim;
#                        the caller appends the rest and the closing ')'
# NOTE(review): no comma is inserted between the first primary and the
# following '_' - this is only valid SQL if the entries of prims already
# carry their separator; confirm against getstepinfo.
#######################################
_buildselstart()
{
   local prim
   local first=1
   # start from scratch so that a step without primaries cannot inherit
   # the expression built for the previous step
   selstart=""
   for prim in "${prims[@]}"
   do
      if (( first ))
      then
         selstart=" CONCAT(${prim}"
         first=0
      else
         selstart="${selstart} '_', LPAD(${prim}, 3, 0)"
      fi
   done
}

#######################################
# Count the jobs of one category in ${step}Status and, if there are any,
# mail the list produced by printjobs to $erradrs.
# Globals:
#   step, selstart, erradrs (read)
#   where, query, num, sel  (written; kept global because printjobs reads
#                            $where and $query, and sendquery is called
#                            without arguments)
# Arguments:
#   $1 - job category (e.g. "failed"), used in log messages and mail subject
#   $2 - wording for the "found" log message (e.g. "rows with errors")
#   $3 - WHERE clause selecting the jobs of this category
#   $4 - text appended to $selstart to complete the SELECT expression
#######################################
_checkjobs()
{
   local kind=$1
   local founddesc=$2
   local selsuffix=$4
   local -r numre='^[[:space:]]*([0-9]+)[[:space:]]*$'
   where=$3

   printprocesslog "INFO checking ${step} for ${kind} jobs."
   query="SELECT Count(*) FROM ${step}Status ${where}"
   num=$(sendquery)
   # Anything that is not a plain non-negative integer (empty result, error
   # text, several lines) is treated as "no answer from the DB" instead of
   # being passed to a numeric comparison.
   if ! [[ $num =~ $numre ]]
   then
      printprocesslog "WARN could not get number of ${kind} jobs from the DB."
      return 0
   fi
   num=${BASH_REMATCH[1]}

   # 10# forces base 10 so that leading zeros are not read as octal
   if (( 10#$num > 0 ))
   then
      printprocesslog "WARN found in ${step} ${num} ${founddesc}."
      sel="${selstart}${selsuffix}"
      query="SELECT ${sel} FROM ${step}Status ${where}"
      # $erradrs is left unquoted on purpose: it may hold more than one
      # address (assumption - it is set outside this script).
      # shellcheck disable=SC2086
      if printjobs "$kind" "$num" | mail -s "found ${num} ${kind} jobs in ${step}" $erradrs
      then
         printprocesslog "INFO sent mail about ${kind} jobs in ${step} to ${erradrs}"
      else
         printprocesslog "WARN could not send mail about ${kind} jobs in ${step} to ${erradrs}"
      fi
   fi
}

for step in "${tables[@]}"
do
   getstepinfo
   _buildselstart

   # check table for failed jobs
   # i.e. rows with a return code whose stop time lies within the last 3 days
   # (to check only the last 24h use INTERVAL -25 HOUR instead)
   _checkjobs "failed" "rows with errors" \
      " WHERE NOT ISNULL(fReturnCode)  AND fStopTime > ADDDATE(NOW(), INTERVAL -72 HOUR)" \
      ", '(', fReturnCode, ')')"

   # check table for crashed jobs
   # i.e. jobs which were started more than 5 hours ago and have no stop time;
   # all such jobs with a start time after 1971-01-01 01:01:01 are reported
   # (to report only the crashed jobs of the last 24h use
   #  fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR) and
   #  fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR) instead)
   _checkjobs "crashed" "crashed jobs" \
      " WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime)  AND fStartTime < ADDDATE(NOW(), INTERVAL -5 HOUR) AND fStartTime > '1971-01-01 01:01:01' " \
      ", '(', fStartTime, ')')"

   # check for jobs with strange status
   # i.e. jobs with starttime NULL and valid stoptime
   # (probably they have been reset while a job was running)
   _checkjobs "alien" "alien jobs" \
      " WHERE ISNULL(fStartTime) AND NOT ISNULL(fStopTime) " \
      ")"
done

printprocesslog "INFO finished $0"
