#!/bin/bash
#
# This script checks the status tables in the DB for failed and crashed jobs
# and mails a list of the affected rows to the addresses in $erradrs.
#
#   failed jobs : rows that have a return code and whose fStopTime lies
#                 within the last 25 hours
#   crashed jobs: rows that were started more than 2 hours ago (but within
#                 the last 27 hours) and have no fStopTime yet
#
# NOTE(review): printprocesslog, getstepinfo, sendquery and $erradrs are not
# defined in this file; presumably they come from Sourcefile.sh - confirm
# there. getstepinfo is called without arguments, so it presumably reads
# $step and fills the array $prims; sendquery presumably executes $query.
#

source "$(dirname "$0")/../Sourcefile.sh"
printprocesslog "INFO starting $0"

tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC DriveFileAvailISDC Callisto Star )

#######################################
# Build the (still unclosed) SQL expression that identifies a row by its
# primary keys: the first key as is, every further key appended as
# '_' plus the key zero-padded to 3 digits.
# Arguments: the primary key column names, in order
# Outputs:   e.g. " CONCAT(a, '_', LPAD(b, 3, 0)" on stdout
#######################################
build_key_select() {
  # Fallback for "no keys": an empty string literal keeps the later
  # ", '(', <column>, ')')" suffix valid SQL.
  local expr=" CONCAT(''" key
  if (( $# > 0 )); then
    expr=" CONCAT($1"
    shift
    for key in "$@"; do
      # The leading comma is required: CONCAT() arguments are comma separated.
      expr+=", '_', LPAD($key, 3, 0)"
    done
  fi
  printf '%s\n' "$expr"
}

#######################################
# Count the rows of ${step}Status matching $where; if there are any, mail
# the list of affected rows to $erradrs.
# Globals:   step, where, selstart, erradrs (read); query, num, sel (written)
# Arguments: $1 - kind of job used in log messages and mail subject
#                 ("failed" or "crashed")
#            $2 - column shown in parentheses behind each row identifier
#            $3 - wording for the "found" log line
#######################################
report_jobs() {
  local kind=$1 detail_col=$2 found_what=$3
  local int_re='^[[:space:]]*([0-9]+)[[:space:]]*$'

  printprocesslog "INFO checking $step for $kind jobs."
  query="SELECT Count(*) FROM ${step}Status $where"
  num=$(sendquery)

  # Anything that is not a plain integer (empty output, error text, ...)
  # means the count could not be retrieved.
  if [[ ! $num =~ $int_re ]]; then
    printprocesslog "WARN could not get number of $kind jobs from the DB."
    return
  fi
  num=${BASH_REMATCH[1]}

  if [ "$num" -gt 0 ]; then
    printprocesslog "WARN found in $step $num $found_what."
    sel="$selstart, '(', $detail_col, ')')"
    query="SELECT $sel FROM ${step}Status $where"
    # $erradrs is deliberately unquoted: it may hold several recipients.
    # shellcheck disable=SC2086
    sendquery | mail -s "found $kind jobs in $step" $erradrs
    printprocesslog "INFO sent mail about $kind jobs in $step to $erradrs"
  fi
}

for step in "${tables[@]}"; do
  getstepinfo

  # Recomputed for every step so that a step without keys never reuses the
  # expression of the previous step.
  # ${prims[@]} is deliberately unquoted so that it is split into single
  # column names even if prims should be a plain string.
  # shellcheck disable=SC2068
  selstart=$(build_key_select ${prims[@]})

  # check table for failed jobs
  where=" WHERE NOT ISNULL(fReturnCode) "
  where+=" AND fStopTime > ADDDATE(NOW(), INTERVAL -25 HOUR)"
  report_jobs "failed" "fReturnCode" "rows with errors"

  # check table for crashed jobs
  # i.e. jobs which are running > 2 hours
  # check the last 27 hours
  where=" WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime) "
  where+=" AND fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR)"
  where+=" AND fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR)"
  report_jobs "crashed" "fStartTime" "crashed jobs"
done

printprocesslog "INFO finished $0"