source: trunk/DataCheck/Monitoring/CheckStatus.sh@ 19503

Last change on this file since 19503 was 19493, checked in by Daniela Dorner, 6 years ago
added filling of results
  • Property svn:executable set to *
File size: 3.9 KB
Line 
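#!/bin/bash
#
# This script checks the status tables in the DB for failed, crashed and
# alien jobs (rows with a stop time but no start time) and sends a mail
# for every table in which such jobs are found.
#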
5
# Load Sourcefile.sh from the directory above this script. It presumably
# provides printprocesslog, getstepinfo and sendquery, none of which are
# defined in this file -- confirm. The path is quoted so that a script
# location containing whitespace is not split into several words.
source "$(dirname "$0")/../Sourcefile.sh"
printprocesslog "INFO starting $0"

# Name prefixes of the status tables to check; "Status" is appended to each
# entry when the queries are built.
tables=(
   RawFileAvailISDC
   RawFileAvailWue
   RawFileRsyncedISDC
   SequenceFileAvailISDC
   AuxFilesAvailISDC
   DriveFileAvailISDC
   Callisto
   Star
   StarEventsFilled
   CalcSource
   AnalysisResultsCutsLC
)
10
#######################################
# Print the text of a report about the jobs selected by the current query.
# Globals:
#   query    (read)    statement echoed as the "to view" hint; sendquery is
#                      called without arguments and presumably runs this
#                      statement -- confirm against its definition
#   step     (read)    table name prefix, "Status" is appended
#   where    (read)    WHERE clause appended to the reset statement
#   updquery (written) UPDATE statement that resets the status columns of
#                      the rows matched by $where
# Arguments:
#   $1 - job category shown in the headline (e.g. failed, crashed, alien)
#   $2 - number of jobs shown in the headline
# Outputs:
#   headline, the output of sendquery, and the statements to view and to
#   reset the affected rows, all on stdout
#######################################
printjobs()
{
   printf '%s\n' "The following $2 jobs are $1: "
   sendquery
   printf '\n'
   # The statements are printed quoted so that they appear verbatim: no
   # pathname expansion of characters such as '*', no collapsing of blanks.
   printf '%s\n' "to view: $query"
   printf '\n'
   updquery="UPDATE ${step}Status SET fStartTime=NULL, fStopTime=NULL, fReturnCode=NULL, fAvailable=NULL, fProcessingSiteKEY=NULL $where"
   printf '%s\n' "to reset: $updquery"
}
21
#######################################
# Count the rows of ${step}Status that match $where and, if there are any,
# mail the list of affected jobs.
# Globals:
#   step, where, selstart, erradrs (read)
#   query, num, sel (written; left global because printjobs reads $query
#   and sendquery is called without arguments)
# Arguments:
#   $1 - job category used in the log texts and the mail subject
#   $2 - wording that follows the row count in the "found" log line
#   $3 - SQL fragment that closes the CONCAT( expression held in $selstart
#######################################
reportjobs()
{
   local kind=$1
   local found=$2
   local selend=$3

   printprocesslog "INFO checking ${step} for ${kind} jobs."
   query="SELECT Count(*) FROM ${step}Status ${where}"
   num=$(sendquery)

   # An empty reply and a reply that is not an integer are handled alike:
   # without a usable count nothing can be reported for this category.
   if ! [ "$num" -ge 0 ] 2>/dev/null
   then
      printprocesslog "WARN could not get number of ${kind} jobs from the DB."
      return
   fi

   if [ "$num" -gt 0 ]
   then
      printprocesslog "WARN found in ${step} ${num} ${found}."
      sel="${selstart}${selend}"
      query="SELECT ${sel} FROM ${step}Status ${where}"
      # erradrs is not set in this file. It stays unquoted so that a
      # blank-separated list of recipients still expands to one argument
      # per address, exactly as before.
      printjobs "$kind" "$num" | mail -s "found ${num} ${kind} jobs in ${step}" $erradrs
      printprocesslog "INFO sent mail about ${kind} jobs in ${step} to "$erradrs
   fi
}

for step in "${tables[@]}"
do
   getstepinfo

   # Build the opening part of a CONCAT() expression from the entries of
   # prims: the first entry as is, every further entry separated by '_'
   # and left-padded to 3 characters. The closing part is supplied per check.
   # prims is not assigned in this file (presumably getstepinfo fills it --
   # confirm); it stays unquoted to keep the original word splitting.
   counter=0
   for prim in ${prims[@]}
   do
      if [ "$counter" -eq 0 ]
      then
         selstart=" CONCAT(${prim}"
      else
         selstart="${selstart} '_', LPAD(${prim}, 3, 0)"
      fi
      counter=$(( counter + 1 ))
   done

   # check table for failed jobs
   # i.e. rows with a return code set
   where=" WHERE NOT ISNULL(fReturnCode) "
   # check only last 24h
   #where=$where" AND fStopTime > ADDDATE(NOW(), INTERVAL -25 HOUR)"
   # last 3 days
   where="${where} AND fStopTime > ADDDATE(NOW(), INTERVAL -72 HOUR)"
   reportjobs "failed" "rows with errors" ", '(', fReturnCode, ')')"

   # check table for crashed jobs
   # i.e. rows with a start time but no stop time whose start time is more
   # than 5 hours ago
   where=" WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime) "
   # all crashed jobs
   where="${where} AND fStartTime < ADDDATE(NOW(), INTERVAL -5 HOUR)"
   # NOTE(review): the lower bound presumably skips placeholder start
   # times -- confirm
   where="${where} AND fStartTime > '1971-01-01 01:01:01' "
   # only the crashed jobs of the last 24h
   #where=$where" AND fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR)"
   #where=$where" AND fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR)"
   reportjobs "crashed" "crashed jobs" ", '(', fStartTime, ')')"

   # check for jobs with strange status
   # i.e. jobs with start time NULL and valid stop time
   # (probably they have been reset while a job was running)
   where=" WHERE ISNULL(fStartTime) AND NOT ISNULL(fStopTime) "
   reportjobs "alien" "alien jobs" ")"
done

printprocesslog "INFO finished $0"
116
Note: See TracBrowser for help on using the repository browser.