source: trunk/DataCheck/Monitoring/CheckStatus.sh@ 14759

Last change on this file since 14759 was 13579, checked in by Daniela Dorner, 13 years ago
added check for empty return of mysql
  • Property svn:executable set to *
File size: 2.4 KB
Line 
#!/bin/bash
#
# This script checks the status tables in the DB for failed and crashed jobs
#
# For every table listed in $tables two checks are done on <table>Status:
#  - failed jobs:  rows which have a return code and whose stop time lies
#                  within the last 25 hours
#  - crashed jobs: rows which were started between 27 and 2 hours ago and
#                  have no stop time yet
# If a check finds rows, a list of them is mailed to $erradrs.
#
# printprocesslog, getstepinfo, sendquery and $erradrs are not defined in
# this file; presumably they are provided by Sourcefile.sh (TODO confirm).
#

source "$(dirname "$0")/../Sourcefile.sh"
printprocesslog "INFO starting $0"

tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC Callisto Star )

#######################################
# Prints the CONCAT() expression used to label one row in the mails:
# the primary keys joined by '_' (every key but the first zero-padded
# to 3 digits), followed by the given column in brackets.
# With an empty $prims only the bracketed column is printed, so the
# resulting query stays valid.
# Globals:   prims (read)
# Arguments: $1 - column shown in brackets
# Outputs:   the expression on stdout (with a leading blank)
#######################################
statusrowlabel()
{
   local detail=$1
   local prim
   local keys=""

   for prim in "${prims[@]}"
   do
      if [ -z "$keys" ]
      then
         keys=$prim
      else
         # the comma in front of '_' is needed: CONCAT(a '_', ...) is not
         # a valid argument list
         keys="$keys, '_', LPAD($prim, 3, 0)"
      fi
   done

   if [ -n "$keys" ]
   then
      keys="$keys, "
   fi
   printf '%s\n' " CONCAT(${keys}'(', $detail, ')')"
}

#######################################
# Counts the rows of ${step}Status matching a WHERE clause and, if there
# are any, mails the list of these rows to $erradrs.
# Globals:   step, prims, erradrs (read); query (written, it is set here
#            right before each sendquery call)
# Arguments: $1 - kind of jobs, used in log messages and the mail subject
#            $2 - column shown in brackets behind each listed row
#            $3 - wording for the 'found' log message
#            $4 - complete WHERE clause
#######################################
checkjobs()
{
   local kind=$1
   local detail=$2
   local found=$3
   local where=$4
   local num
   local isnumber='^[[:space:]]*[0-9]+[[:space:]]*$'

   printprocesslog "INFO checking $step for $kind jobs."
   query="SELECT Count(*) FROM ${step}Status $where"
   num=$(sendquery)

   # an empty or non-numeric answer cannot be compared with -gt;
   # report it instead of letting the comparison fail silently
   if ! [[ $num =~ $isnumber ]]
   then
      printprocesslog "WARN could not get number of $kind jobs from the DB."
      return
   fi

   if [ "$num" -gt 0 ]
   then
      printprocesslog "WARN found in $step $num $found."
      query="SELECT $(statusrowlabel "$detail") FROM ${step}Status $where"
      # $erradrs is left unquoted on purpose: it may hold more than one
      # address (assumption - TODO confirm against Sourcefile.sh)
      # shellcheck disable=SC2086
      sendquery | mail -s "found $kind jobs in $step" $erradrs
      printprocesslog "INFO sent mail about $kind jobs in $step to $erradrs"
   fi
}

for step in "${tables[@]}"
do
   # the checks below read the primary keys of the current table from
   # $prims; getstepinfo is expected to provide them for $step
   getstepinfo

   # check table for failed jobs
   failedwhere="WHERE NOT ISNULL(fReturnCode)"
   failedwhere="$failedwhere AND fStopTime > ADDDATE(NOW(), INTERVAL -25 HOUR)"
   checkjobs "failed" "fReturnCode" "rows with errors" "$failedwhere"

   # check table for crashed jobs
   # i.e. jobs which are running > 2 hours
   # check the last 27 hours
   crashedwhere="WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime)"
   crashedwhere="$crashedwhere AND fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR)"
   crashedwhere="$crashedwhere AND fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR)"
   checkjobs "crashed" "fStartTime" "crashed jobs" "$crashedwhere"
done

printprocesslog "INFO finished $0"

Note: See TracBrowser for help on using the repository browser.