source: trunk/DataCheck/Monitoring/CheckStatus.sh@ 13314

Last change on this file since 13314 was 13094, checked in by Daniela Dorner, 13 years ago
improved logging
  • Property svn:executable set to *
File size: 2.1 KB
Line 
#!/bin/bash
#
# This script checks the status tables in the DB for failed and crashed jobs
# and mails a listing of the affected rows to the addresses in $erradrs.
#
# printprocesslog, getstepinfo, sendquery and $erradrs are not defined in
# this file; they are expected to be provided by Sourcefile.sh.
#

source "$(dirname "$0")/../Sourcefile.sh"
printprocesslog "INFO starting $0"

tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC Callisto Star )

# Counts the rows of the table ${step}Status which match a WHERE clause and,
# if there are any, mails a listing of them to $erradrs.
#
# Arguments:
#   $1 - complete WHERE clause selecting the affected rows
#   $2 - column whose value is appended in parentheses to each listed row
#   $3 - kind of problem (used in the log messages and the mail subject)
#   $4 - end of the warning which is logged when rows were found
# Globals:
#   step, selstart, erradrs (read), query (written; sendquery is called
#   without arguments, so presumably it executes $query - verify in
#   Sourcefile.sh)
# Returns:
#   1 if the number of matching rows could not be determined, 0 otherwise
function checkandmail()
{
   # where, num and sel keep their original names: bash locals are visible
   # to the functions called from here, in case those read them
   local where=$1
   local column=$2
   local kind=$3
   local found=$4
   local num sel

   printprocesslog "INFO checking $step for $kind jobs."
   query="SELECT Count(*) FROM ${step}Status $where"
   num=$(sendquery)

   # strip surrounding whitespace, then make sure that the result is a
   # number: an empty or garbled answer (e.g. DB not reachable) must not be
   # mistaken for 'nothing found' without leaving a trace in the log
   num="${num#"${num%%[![:space:]]*}"}"
   num="${num%"${num##*[![:space:]]}"}"
   if ! [[ $num =~ ^[0-9]+$ ]]
   then
      printprocesslog "WARN could not determine number of $kind jobs in $step (query returned '$num')."
      return 1
   fi

   if [ "$num" -gt 0 ]
   then
      printprocesslog "WARN found in $step $num $found"
      # close the CONCAT(...) started in selstart
      sel="$selstart, '(', $column, ')')"
      query="SELECT $sel FROM ${step}Status $where"
      # $erradrs is left unquoted on purpose: it may hold several
      # space-separated addresses (assumption - it is set outside this file)
      # shellcheck disable=SC2086
      sendquery | mail -s "found $kind jobs in $step" $erradrs
      printprocesslog "INFO sent mail about $kind jobs in $step to $erradrs"
   fi
   return 0
}

for step in "${tables[@]}"
do
   # called without arguments; expected to set the array prims for the
   # current $step (defined in Sourcefile.sh - TODO confirm)
   getstepinfo

   # build the beginning of the CONCAT(...) expression which identifies a
   # row: the first entry of prims as is, each further one zero-padded to 3
   # digits; the expression is closed in checkandmail
   # NOTE(review): no comma is inserted between an entry and the following
   # '_' - presumably the entries of prims carry their own separator;
   # verify against getstepinfo before changing the SQL text
   selstart=""
   counter=0
   # unquoted on purpose, the shape of prims is defined outside this file
   # shellcheck disable=SC2068
   for prim in ${prims[@]}
   do
      if [ "$counter" -eq 0 ]
      then
         selstart=" CONCAT($prim"
      else
         selstart="$selstart '_', LPAD($prim, 3, 0)"
      fi
      counter=$((counter + 1))
   done
   if [ "$counter" -eq 0 ]
   then
      # do not reuse the columns of the previous table; an empty string as
      # first argument keeps the listing query valid
      printprocesslog "WARN prims is empty for $step - rows are listed without identifier."
      selstart=" CONCAT(''"
   fi

   # check table for failed jobs
   # i.e. jobs with a return code which stopped within the last 25 hours
   where=" WHERE NOT ISNULL(fReturnCode) "
   where="$where AND fStopTime > ADDDATE(NOW(), INTERVAL -25 HOUR)"
   checkandmail "$where" fReturnCode failed "rows with errors."

   # check table for crashed jobs
   # i.e. jobs which are running > 2 hours
   # check the last 27 hours
   where=" WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime) "
   where="$where AND fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR)"
   where="$where AND fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR)"
   checkandmail "$where" fStartTime crashed "crashed jobs."
done

printprocesslog "INFO finished $0"

Note: See TracBrowser for help on using the repository browser.