source: trunk/DataCheck/Monitoring/CheckStatus.sh@ 13066

Last change on this file since 13066 was 13064, checked in by Daniela Dorner, 13 years ago
added (script to check database for failed and crashed jobs)
  • Property svn:executable set to *
File size: 2.0 KB
Line 
#!/bin/bash
#
# This script checks the status tables in the DB for failed and crashed jobs
#
# For each table in `tables` two checks are run against <table>Status:
#   failed  - rows with a non-NULL fReturnCode whose fStopTime lies within
#             the last 25 hours
#   crashed - rows with fStartTime set but no fStopTime, started more than
#             2 hours but less than 27 hours ago
# The number of hits is written to the process log; if it is greater than
# zero, the affected rows are mailed to $erradrs.
#
# NOTE(review): printprocesslog, getstepinfo, sendquery and $erradrs are not
# defined in this file; presumably they come from Sourcefile.sh - confirm.

source "$(dirname "$0")/../Sourcefile.sh"
printprocesslog "INFO starting $0"

tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC Callisto Star )

# WHERE clauses of the two checks; they do not depend on the table.
readonly where_failed=" WHERE NOT ISNULL(fReturnCode)  AND fStopTime > ADDDATE(NOW(), INTERVAL -25 HOUR)"
where_crashed=" WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime) "
where_crashed+=" AND fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR)"
where_crashed+=" AND fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR)"
readonly where_crashed

#######################################
# Print the opening of the CONCAT() expression that identifies a row:
# the first key column as is, every further one zero-padded to 3 digits
# and joined with '_'. The expression is left open so that the caller can
# append more arguments and the closing parenthesis.
# Arguments: $@ - key column names
# Outputs:   the partial SQL expression on stdout (nothing if no columns)
#######################################
build_selstart() {
  local col expr=""
  for col in "$@"; do
    if [[ -z $expr ]]; then
      expr=" CONCAT(${col}"
    else
      # Every CONCAT() argument must be separated by a comma, including
      # the '_' separator that follows the previous column.
      expr+=", '_', LPAD(${col}, 3, 0)"
    fi
  done
  printf '%s' "$expr"
}

#######################################
# Count the rows of ${step}Status matching a WHERE clause, log the count
# and, if there are any, mail the matching rows to $erradrs.
# Globals:   step (read), erradrs (read), query (written)
# Arguments: $1 - job kind used in mail subject and log ("failed"/"crashed")
#            $2 - tail of the count log message
#            $3 - column appended in parentheses to each listed row
#            $4 - WHERE clause
#            $5 - open CONCAT() expression from build_selstart
#######################################
report_jobs() {
  local kind=$1 countlabel=$2 detailcol=$3 where=$4 selstart=$5
  local num

  # NOTE(review): sendquery takes no arguments here, so it presumably
  # reads the global $query - confirm against Sourcefile.sh.
  query="SELECT Count(*) FROM ${step}Status ${where}"
  num=$(sendquery)

  # An empty or non-numeric answer (e.g. a failed query) must not reach
  # the numeric comparison below.
  if ! [[ $num =~ ^[0-9]+$ ]]; then
    printprocesslog "WARN could not count ${kind} jobs in ${step} (query returned '${num}')."
    return 0
  fi

  printprocesslog "WARN found in ${step} ${num} ${countlabel}"
  # 10# forces base 10 so a count with leading zeros is not read as octal.
  if (( 10#$num > 0 )); then
    query="SELECT ${selstart}, '(', ${detailcol}, ')') FROM ${step}Status ${where}"
    # erradrs stays unquoted on purpose: it may hold several addresses.
    # shellcheck disable=SC2086
    sendquery | mail -s "found ${kind} jobs in ${step}" $erradrs
    printprocesslog "INFO sent mail about ${kind} jobs in ${step} to ${erradrs}"
  fi
}

for step in "${tables[@]}"; do
  # NOTE(review): getstepinfo is expected to fill `prims` with the key
  # columns of the current $step - confirm against Sourcefile.sh.
  getstepinfo

  # prims is expanded unquoted, as before, so that it is split into column
  # names even if it is a plain space-separated string.
  # shellcheck disable=SC2068
  selstart=$(build_selstart ${prims[@]})
  if [[ -z $selstart ]]; then
    # Without key columns no valid row listing can be built; do not reuse
    # the columns of the previous table.
    printprocesslog "WARN no key columns known for ${step} - skipping."
    continue
  fi

  # check table for failed jobs
  report_jobs "failed" "rows with errors." "fReturnCode" "$where_failed" "$selstart"

  # check table for crashed jobs
  # i.e. jobs which are running > 2 hours
  # check the last 27 hours
  report_jobs "crashed" "crashed jobs." "fStartTime" "$where_crashed" "$selstart"
done

printprocesslog "INFO finished $0"
Note: See TracBrowser for help on using the repository browser.