source: branches/trigger_burst_research/Monitoring/CheckStatus.sh@ 19536

Last change on this file since 19536 was 14825, checked in by Daniela Dorner, 12 years ago
added DriveFileAvailISDC to the tables to be checked as this is used for analysis/processing
  • Property svn:executable set to *
File size: 2.4 KB
Line 
#!/bin/bash
#
# This script checks the status tables in the DB for failed and crashed jobs.
#
# For every table listed in `tables` two checks are run on <table>Status:
#   * failed jobs : rows that have a return code and whose fStopTime lies
#                   within the last 25 hours
#   * crashed jobs: rows that have an fStartTime but no fStopTime and were
#                   started more than 2 but less than 27 hours ago
# Whenever a check finds rows, the list of these rows is mailed to $erradrs.
#
# printprocesslog, getstepinfo, sendquery and $erradrs are not defined in
# this file; they are expected to be provided by Sourcefile.sh.
#

source "$(dirname "$0")/../Sourcefile.sh"
printprocesslog "INFO starting $0"

tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC DriveFileAvailISDC Callisto Star )

#######################################
# Run one check on ${step}Status and mail the matching rows, if any.
# Globals:   step, selstart, erradrs (read)
#            where, query, num, sel (written, as in the inline version)
# Arguments: $1 - job kind used in log and mail texts (failed|crashed)
#            $2 - complete WHERE clause selecting the offending rows
#            $3 - column reported in parentheses after the row identifier
#            $4 - tail of the WARN message that follows the row count
#######################################
check_jobs() {
  local kind=$1
  local detail=$3
  local found_text=$4
  local -r int_re='^[[:space:]]*([0-9]+)[[:space:]]*$'

  printprocesslog "INFO checking $step for $kind jobs."

  # sendquery is called without arguments, so it presumably reads the
  # global $query -- keep `query` (and `where`) global. TODO confirm.
  where=$2
  query="SELECT Count(*) FROM ${step}Status ${where}"
  num=$(sendquery)

  # Anything that is not a plain non-negative integer (empty reply, error
  # text, several words) means the count could not be determined.
  if [[ ! "$num" =~ $int_re ]]; then
    printprocesslog "WARN could not get number of $kind jobs from the DB."
    return
  fi
  num=${BASH_REMATCH[1]}

  # 10# forces base 10 so a reply with leading zeros is not read as octal.
  if (( 10#$num > 0 )); then
    printprocesslog "WARN found in $step $num $found_text"
    if [[ -n "$selstart" ]]; then
      sel="${selstart}, '(', ${detail}, ')')"
    else
      # No primary columns known for this table: report the detail column
      # alone instead of sending a malformed SELECT.
      sel=" CONCAT('(', ${detail}, ')')"
    fi
    query="SELECT ${sel} FROM ${step}Status ${where}"
    # $erradrs is deliberately left unquoted (as in the original) so that a
    # space-separated list of addresses is passed as separate recipients.
    # shellcheck disable=SC2086
    sendquery | mail -s "found $kind jobs in $step" $erradrs
    # shellcheck disable=SC2086
    printprocesslog "INFO sent mail about $kind jobs in $step to "$erradrs
  fi
}

for step in "${tables[@]}"; do
  getstepinfo

  # Build the opening part of the CONCAT() expression identifying a row:
  # the first primary as is, every further primary as '_' plus the value
  # left-padded to 3 digits. Reset per table so that a table without
  # primaries cannot inherit the expression of the previous table.
  selstart=""
  counter=0
  # prims is set outside this file (presumably by getstepinfo); it is left
  # unquoted to keep the original word-splitting behaviour.
  # shellcheck disable=SC2068
  for prim in ${prims[@]}; do
    if (( counter == 0 )); then
      selstart=" CONCAT(${prim}"
    else
      selstart="${selstart} '_', LPAD(${prim}, 3, 0)"
    fi
    counter=$(( counter + 1 ))
  done

  # check table for failed jobs
  check_jobs "failed" \
    " WHERE NOT ISNULL(fReturnCode)  AND fStopTime > ADDDATE(NOW(), INTERVAL -25 HOUR)" \
    "fReturnCode" \
    "rows with errors."

  # check table for crashed jobs
  # i.e. jobs which are running > 2 hours
  # check the last 27 hours
  check_jobs "crashed" \
    " WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime)  AND fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR) AND fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR)" \
    "fStartTime" \
    "crashed jobs."
done

printprocesslog "INFO finished $0"

Note: See TracBrowser for help on using the repository browser.