source: trunk/DataCheck/Monitoring/CheckStatus.sh @ 19493

Last change on this file since 19493 was 19493, checked in by dorner, 6 weeks ago
added filling of results
  • Property svn:executable set to *
File size: 3.9 KB
Line 
#!/bin/bash
#
# This script checks the status tables in the DB for failed, crashed and
# "alien" (stop time set but no start time) jobs and mails a report for
# every table in which such jobs are found.
#

# Sourcefile.sh is expected to provide the helpers and settings that are used
# below but not defined in this file (printprocesslog, getstepinfo, sendquery,
# $erradrs). Stop early with a clear message if it cannot be read; otherwise
# every later call would fail with "command not found".
# The exit status of 'source' itself is deliberately not checked, because it
# is the status of the last command run inside the sourced file.
if [ ! -r "$(dirname "$0")/../Sourcefile.sh" ]
then
   echo "ERROR cannot read $(dirname "$0")/../Sourcefile.sh" >&2
   exit 1
fi
source "$(dirname "$0")/../Sourcefile.sh"
printprocesslog "INFO starting $0"

# Status tables to check; each name is used as "<name>Status" in the queries.
tables=( RawFileAvailISDC RawFileAvailWue RawFileRsyncedISDC SequenceFileAvailISDC AuxFilesAvailISDC DriveFileAvailISDC Callisto Star StarEventsFilled CalcSource AnalysisResultsCutsLC )

# Print the body of the report mail for one check to stdout.
#
# Arguments:
#   $1 - state of the jobs as shown in the text (e.g. "failed")
#   $2 - number of affected jobs
# Globals:
#   query    (read)    - statement that lists the affected jobs; shown as the
#                        "to view" hint
#   step     (read)    - table name without the "Status" suffix
#   where    (read)    - WHERE clause appended to the reset statement
#   updquery (written) - UPDATE statement shown as the "to reset" hint
# Outputs:
#   headline, the output of sendquery, then the two SQL hints.
printjobs()
{
   printf '%s\n' "The following $2 jobs are $1: "
   # sendquery takes no arguments; presumably it executes $query - confirm
   # against Sourcefile.sh.
   sendquery
   printf '\n'
   # All expansions are quoted so the SQL is printed exactly as it is sent:
   # unquoted, a '*' could be replaced by file names and runs of blanks
   # would be collapsed.
   printf '%s\n' "to view: $query"
   printf '\n'
   updquery="UPDATE ${step}Status SET fStartTime=NULL, fStopTime=NULL, fReturnCode=NULL, fAvailable=NULL, fProcessingSiteKEY=NULL $where"
   printf '%s\n' "to reset: $updquery"
}

# Run one status check on the table of the current $step and, if rows are
# found, mail a report produced by printjobs.
#
# Arguments:
#   $1 - state of the jobs, used in log messages and in the mail
#        ("failed", "crashed", "alien")
#   $2 - complete WHERE clause that selects the affected rows
#   $3 - text appended to $selstart to close the CONCAT() of the listing query
#   $4 - wording of the "found" log line (e.g. "rows with errors")
# Globals:
#   step, selstart, erradrs (read)
#   where, query, sel, num  (written; kept global because printjobs reads
#                            $query, $step and $where, and sendquery is
#                            called without arguments)
checkstatus_jobs()
{
   local state=$1 found=$4

   printprocesslog "INFO checking $step for $state jobs."
   where=$2
   query="SELECT Count(*) FROM ${step}Status $where"
   num=$(sendquery)
   # Accept only a plain non-negative integer. An empty or unexpected answer
   # is logged instead of being fed into the numeric comparison below.
   if ! [[ $num =~ ^[[:space:]]*([0-9]+)[[:space:]]*$ ]]
   then
      printprocesslog "WARN could not get number of $state jobs from the DB."
      return 0
   fi
   num=${BASH_REMATCH[1]}

   if [ "$num" -gt 0 ]
   then
      printprocesslog "WARN found in $step $num $found."
      sel="$selstart$3"
      query="SELECT $sel FROM ${step}Status $where"
      # $erradrs stays unquoted on purpose: it presumably may hold several
      # recipients, which mail expects as separate arguments.
      # shellcheck disable=SC2086
      printjobs "$state" "$num" | mail -s "found $num $state jobs in $step" $erradrs
      printprocesslog "INFO sent mail about $state jobs in $step to $erradrs"
   fi
}

for step in "${tables[@]}"
do
   getstepinfo

   # Build the opening part of a CONCAT() over the primary keys of the step:
   # the first key as it is, every further key zero-padded to 3 digits and
   # preceded by '_'. The closing part is added per check.
   # selstart is reset for every step so that a step without primaries cannot
   # silently reuse the columns of the previous step.
   # NOTE(review): no comma is inserted between a key and the following '_';
   # presumably the entries of prims carry their own trailing comma - confirm
   # against getstepinfo.
   selstart=""
   # prims is set outside this file (presumably by getstepinfo); it is left
   # unquoted so that a plain space-separated string is still split.
   # shellcheck disable=SC2068
   for prim in ${prims[@]}
   do
      if [ -z "$selstart" ]
      then
         selstart=" CONCAT($prim"
      else
         selstart="$selstart '_', LPAD($prim, 3, 0)"
      fi
   done
   if [ -z "$selstart" ]
   then
      printprocesslog "WARN no primary keys found for $step."
   fi

   # check table for failed jobs
   #   i.e. jobs with a return code that stopped within the last 3 days
   #   (use INTERVAL -25 HOUR to look at the last 24h only)
   where=" WHERE NOT ISNULL(fReturnCode) "
   where="$where AND fStopTime > ADDDATE(NOW(), INTERVAL -72 HOUR)"
   checkstatus_jobs "failed" "$where" ", '(', fReturnCode, ')')" "rows with errors"

   # check table for crashed jobs
   #   i.e. jobs that were started more than 5 hours ago and have no stop time
   #   (all of them, not only recent ones; to restrict the check to the last
   #   day use INTERVAL -2 HOUR as upper and INTERVAL -27 HOUR as lower limit)
   where=" WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime) "
   where="$where AND fStartTime < ADDDATE(NOW(), INTERVAL -5 HOUR)"
   # NOTE(review): the 1971 lower limit presumably skips placeholder start
   # times - confirm.
   where="$where AND fStartTime > '1971-01-01 01:01:01' "
   checkstatus_jobs "crashed" "$where" ", '(', fStartTime, ')')" "crashed jobs"

   # check for jobs with strange status
   #   i.e. jobs with start time NULL and valid stop time
   #   (probably they have been reset while a job was running)
   where=" WHERE ISNULL(fStartTime) AND NOT ISNULL(fStopTime) "
   checkstatus_jobs "alien" "$where" ")" "alien jobs"
done

printprocesslog "INFO finished $0"

Note: See TracBrowser for help on using the repository browser.