#!/bin/bash
#
# This script checks the status tables in the DB for failed, crashed and
# "alien" jobs and sends one mail per table and category in which
# problematic rows were found.
#
# Names used here but defined outside this file (presumably provided by
# Sourcefile.sh -- confirm there):
#   printprocesslog  - writes a line to the process log
#   getstepinfo      - called once per table; expected to fill the array
#                      'prims' for the table named in $step
#   sendquery        - called without arguments after $query has been set;
#                      its stdout is used as the query result
#   erradrs          - recipient(s) of the mails
#

source "$(dirname "$0")/../Sourcefile.sh"

# Tables to check; "Status" is appended to each name to get the DB table.
tables=(
  RawFileAvailISDC
  RawFileAvailWue
  RawFileRsyncedISDC
  SequenceFileAvailISDC
  AuxFilesAvailISDC
  DriveFileAvailISDC
  Callisto
  Star
  StarEventsFilled
  CalcSource
  AnalysisResultsCutsLC
)

#######################################
# Print the body of a notification mail: the output of sendquery for the
# current $query, the query itself and an UPDATE statement that resets the
# affected rows.
# Globals:   query, step, where (read)
# Arguments: $1 - job category (e.g. "failed")
#            $2 - number of affected rows
# Outputs:   mail body on stdout
#######################################
printjobs() {
  local updquery

  printf '%s\n' "The following $2 jobs are $1: "
  sendquery
  printf '\n'
  # Quoted on purpose: an unquoted expansion would glob characters such as
  # '*' in the query against the files of the working directory.
  printf '%s\n' "to view: ${query}"
  printf '\n'
  updquery="UPDATE ${step}Status SET fStartTime=NULL, fStopTime=NULL, fReturnCode=NULL, fAvailable=NULL, fProcessingSiteKEY=NULL ${where}"
  printf '%s\n' "to reset: ${updquery}"
}

#######################################
# Build the opening part of the CONCAT() expression which identifies a row
# in the job listings from the entries of 'prims'.
# Globals:   prims (read), selstart (written; empty if prims is empty)
#######################################
buildselstart() {
  local prim

  # Reset for every table so that no columns of the previous table leak
  # into the listing query of the current one.
  selstart=""
  # Left unquoted on purpose: the original relied on word splitting here and
  # the format of 'prims' is defined outside this file.
  # shellcheck disable=SC2068
  for prim in ${prims[@]}; do
    if [[ -z "$selstart" ]]; then
      selstart="CONCAT(${prim}"
    else
      # NOTE(review): no comma is inserted before '_', exactly as in the
      # original -- presumably the entries of prims carry their own
      # trailing comma; confirm against getstepinfo.
      selstart+=" '_', LPAD(${prim}, 3, 0)"
    fi
  done
}

#######################################
# Count the rows of the current table matching a WHERE clause and, if there
# are any, mail a listing of them to $erradrs.
# Globals:   step, selstart, erradrs (read); where, query (written, they are
#            read by printjobs and presumably by sendquery)
# Arguments: $1 - job category used in log messages and mail subject
#            $2 - complete WHERE clause selecting the problematic rows
#            $3 - text closing the CONCAT() expression begun in $selstart
#            $4 - description used in the "found" log message
#######################################
checkjobs() {
  local kind=$1
  local sel_suffix=$3
  local found=$4
  local sel num

  printprocesslog "INFO checking ${step} for ${kind} jobs."
  where=$2
  query="SELECT Count(*) FROM ${step}Status ${where}"
  num=$(sendquery)

  # Strip surrounding whitespace so that a padded number is still accepted.
  num="${num#"${num%%[![:space:]]*}"}"
  num="${num%"${num##*[![:space:]]}"}"

  if [[ -z "$num" ]]; then
    printprocesslog "WARN could not get number of ${kind} jobs from the DB."
    return 0
  fi
  # Anything but a plain number (e.g. an error message) used to make the
  # numeric test fail with a shell error and the check was skipped silently.
  if ! [[ "$num" =~ ^[0-9]+$ ]]; then
    printprocesslog "WARN got invalid number of ${kind} jobs from the DB: ${num}"
    return 0
  fi
  # 10# forces base 10 so that leading zeros are not taken as octal.
  if (( 10#$num == 0 )); then
    return 0
  fi

  printprocesslog "WARN found in ${step} ${num} ${found}."
  sel="${selstart}${sel_suffix}"
  query="SELECT ${sel} FROM ${step}Status ${where}"
  # $erradrs is left unquoted on purpose: it may hold several
  # whitespace-separated addresses -- TODO confirm.
  # shellcheck disable=SC2086
  if printjobs "$kind" "$num" | mail -s "found ${num} ${kind} jobs in ${step}" $erradrs; then
    printprocesslog "INFO sent mail about ${kind} jobs in ${step} to ${erradrs}"
  else
    printprocesslog "WARN could not send mail about ${kind} jobs in ${step} to ${erradrs}"
  fi
}

#######################################
# Run all checks for every table listed in 'tables'.
# Globals:   tables (read), step (written; read by the helpers above and
#            presumably by getstepinfo)
#######################################
main() {
  printprocesslog "INFO starting $0"

  for step in "${tables[@]}"; do
    getstepinfo
    buildselstart
    if [[ -z "$selstart" ]]; then
      # The counts are still checked so that a mail reports the number of
      # affected rows, but the listing query cannot be valid.
      printprocesslog "WARN prims is empty for ${step}, cannot build a valid job listing query."
    fi

    # Failed jobs: a return code is set and the job stopped within the
    # last 3 days.
    # (to check only the last 24h use: INTERVAL -25 HOUR)
    checkjobs "failed" \
      "WHERE NOT ISNULL(fReturnCode) AND fStopTime > ADDDATE(NOW(), INTERVAL -72 HOUR)" \
      ", '(', fReturnCode, ')')" \
      "rows with errors"

    # Crashed jobs: started more than 5 hours ago and never stopped. All
    # such jobs are reported, however old; the lower bound only excludes
    # start times up to '1971-01-01 01:01:01'.
    # (to report only the crashed jobs of the last 24h use:
    #  fStartTime < ADDDATE(NOW(), INTERVAL -2 HOUR) AND
    #  fStartTime > ADDDATE(NOW(), INTERVAL -27 HOUR))
    checkjobs "crashed" \
      "WHERE NOT ISNULL(fStartTime) AND ISNULL(fStopTime) AND fStartTime < ADDDATE(NOW(), INTERVAL -5 HOUR) AND fStartTime > '1971-01-01 01:01:01'" \
      ", '(', fStartTime, ')')" \
      "crashed jobs"

    # Jobs with strange status: start time NULL but a valid stop time
    # (probably they have been reset while a job was running).
    checkjobs "alien" \
      "WHERE ISNULL(fStartTime) AND NOT ISNULL(fStopTime)" \
      ")" \
      "alien jobs"
  done

  printprocesslog "INFO finished $0"
}

main