Changeset 227


Ignore:
Timestamp:
Jun 18, 2010, 1:25:48 PM (10 years ago)
Author:
ogrimm
Message:
Extended capabilities of Alarm server
Location:
Evidence
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • Evidence/Alarm.cc

    r216 r227  
    44
    55  - Checks periodically if all required servers are up
    6     (later it should try to start them if not)
    7   - Listens to the 'Status' service of each server.
    8   - A text describing the state of all servers is published as DIM service.
    9     The states are described in StateString[].
     6  - Listens to the 'Message' service of each server and generates a new service for
     7    each observed server indicating the maximum severity seen in the past.
     8  - Maximum severity may be reset by a command 'Alarm/ResetAlarm' for a server.
     9  - A text describing the current state of all servers is published as DIM service.
     10    The states are described in LevelStr[].
    1011  - A master alarm (indicating most severe of individual alarms) is published.
    1112   
    12   Oliver Grimm, January 2010
     13  A mutex is used because UpdateAlarmSummary() may be called from DIM handler thread and
     14  from main thread.
     15
     16  Oliver Grimm, June 2010
    1317
    1418\********************************************************************/
     
    1721#include "Evidence.h"
    1822
    19 #define SUMMARYSIZE 10000       // Bytes for alarm summary text
    20 
    21 const char* StateString[] = {"OK", "WARN", "ERROR", "FATAL", "UNAVAILABLE"};
     23#include <sstream>
     24
     25using namespace std;
     26
     27const char* LevelStr[] = {"OK", "WARN", "ERROR", "FATAL", "UNAVAILABLE"};
    2228
    2329//
    2430// Data handling class
    2531//
    26 class AlarmHandler : public DimClient, public EvidenceServer {
    27    
    28     DimStampedInfo **StatusService;
     32class AlarmHandler: public DimClient, public EvidenceServer {
     33   
     34        DimCommand *Command;
     35        DimService *Summary, *Master;
     36        char *AlarmText;
     37        int MasterAlarm;
     38        pthread_mutex_t Mutex;
    2939
    3040    void infoHandler();
     41        void commandHandler();
    3142
    3243  public:
     
    3445    ~AlarmHandler();
    3546
    36         DimService *Summary, *Master;
    37        
    38         char *AlarmSummary;
    39         int MasterAlarm;
    40         int *State;
    41         bool *Warned;   
    42     char **Server;
    43     unsigned int NumServers;
    44     char *ServerList;
    45        
     47        struct Item {
     48          string Server;
     49          string Email;
     50          DimStampedInfo *Subscription;
     51          DimService *AlarmLevel;
     52          int WarnedLevel;
     53          int Level;
     54        };
     55        vector<struct Item> List;
     56
    4657        void UpdateAlarmSummary();
    4758};
     
    5061AlarmHandler::AlarmHandler(): EvidenceServer(SERVER_NAME) {
    5162
    52   AlarmSummary = new char [SUMMARYSIZE];
     63  struct Item N;
     64  static int InitLevel = -1; // static for DIM service below
     65
     66  // Initialise
    5367  MasterAlarm = 0;
     68  AlarmText = NULL;
     69 
     70  if (pthread_mutex_init(&Mutex, NULL) != 0) {
     71    Message(FATAL, "pthread_mutex_init failed");
     72  }
     73
     74  // Handling of services will only start after start()
     75  autoStartOff();
    5476
    5577  // Create DIM services
    56   Summary = new DimService(SERVER_NAME"/Summary", AlarmSummary);
     78  Summary = new DimService(SERVER_NAME"/Summary", (char *) "not yet available");
    5779  Master = new DimService(SERVER_NAME"/MasterAlarm", MasterAlarm);
    5880
    59   // Copy original list of servers to observe
    60   char *ServerNames = GetConfig("servers");
    61   ServerList = new char [strlen(ServerNames)+1];
    62   strcpy(ServerList, ServerNames);
    63  
    64   // Extract DIM servers to observe
    65   Server = new char* [strlen(ServerNames)];
    66   NumServers = 0;
    67   char *NextToken = strtok(ServerNames, " \t");
    68   while (NextToken != NULL) {
    69     Server[NumServers++] = NextToken; // Subscribe with handler
    70     NextToken = strtok(NULL, " \t");     
    71   }
    72 
    73   // Subscribe with handler to 'Message' service of all servers
    74   StatusService = new DimStampedInfo* [NumServers];
    75   State = new int [NumServers];
    76   Warned = new bool [NumServers];
    77 
    78   for (int i=0; i<NumServers; i++) {
    79     char *Buffer = new char [strlen(Server[i])+10];
    80     strcpy(Buffer, Server[i]);
    81     strcat(Buffer, "/Message");
    82     StatusService[i] = new DimStampedInfo(Buffer, NO_LINK, this);
    83     delete[] Buffer;
    84        
    85         State[i] = 0;
    86   }
    87 }
     81  // Get DIM servers to observe
     82  char *Token = strtok(GetConfig("servers"), " \t");
     83  int Pos;
     84  while (Token != NULL) {
     85        // Extract server name and email
     86        N.Server = Token;
     87        Pos = N.Server.find(':');
     88        if (Pos > 0 && Pos < N.Server.size()-2) {
     89          N.Email = N.Server.substr(Pos+1, string::npos);
     90          N.Server = N.Server.substr(0, Pos);   
     91        }
     92        else N.Email = string();
     93
     94        // DIS_DNS has no Message service
     95        if (N.Server == "DIS_DNS") N.Subscription = NULL;
     96        else N.Subscription = new DimStampedInfo((N.Server+"/Message").c_str(), NO_LINK, this);
     97
     98        // Alarm service for server (reference to variable will be updated in UpdateAlarmSummary())
     99        N.WarnedLevel = 0;
     100        N.Level = -1;
     101        N.AlarmLevel = new DimService((N.Server+"/AlarmLevel").c_str(), InitLevel);
     102
     103        List.push_back(N);
     104    Token = strtok(NULL, " \t");     
     105  }
     106
     107  // Provide command to reset Level   
     108  Command = new DimCommand("Alarm/ResetAlarm", (char *) "C", this);
     109 
     110  // List set up, can start handling
     111  start(SERVER_NAME);
     112}
     113
    88114
    89115// Destructor
    90116AlarmHandler::~AlarmHandler() {
    91117
    92   for (int i=0; i<NumServers; i++) delete StatusService[i];
    93   delete[] StatusService;
     118  delete Command;
     119
     120  for (int i=0; i<List.size(); i++) {
     121    delete List[i].Subscription;
     122    delete List[i].AlarmLevel;
     123  }     
    94124  delete Master;
    95125  delete Summary;
    96   delete[] State;
    97   delete[] Server;
    98   delete[] ServerList;
    99   delete[] AlarmSummary;
    100 }
     126  delete[] AlarmText;
     127 
     128  pthread_mutex_destroy(&Mutex);
     129}
     130
    101131
    102132// Print messages of status changes to screen and update status string
     
    104134
    105135  // Identify status service
    106   for (int i=0; i<NumServers; i++) if (getInfo() == StatusService[i]) {
    107 
    108         // Ignore DIS_DNS (has no status service)
    109         if (strcmp(getInfo()->getName(),"DIS_DNS/Message") == 0) return;
    110        
    111         // Update State: unavailable or current severity of status 
    112         if (!ServiceOK(getInfo())) State[i] = 4;
    113         else {
    114           State[i] = *(getInfo()->getString()+getInfo()->getSize()-1);
    115 
    116           // Print message
    117           time_t RawTime = getInfo()->getTimestamp();
    118           struct tm *TM = localtime(&RawTime);
    119           printf("%s (%02d:%02d:%02d): %s\n", getInfo()->getName(), TM->tm_hour,
    120                 TM->tm_min, TM->tm_sec, getInfo()->getString());         
    121         }
    122         UpdateAlarmSummary();
    123   } 
     136  for (int i=0; i<List.size(); i++) if (getInfo() == List[i].Subscription) {
     137        // Update level: unavailable or current severity of status 
     138        if (!ServiceOK(getInfo())) List[i].Level = 4;
     139        else if (getInfo()->getInt() > List[i].Level) List[i].Level = getInfo()->getInt();
     140  }
     141
     142  UpdateAlarmSummary();
     143}
     144
     145
     146// Reset alarm level of given server
     147void AlarmHandler::commandHandler() {
     148
     149  DimCommand *C = getCommand();
     150
     151  // Check for valid command parameter
     152  if (C != Command) return;
     153  if (C->getSize() == 0) return;
     154  if (*((char *) C->getData() + C->getSize() - 1) != '\0') return;
     155 
     156  // Reset alarm level and publish/log action
     157  for (int i=0; i<List.size(); i++) if (List[i].Server == C->getString()) {
     158    Message(INFO, "Alarm level of server %s reset by %s (ID %d)", C->getString(), getClientName(), getClientId());
     159        List[i].Level = 0;
     160        List[i].WarnedLevel = 0;
     161  }
     162 
     163  UpdateAlarmSummary();
    124164}
    125165
     
    127167// Update alarm status summary
    128168void AlarmHandler::UpdateAlarmSummary() {
    129  
    130   int Offset = 0;
    131   MasterAlarm = 0;
    132    
    133   for (int i=0; i<NumServers; i++) {
    134     snprintf(AlarmSummary+Offset, SUMMARYSIZE-Offset, "%s: %s (%d)\n", Server[i], State[i]<=4 ? StateString[State[i]] : "unknown", State[i]);
    135         Offset += strlen(AlarmSummary+Offset);
    136         if (State[i] > MasterAlarm) MasterAlarm = State[i];
    137   }
    138   Summary->updateService();
     169
     170  ostringstream Buf;
     171  int Alarm, Ret; 
     172
     173  // Lock because access can be from main thread and DIM handler thread
     174  if ((Ret = pthread_mutex_lock(&Mutex)) != 0) {
     175        Message(FATAL, "pthread_mutex_lock() failed (%s)", strerror(Ret));
     176  }
     177 
     178  for (int i=0; i<List.size(); i++) {
     179        // Alarm level description
     180        Buf << List[i].Server << ": " << (List[i].Level>=0 && List[i].Level<=4 ? LevelStr[List[i].Level] : "unknown");
     181        Buf << " (" << List[i].Level << ")" << endl;
     182
     183        // Adjust master alarm and update server alarm level
     184        if (List[i].Level > Alarm) Alarm = List[i].Level;
     185        List[i].AlarmLevel->updateService(List[i].Level);
     186
     187        // Check if alarm level raised, then send alarm message once
     188        if (List[i].WarnedLevel < List[i].Level && !List[i].Email.empty()) {
     189          List[i].WarnedLevel = List[i].Level;
     190         
     191          // Prepare email message
     192          char *Text;
     193          time_t Time = time(NULL);
     194          if (asprintf(&Text, "echo \"Server alarm level '%s' at %s\"|"
     195                        "mail -s \"Evidence Alarm for '%s'\" %s",
     196                        List[i].Level>=0 && List[i].Level<=4 ? LevelStr[List[i].Level] : "unknown",
     197                        ctime(&Time), List[i].Server.c_str(), List[i].Email.c_str()) != -1) {
     198                system(Text); // Return value depending on OS
     199                free(Text);
     200          }
     201          else Message(ERROR, "Could not send alarm email, asprintf() failed");
     202        }
     203  }
     204 
     205  // Update master alarm services
     206  MasterAlarm = Alarm;   
    139207  Master->updateService();
     208 
     209  // Update alarm description (DIM requires variables to be valid until update)
     210  char *Tmp = new char[Buf.str().size()+1];
     211  strcpy(Tmp, Buf.str().c_str()); 
     212  Summary->updateService(Tmp);
     213
     214  delete[] AlarmText;
     215  AlarmText = Tmp;
     216 
     217  // Unlock
     218  if ((Ret = pthread_mutex_unlock(&Mutex)) != 0) {
     219        Message(FATAL, "pthread_mutex_unlock() failed (%s)", strerror(Ret));
     220  }
    140221}
    141222
     
    146227   
    147228  DimBrowser Browser;
    148   char *ServerName, *Node;
    149   bool Exists;
     229  char *Server, *Node;
     230  bool Exist;
    150231 
    151232  // Static declaration ensures calling of destructor by exit()
     
    154235  // Check periodically if servers are up
    155236  while(!Alarm.ExitRequest) {
    156     for (int i=0; i<Alarm.NumServers; i++) {
    157       Exists = false;
     237
     238    for (int i=0; i<Alarm.List.size(); i++) {
     239      Exist = false;
    158240      Browser.getServers();
    159       while (Browser.getNextServer(ServerName, Node) == 1) {
    160         if (strcmp(ServerName, Alarm.Server[i]) == 0) Exists = true;
     241      while (Browser.getNextServer(Server, Node) == 1) {
     242        if (Alarm.List[i].Server == Server) Exist = true;
    161243      }
    162 
    163       if (Exists) {
    164                 Alarm.Warned[i] = false;
    165                 continue;
    166           }
    167 
    168           Alarm.State[i] = 4;
    169 
    170           // If server unavailable, send alarm message once
    171           if (Alarm.Warned[i] == false) {
    172                 Alarm.Warned[i] = true;
    173                 char *Message;
    174                 time_t Time = time(NULL);
    175                 if (asprintf(&Message, "echo \"Server unavailable at %s\"|mail -s \"Evidence Alarm for '%s'\" %s", ctime(&Time), Alarm.Server[i], Alarm.GetConfig("email","")) != -1) {
    176                   system(Message);
    177                   free(Message);
    178                 }
    179           }
     244          if (!Exist) Alarm.List[i].Level = 4;
     245          else if (Alarm.List[i].Level = -1) Alarm.List[i].Level = 0;
    180246    }
    181247   
  • Evidence/readme.txt

    r224 r227  
    414117/6/2010       Added SendToLog() method. Changed severity encoding of Message service to
    4242                        use standard DIM structure of format "I:1;C"
     4318/6/2010       Alarm server configuration now accepts one email address per server. A new
     44                        service for each observed server, SERVERNAME/AlarmLevel, contains the highest
     45                        level that occurred in the past. The alarm level can be reset only via a DIM command.
     46
    4347
    4448
Note: See TracChangeset for help on using the changeset viewer.