source: Evidence/Alarm.cc@ 228

Last change on this file since 228 was 227, checked in by ogrimm, 14 years ago
Extended capabilities of Alarm server
File size: 6.7 KB
Line 
1/********************************************************************\
2
3 Alarm handler of the Evidence Control System
4
5 - Checks periodically if all required servers are up
6 - Listens to the 'Message' service of each server and generates new service for
7 each observed server indicating the maximum Severity in the past.
8 - Maximum severity may be reset by a command 'Alarm/ResetAlarm' for a server.
9 - A text describing the current state of all servers is published as DIM service.
10 The states are described in LevelStr[].
11 - A master alarm (indicating most severe of individual alarms) is published.
12
13 A mutex is used because UpdateAlarmSummary() may be called from DIM handler thread and
14 from main thread.
15
16 Oliver Grimm, June 2010
17
18\********************************************************************/
19
20#define SERVER_NAME "Alarm"
21#include "Evidence.h"
22
23#include <sstream>
24
25using namespace std;
26
// Human-readable names of the alarm levels, indexed by level value (0..4)
const char *LevelStr[] = {
  "OK",           // 0
  "WARN",         // 1
  "ERROR",        // 2
  "FATAL",        // 3
  "UNAVAILABLE"   // 4
};
28
29//
30// Data handling class
31//
32class AlarmHandler: public DimClient, public EvidenceServer {
33
34 DimCommand *Command;
35 DimService *Summary, *Master;
36 char *AlarmText;
37 int MasterAlarm;
38 pthread_mutex_t Mutex;
39
40 void infoHandler();
41 void commandHandler();
42
43 public:
44 AlarmHandler();
45 ~AlarmHandler();
46
47 struct Item {
48 string Server;
49 string Email;
50 DimStampedInfo *Subscription;
51 DimService *AlarmLevel;
52 int WarnedLevel;
53 int Level;
54 };
55 vector<struct Item> List;
56
57 void UpdateAlarmSummary();
58};
59
60// Constructor
61AlarmHandler::AlarmHandler(): EvidenceServer(SERVER_NAME) {
62
63 struct Item N;
64 static int InitLevel = -1; // static for DIM service below
65
66 // Initialise
67 MasterAlarm = 0;
68 AlarmText = NULL;
69
70 if (pthread_mutex_init(&Mutex, NULL) != 0) {
71 Message(FATAL, "pthread_mutex_init failed");
72 }
73
74 // Handling of servies will only start after start()
75 autoStartOff();
76
77 // Create DIM services
78 Summary = new DimService(SERVER_NAME"/Summary", (char *) "not yet available");
79 Master = new DimService(SERVER_NAME"/MasterAlarm", MasterAlarm);
80
81 // Get DIM servers to observe
82 char *Token = strtok(GetConfig("servers"), " \t");
83 int Pos;
84 while (Token != NULL) {
85 // Extract server name and email
86 N.Server = Token;
87 Pos = N.Server.find(':');
88 if (Pos > 0 && Pos < N.Server.size()-2) {
89 N.Email = N.Server.substr(Pos+1, string::npos);
90 N.Server = N.Server.substr(0, Pos);
91 }
92 else N.Email = string();
93
94 // DIS_DNS has no Message service
95 if (N.Server == "DIS_DNS") N.Subscription = NULL;
96 else N.Subscription = new DimStampedInfo((N.Server+"/Message").c_str(), NO_LINK, this);
97
98 // Alarm service for server (reference to variable will be updated in UpdateAlarmSummary())
99 N.WarnedLevel = 0;
100 N.Level = -1;
101 N.AlarmLevel = new DimService((N.Server+"/AlarmLevel").c_str(), InitLevel);
102
103 List.push_back(N);
104 Token = strtok(NULL, " \t");
105 }
106
107 // Provide command to reset Level
108 Command = new DimCommand("Alarm/ResetAlarm", (char *) "C", this);
109
110 // List set up, can start handling
111 start(SERVER_NAME);
112}
113
114
115// Destructor
116AlarmHandler::~AlarmHandler() {
117
118 delete Command;
119
120 for (int i=0; i<List.size(); i++) {
121 delete List[i].Subscription;
122 delete List[i].AlarmLevel;
123 }
124 delete Master;
125 delete Summary;
126 delete[] AlarmText;
127
128 pthread_mutex_destroy(&Mutex);
129}
130
131
132// Print messages of status changes to screen and update status string
133void AlarmHandler::infoHandler() {
134
135 // Identify status service
136 for (int i=0; i<List.size(); i++) if (getInfo() == List[i].Subscription) {
137 // Update level: unavailable or current severity of status
138 if (!ServiceOK(getInfo())) List[i].Level = 4;
139 else if (getInfo()->getInt() > List[i].Level) List[i].Level = getInfo()->getInt();
140 }
141
142 UpdateAlarmSummary();
143}
144
145
146// Reset alarm level of given server
147void AlarmHandler::commandHandler() {
148
149 DimCommand *C = getCommand();
150
151 // Check for valid command parameter
152 if (C != Command) return;
153 if (C->getSize() == 0) return;
154 if (*((char *) C->getData() + C->getSize() - 1) != '\0') return;
155
156 // Reset alarm level and publish/log action
157 for (int i=0; i<List.size(); i++) if (List[i].Server == C->getString()) {
158 Message(INFO, "Alarm level of server %s reset by %s (ID %d)", C->getString(), getClientName(), getClientId());
159 List[i].Level = 0;
160 List[i].WarnedLevel = 0;
161 }
162
163 UpdateAlarmSummary();
164}
165
166
167// Update alarm status summary
168void AlarmHandler::UpdateAlarmSummary() {
169
170 ostringstream Buf;
171 int Alarm, Ret;
172
173 // Lock because access can be from main thread and DIM handler thread
174 if ((Ret = pthread_mutex_lock(&Mutex)) != 0) {
175 Message(FATAL, "pthread_mutex_lock() failed (%s)", strerror(Ret));
176 }
177
178 for (int i=0; i<List.size(); i++) {
179 // Alarm level description
180 Buf << List[i].Server << ": " << (List[i].Level>=0 && List[i].Level<=4 ? LevelStr[List[i].Level] : "unknown");
181 Buf << " (" << List[i].Level << ")" << endl;
182
183 // Adjust master alarm and update server alarm level
184 if (List[i].Level > Alarm) Alarm = List[i].Level;
185 List[i].AlarmLevel->updateService(List[i].Level);
186
187 // Check if alarm level raised, then send alarm message once
188 if (List[i].WarnedLevel < List[i].Level && !List[i].Email.empty()) {
189 List[i].WarnedLevel = List[i].Level;
190
191 // Prepare email message
192 char *Text;
193 time_t Time = time(NULL);
194 if (asprintf(&Text, "echo \"Server alarm level '%s' at %s\"|"
195 "mail -s \"Evidence Alarm for '%s'\" %s",
196 List[i].Level>=0 && List[i].Level<=4 ? LevelStr[List[i].Level] : "unknown",
197 ctime(&Time), List[i].Server.c_str(), List[i].Email.c_str()) != -1) {
198 system(Text); // Return value depending on OS
199 free(Text);
200 }
201 else Message(ERROR, "Could not send alarm email, asprintf() failed");
202 }
203 }
204
205 // Update master alarm services
206 MasterAlarm = Alarm;
207 Master->updateService();
208
209 // Update alarm description (DIM requires variables to be valid until update)
210 char *Tmp = new char[Buf.str().size()+1];
211 strcpy(Tmp, Buf.str().c_str());
212 Summary->updateService(Tmp);
213
214 delete[] AlarmText;
215 AlarmText = Tmp;
216
217 // Unlock
218 if ((Ret = pthread_mutex_unlock(&Mutex)) != 0) {
219 Message(FATAL, "pthread_mutex_unlock() failed (%s)", strerror(Ret));
220 }
221}
222
223//
224// Main program
225//
226int main() {
227
228 DimBrowser Browser;
229 char *Server, *Node;
230 bool Exist;
231
232 // Static declaration ensures calling of destructor by exit()
233 static AlarmHandler Alarm;
234
235 // Check periodically if servers are up
236 while(!Alarm.ExitRequest) {
237
238 for (int i=0; i<Alarm.List.size(); i++) {
239 Exist = false;
240 Browser.getServers();
241 while (Browser.getNextServer(Server, Node) == 1) {
242 if (Alarm.List[i].Server == Server) Exist = true;
243 }
244 if (!Exist) Alarm.List[i].Level = 4;
245 else if (Alarm.List[i].Level = -1) Alarm.List[i].Level = 0;
246 }
247
248 Alarm.UpdateAlarmSummary();
249 sleep(atoi(Alarm.GetConfig("period")));
250 }
251}
Note: See TracBrowser for help on using the repository browser.