source: fact/Evidence/Alarm.cc@ 10531

Last change on this file since 10531 was 10143, checked in by ogrimm, 14 years ago
Non-blocking configuration request did not recognize default value
File size: 7.3 KB
Line 
/********************************************************************\

  Alarm handler of the Evidence Control System

  - Checks periodically if all required servers are up.
  - Listens to the 'Message' service of each server and generates a new
    service for each observed server indicating the maximum severity seen
    in the past.
  - The maximum severity may be reset for a server by the command
    'Alarm/ResetAlarm'.
  - A text describing the current state of all servers is published as a
    DIM service.
  - A master alarm (indicating the most severe of the individual alarms)
    is published.
  - The server can be switched on/off with the command 'Alarm/Switch'.

  A mutex is used because UpdateAlarmSummary() may be called from the DIM
  handler thread and from the main thread.

  Oliver Grimm, February 2011

\********************************************************************/
19
20#define SERVER_NAME "Alarm"
21#include "Evidence.h"
22
23#include <sstream>
24
25using namespace std;
26
// Lower limit in seconds for the period between two checks that the
// observed servers are alive
static const int MIN_PERIOD = 5;

// Alarm level assigned to a server that is unavailable
static const int UNAVA = 255;
29
30//
31// Class declaration
32//
33class AlarmHandler: public DimClient, public EvidenceServer {
34
35 DimCommand *ResetCommand;
36 DimCommand *SwitchCommand;
37 DimService *Summary, *Master;
38 char *AlarmText;
39 int MasterAlarm;
40
41 void infoHandler();
42 void commandHandler();
43
44 public:
45 AlarmHandler();
46 ~AlarmHandler();
47
48 struct Item {
49 string Server;
50 string Email;
51 DimStampedInfo *Subscription;
52 DimService *AlarmLevel;
53 int WarnedLevel;
54 int Level;
55 };
56 vector<struct Item> List;
57 bool Active;
58
59 void UpdateAlarmSummary();
60};
61
62// Constructor
63AlarmHandler::AlarmHandler(): EvidenceServer(SERVER_NAME) {
64
65 struct Item N;
66 static int InitLevel = -1; // static for DIM service below
67
68 // Initialise
69 MasterAlarm = 0;
70 AlarmText = NULL;
71 Active = true;
72
73 // Handling of servies will only start after start()
74 autoStartOff();
75
76 // Create DIM services
77 Summary = new DimService(SERVER_NAME"/Summary", (char *) "not yet available");
78 Master = new DimService(SERVER_NAME"/MasterAlarm", MasterAlarm);
79
80 // Get DIM servers to observe
81 vector<string> Token = Tokenize(GetConfig("servers"));
82
83 for (int i=0; i<Token.size(); i++) {
84 // Extract server name and email
85 vector<string> A = Tokenize(Token[i], ":");
86 N.Server = A[0];
87 if (A.size() == 2) N.Email = A[1];
88 else N.Email = string();
89
90 // DIS_DNS has no Message service
91 if (N.Server == "DIS_DNS") N.Subscription = NULL;
92 else N.Subscription = new DimStampedInfo((N.Server+"/Message").c_str(), NO_LINK, this);
93
94 // Alarm service for server (reference to variable will be updated in UpdateAlarmSummary())
95 N.WarnedLevel = 0;
96 N.Level = -1;
97 N.AlarmLevel = new DimService((N.Server+"/AlarmLevel").c_str(), InitLevel);
98
99 List.push_back(N);
100 }
101
102 // Provide command to reset Level
103 ResetCommand = new DimCommand(SERVER_NAME"/ResetAlarm", (char *) "C", this);
104 SwitchCommand = new DimCommand(SERVER_NAME"/Switch", (char *) "C", this);
105
106 // List set up, can start handling
107 start(SERVER_NAME);
108}
109
110
111// Destructor
112AlarmHandler::~AlarmHandler() {
113
114 delete SwitchCommand;
115 delete ResetCommand;
116
117 for (int i=0; i<List.size(); i++) {
118 delete List[i].Subscription;
119 delete List[i].AlarmLevel;
120 }
121 delete Master;
122 delete Summary;
123 delete[] AlarmText;
124}
125
126
127// Print messages of status changes to screen and update status string
128void AlarmHandler::infoHandler() {
129
130 // Check if alarm server active
131 if (!Active) return;
132
133 // Identify status service
134 for (int i=0; i<List.size(); i++) if (getInfo() == List[i].Subscription) {
135 // Update level: unavailable or current severity of status (safely extracted)
136 if (!ServiceOK(getInfo())) List[i].Level = UNAVA;
137 else {
138 int Severity = atoi(ToString(getInfo()->getFormat(), getInfo()->getData(), getInfo()->getSize()).c_str());
139 if ((Severity>List[i].Level) || (List[i].Level==UNAVA && Severity==0)) List[i].Level = Severity;
140 }
141 }
142
143 UpdateAlarmSummary();
144}
145
146
147// Handle commands
148void AlarmHandler::commandHandler() {
149
150 string Text = ToString((char *) "C", getCommand()->getData(), getCommand()->getSize());
151
152 // Reset alarm level, publish/log action and reset server message severity
153 if (getCommand() == ResetCommand) {
154 for (int i=0; i<List.size(); i++) if (List[i].Server == Text) {
155 Message(INFO, "Alarm level of server %s reset by %s (ID %d)", Text.c_str(), getClientName(), getClientId());
156 List[i].Level = 0;
157 List[i].WarnedLevel = 0;
158 sendCommandNB((Text+"/ResetMessage").c_str(), (int) 0);
159 }
160 }
161
162 // Switch Alarm server on/off and publish/log action
163 if (getCommand() == SwitchCommand) {
164 if (Text == "off") Active = false;
165 else Active = true;
166
167 Message(INFO, "Alarm server switched %s by %s (ID %d)", Active ? "ON":"OFF", getClientName(), getClientId());
168 }
169
170 UpdateAlarmSummary();
171}
172
173
174// Update alarm status summary (locking since access can be from main thread and DIM handler threads)
175void AlarmHandler::UpdateAlarmSummary() {
176
177 ostringstream Buf;
178 string Desc;
179 int Alarm = -1, Ret;
180
181 Lock();
182
183 if (!Active) Buf << "Alarm server inactive";
184 else for (int i=0; i<List.size(); i++) {
185 // Alarm level description
186 Buf << List[i].Server << ": ";
187 switch (List[i].Level) {
188 case INFO: Desc = "OK"; break;
189 case WARN: Desc = "WARN"; break;
190 case ERROR: Desc = "ERROR"; break;
191 case FATAL: Desc = "FATAL"; break;
192 case UNAVA: Desc = "UNAVAILABLE"; break;
193 default: Desc = "?"; break;
194 }
195 Buf << Desc << " (" << List[i].Level << ")" << endl;
196
197 // Adjust master alarm and update server alarm level
198 if (List[i].Level > Alarm) Alarm = List[i].Level;
199 List[i].AlarmLevel->updateService(List[i].Level);
200
201 // Check if alarm level raised, then send alarm message once
202 if (List[i].WarnedLevel < List[i].Level && !List[i].Email.empty()) {
203 List[i].WarnedLevel = List[i].Level;
204
205 // Prepare email message
206 char *Text;
207 time_t Time = time(NULL);
208 if (asprintf(&Text, "echo \"Server alarm level '%s' (%d) at %s\"|"
209 "mail -s \"Evidence Alarm for '%s'\" %s",
210 Desc.c_str(), List[i].Level, ctime(&Time), List[i].Server.c_str(), List[i].Email.c_str()) != -1) {
211 system(Text); // Return value depending on OS
212 free(Text);
213 }
214 else Message(ERROR, "Could not send alarm email, asprintf() failed");
215 }
216 }
217
218 // Update master alarm services
219 MasterAlarm = Alarm;
220 Master->updateService();
221
222 // Update alarm description (DIM requires variables to be valid until update)
223 char *Tmp = new char[Buf.str().size()+1];
224 strcpy(Tmp, Buf.str().c_str());
225 Summary->updateService(Tmp);
226
227 delete[] AlarmText;
228 AlarmText = Tmp;
229
230 Unlock();
231}
232
233//
234// Main program
235//
236int main() {
237
238 DimBrowser B;
239 char *Server, *Node;
240 bool Exist;
241
242 // Static declaration ensures calling of destructor by exit()
243 static AlarmHandler A;
244
245 // Verify periodically that servers exist (if Alarm is active)
246 while(!A.ExitRequest) {
247 for (int i=0; i<A.List.size() && A.Active; i++) {
248 // Check if server exists
249 Exist = false;
250 B.getServers();
251 while (B.getNextServer(Server, Node) == 1) {
252 if (A.List[i].Server == Server) Exist = true;
253 }
254 if (!Exist) A.List[i].Level = UNAVA;
255
256 // Check if standard service available in case server not yet checked (Level is -1)
257 if (B.getServices((A.List[i].Server+"/VERSION_NUMBER").c_str())>0 && A.List[i].Level==-1) A.List[i].Level = 0;
258 }
259
260 A.UpdateAlarmSummary();
261 sleep(max(atoi(A.GetConfig("period").c_str()), MIN_PERIOD));
262 }
263}
Note: See TracBrowser for help on using the repository browser.