/********************************************************************\

  Alarm handler of the Evidence Control System

  - Checks periodically if all required servers are up.
  - Listens to the 'Message' service of each server and generates a new service for
    each observed server indicating the maximum severity in the past.
  - Maximum severity may be reset by a command 'Alarm/ResetAlarm' for a server.
  - A text describing the current state of all servers is published as a DIM service.
  - A master alarm (indicating the most severe of the individual alarms) is published.
  - The server can be switched on/off with the command 'Alarm/Switch'.

  A mutex is used because UpdateAlarmSummary() may be called from the DIM handler thread
  and from the main thread.

  Oliver Grimm, February 2011

\********************************************************************/
19 |
|
---|
20 | #define SERVER_NAME "Alarm"
|
---|
21 | #include "Evidence.h"
|
---|
22 |
|
---|
23 | #include <sstream>
|
---|
24 |
|
---|
25 | using namespace std;
|
---|
26 |
|
---|
// Shortest allowed interval, in seconds, between checks that servers are alive
const int MIN_PERIOD = 5;

// Alarm level assigned to a server that is unavailable
const int UNAVA = 255;
29 |
|
---|
30 | //
|
---|
31 | // Class declaration
|
---|
32 | //
|
---|
33 | class AlarmHandler: public DimClient, public EvidenceServer {
|
---|
34 |
|
---|
35 | DimCommand *ResetCommand;
|
---|
36 | DimCommand *SwitchCommand;
|
---|
37 | DimService *Summary, *Master;
|
---|
38 | char *AlarmText;
|
---|
39 | int MasterAlarm;
|
---|
40 |
|
---|
41 | void infoHandler();
|
---|
42 | void commandHandler();
|
---|
43 |
|
---|
44 | public:
|
---|
45 | AlarmHandler();
|
---|
46 | ~AlarmHandler();
|
---|
47 |
|
---|
48 | struct Item {
|
---|
49 | string Server;
|
---|
50 | string Email;
|
---|
51 | DimStampedInfo *Subscription;
|
---|
52 | DimService *AlarmLevel;
|
---|
53 | int WarnedLevel;
|
---|
54 | int Level;
|
---|
55 | };
|
---|
56 | vector<struct Item> List;
|
---|
57 | bool Active;
|
---|
58 |
|
---|
59 | void UpdateAlarmSummary();
|
---|
60 | };
|
---|
61 |
|
---|
62 | // Constructor
|
---|
63 | AlarmHandler::AlarmHandler(): EvidenceServer(SERVER_NAME) {
|
---|
64 |
|
---|
65 | struct Item N;
|
---|
66 | static int InitLevel = -1; // static for DIM service below
|
---|
67 |
|
---|
68 | // Initialise
|
---|
69 | MasterAlarm = 0;
|
---|
70 | AlarmText = NULL;
|
---|
71 | Active = true;
|
---|
72 |
|
---|
73 | // Handling of servies will only start after start()
|
---|
74 | autoStartOff();
|
---|
75 |
|
---|
76 | // Create DIM services
|
---|
77 | Summary = new DimService(SERVER_NAME"/Summary", (char *) "not yet available");
|
---|
78 | Master = new DimService(SERVER_NAME"/MasterAlarm", MasterAlarm);
|
---|
79 |
|
---|
80 | // Get DIM servers to observe
|
---|
81 | vector<string> Token = Tokenize(GetConfig("servers"));
|
---|
82 |
|
---|
83 | for (int i=0; i<Token.size(); i++) {
|
---|
84 | // Extract server name and email
|
---|
85 | vector<string> A = Tokenize(Token[i], ":");
|
---|
86 | N.Server = A[0];
|
---|
87 | if (A.size() == 2) N.Email = A[1];
|
---|
88 | else N.Email = string();
|
---|
89 |
|
---|
90 | // DIS_DNS has no Message service
|
---|
91 | if (N.Server == "DIS_DNS") N.Subscription = NULL;
|
---|
92 | else N.Subscription = new DimStampedInfo((N.Server+"/Message").c_str(), NO_LINK, this);
|
---|
93 |
|
---|
94 | // Alarm service for server (reference to variable will be updated in UpdateAlarmSummary())
|
---|
95 | N.WarnedLevel = 0;
|
---|
96 | N.Level = -1;
|
---|
97 | N.AlarmLevel = new DimService((N.Server+"/AlarmLevel").c_str(), InitLevel);
|
---|
98 |
|
---|
99 | List.push_back(N);
|
---|
100 | }
|
---|
101 |
|
---|
102 | // Provide command to reset Level
|
---|
103 | ResetCommand = new DimCommand(SERVER_NAME"/ResetAlarm", (char *) "C", this);
|
---|
104 | SwitchCommand = new DimCommand(SERVER_NAME"/Switch", (char *) "C", this);
|
---|
105 |
|
---|
106 | // List set up, can start handling
|
---|
107 | start(SERVER_NAME);
|
---|
108 | }
|
---|
109 |
|
---|
110 |
|
---|
111 | // Destructor
|
---|
112 | AlarmHandler::~AlarmHandler() {
|
---|
113 |
|
---|
114 | delete SwitchCommand;
|
---|
115 | delete ResetCommand;
|
---|
116 |
|
---|
117 | for (int i=0; i<List.size(); i++) {
|
---|
118 | delete List[i].Subscription;
|
---|
119 | delete List[i].AlarmLevel;
|
---|
120 | }
|
---|
121 | delete Master;
|
---|
122 | delete Summary;
|
---|
123 | delete[] AlarmText;
|
---|
124 | }
|
---|
125 |
|
---|
126 |
|
---|
127 | // Print messages of status changes to screen and update status string
|
---|
128 | void AlarmHandler::infoHandler() {
|
---|
129 |
|
---|
130 | // Check if alarm server active
|
---|
131 | if (!Active) return;
|
---|
132 |
|
---|
133 | // Identify status service
|
---|
134 | for (int i=0; i<List.size(); i++) if (getInfo() == List[i].Subscription) {
|
---|
135 | // Update level: unavailable or current severity of status (safely extracted)
|
---|
136 | if (!ServiceOK(getInfo())) List[i].Level = UNAVA;
|
---|
137 | else {
|
---|
138 | int Severity = atoi(ToString(getInfo()->getFormat(), getInfo()->getData(), getInfo()->getSize()).c_str());
|
---|
139 | if ((Severity>List[i].Level) || (List[i].Level==UNAVA && Severity==0)) List[i].Level = Severity;
|
---|
140 | }
|
---|
141 | }
|
---|
142 |
|
---|
143 | UpdateAlarmSummary();
|
---|
144 | }
|
---|
145 |
|
---|
146 |
|
---|
147 | // Handle commands
|
---|
148 | void AlarmHandler::commandHandler() {
|
---|
149 |
|
---|
150 | string Text = ToString((char *) "C", getCommand()->getData(), getCommand()->getSize());
|
---|
151 |
|
---|
152 | // Reset alarm level, publish/log action and reset server message severity
|
---|
153 | if (getCommand() == ResetCommand) {
|
---|
154 | for (int i=0; i<List.size(); i++) if (List[i].Server == Text) {
|
---|
155 | Message(INFO, "Alarm level of server %s reset by %s (ID %d)", Text.c_str(), getClientName(), getClientId());
|
---|
156 | List[i].Level = 0;
|
---|
157 | List[i].WarnedLevel = 0;
|
---|
158 | sendCommandNB((Text+"/ResetMessage").c_str(), (int) 0);
|
---|
159 | }
|
---|
160 | }
|
---|
161 |
|
---|
162 | // Switch Alarm server on/off and publish/log action
|
---|
163 | if (getCommand() == SwitchCommand) {
|
---|
164 | if (Text == "off") Active = false;
|
---|
165 | else Active = true;
|
---|
166 |
|
---|
167 | Message(INFO, "Alarm server switched %s by %s (ID %d)", Active ? "ON":"OFF", getClientName(), getClientId());
|
---|
168 | }
|
---|
169 |
|
---|
170 | UpdateAlarmSummary();
|
---|
171 | }
|
---|
172 |
|
---|
173 |
|
---|
174 | // Update alarm status summary (locking since access can be from main thread and DIM handler threads)
|
---|
175 | void AlarmHandler::UpdateAlarmSummary() {
|
---|
176 |
|
---|
177 | ostringstream Buf;
|
---|
178 | string Desc;
|
---|
179 | int Alarm = -1, Ret;
|
---|
180 |
|
---|
181 | Lock();
|
---|
182 |
|
---|
183 | if (!Active) Buf << "Alarm server inactive";
|
---|
184 | else for (int i=0; i<List.size(); i++) {
|
---|
185 | // Alarm level description
|
---|
186 | Buf << List[i].Server << ": ";
|
---|
187 | switch (List[i].Level) {
|
---|
188 | case INFO: Desc = "OK"; break;
|
---|
189 | case WARN: Desc = "WARN"; break;
|
---|
190 | case ERROR: Desc = "ERROR"; break;
|
---|
191 | case FATAL: Desc = "FATAL"; break;
|
---|
192 | case UNAVA: Desc = "UNAVAILABLE"; break;
|
---|
193 | default: Desc = "?"; break;
|
---|
194 | }
|
---|
195 | Buf << Desc << " (" << List[i].Level << ")" << endl;
|
---|
196 |
|
---|
197 | // Adjust master alarm and update server alarm level
|
---|
198 | if (List[i].Level > Alarm) Alarm = List[i].Level;
|
---|
199 | List[i].AlarmLevel->updateService(List[i].Level);
|
---|
200 |
|
---|
201 | // Check if alarm level raised, then send alarm message once
|
---|
202 | if (List[i].WarnedLevel < List[i].Level && !List[i].Email.empty()) {
|
---|
203 | List[i].WarnedLevel = List[i].Level;
|
---|
204 |
|
---|
205 | // Prepare email message
|
---|
206 | char *Text;
|
---|
207 | time_t Time = time(NULL);
|
---|
208 | if (asprintf(&Text, "echo \"Server alarm level '%s' (%d) at %s\"|"
|
---|
209 | "mail -s \"Evidence Alarm for '%s'\" %s",
|
---|
210 | Desc.c_str(), List[i].Level, ctime(&Time), List[i].Server.c_str(), List[i].Email.c_str()) != -1) {
|
---|
211 | system(Text); // Return value depending on OS
|
---|
212 | free(Text);
|
---|
213 | }
|
---|
214 | else Message(ERROR, "Could not send alarm email, asprintf() failed");
|
---|
215 | }
|
---|
216 | }
|
---|
217 |
|
---|
218 | // Update master alarm services
|
---|
219 | MasterAlarm = Alarm;
|
---|
220 | Master->updateService();
|
---|
221 |
|
---|
222 | // Update alarm description (DIM requires variables to be valid until update)
|
---|
223 | char *Tmp = new char[Buf.str().size()+1];
|
---|
224 | strcpy(Tmp, Buf.str().c_str());
|
---|
225 | Summary->updateService(Tmp);
|
---|
226 |
|
---|
227 | delete[] AlarmText;
|
---|
228 | AlarmText = Tmp;
|
---|
229 |
|
---|
230 | Unlock();
|
---|
231 | }
|
---|
232 |
|
---|
233 | //
|
---|
234 | // Main program
|
---|
235 | //
|
---|
236 | int main() {
|
---|
237 |
|
---|
238 | DimBrowser B;
|
---|
239 | char *Server, *Node;
|
---|
240 | bool Exist;
|
---|
241 |
|
---|
242 | // Static declaration ensures calling of destructor by exit()
|
---|
243 | static AlarmHandler A;
|
---|
244 |
|
---|
245 | // Verify periodically that servers exist (if Alarm is active)
|
---|
246 | while(!A.ExitRequest) {
|
---|
247 | for (int i=0; i<A.List.size() && A.Active; i++) {
|
---|
248 | // Check if server exists
|
---|
249 | Exist = false;
|
---|
250 | B.getServers();
|
---|
251 | while (B.getNextServer(Server, Node) == 1) {
|
---|
252 | if (A.List[i].Server == Server) Exist = true;
|
---|
253 | }
|
---|
254 | if (!Exist) A.List[i].Level = UNAVA;
|
---|
255 |
|
---|
256 | // Check if standard service available in case server not yet checked (Level is -1)
|
---|
257 | if (B.getServices((A.List[i].Server+"/VERSION_NUMBER").c_str())>0 && A.List[i].Level==-1) A.List[i].Level = 0;
|
---|
258 | }
|
---|
259 |
|
---|
260 | A.UpdateAlarmSummary();
|
---|
261 | sleep(max(atoi(A.GetConfig("period").c_str()), MIN_PERIOD));
|
---|
262 | }
|
---|
263 | }
|
---|