This source file includes the following definitions.
- is_matched_failure
- block_failure
- rsc_fail_name
- generate_fail_regex
- generate_fail_regexes
- update_failcount_for_attr
- update_failcount_for_filler
- pe_get_failcount
- pe__clear_failcount
1
2
3
4
5
6
7
8 #include <crm_internal.h>
9
10 #include <sys/types.h>
11 #include <regex.h>
12 #include <glib.h>
13
14 #include <crm/crm.h>
15 #include <crm/msg_xml.h>
16 #include <crm/common/xml.h>
17 #include <crm/common/util.h>
18 #include <crm/pengine/internal.h>
19
20 static gboolean
21 is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
22 const xmlNode *lrm_op_xml)
23 {
24 gboolean matched = FALSE;
25 const char *conf_op_name = NULL;
26 const char *lrm_op_task = NULL;
27 const char *conf_op_interval_spec = NULL;
28 guint conf_op_interval_ms = 0;
29 guint lrm_op_interval_ms = 0;
30 const char *lrm_op_id = NULL;
31 char *last_failure_key = NULL;
32
33 if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
34 return FALSE;
35 }
36
37
38 conf_op_name = crm_element_value(conf_op_xml, "name");
39 conf_op_interval_spec = crm_element_value(conf_op_xml,
40 XML_LRM_ATTR_INTERVAL);
41 conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
42
43
44 lrm_op_task = crm_element_value(lrm_op_xml, XML_LRM_ATTR_TASK);
45 crm_element_value_ms(lrm_op_xml, XML_LRM_ATTR_INTERVAL_MS,
46 &lrm_op_interval_ms);
47
48 if ((conf_op_interval_ms != lrm_op_interval_ms)
49 || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
50 return FALSE;
51 }
52
53 lrm_op_id = ID(lrm_op_xml);
54 last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
55
56 if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
57 matched = TRUE;
58
59 } else {
60 char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
61 conf_op_interval_ms);
62
63 if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
64 int rc = 0;
65 int target_rc = pe__target_rc_from_xml(lrm_op_xml);
66
67 crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc);
68 if (rc != target_rc) {
69 matched = TRUE;
70 }
71 }
72 free(expected_op_key);
73 }
74
75 free(last_failure_key);
76 return matched;
77 }
78
79 static gboolean
80 block_failure(const pcmk_node_t *node, pcmk_resource_t *rsc,
81 const xmlNode *xml_op)
82 {
83 char *xml_name = clone_strip(rsc->id);
84
85
86
87
88
89
90
91
92
93
94
95 char *xpath = crm_strdup_printf("//" XML_CIB_TAG_RESOURCE
96 "[@" XML_ATTR_ID "='%s']"
97 "//" XML_ATTR_OP
98 "[@" XML_OP_ATTR_ON_FAIL "='block']",
99 xml_name);
100
101 xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
102 gboolean should_block = FALSE;
103
104 free(xpath);
105
106 if (xpathObj) {
107 int max = numXpathResults(xpathObj);
108 int lpc = 0;
109
110 for (lpc = 0; lpc < max; lpc++) {
111 xmlNode *pref = getXpathResult(xpathObj, lpc);
112
113 if (xml_op) {
114 should_block = is_matched_failure(xml_name, pref, xml_op);
115 if (should_block) {
116 break;
117 }
118
119 } else {
120 const char *conf_op_name = NULL;
121 const char *conf_op_interval_spec = NULL;
122 guint conf_op_interval_ms = 0;
123 char *lrm_op_xpath = NULL;
124 xmlXPathObject *lrm_op_xpathObj = NULL;
125
126
127 conf_op_name = crm_element_value(pref, "name");
128 conf_op_interval_spec = crm_element_value(pref, XML_LRM_ATTR_INTERVAL);
129 conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
130
131 #define XPATH_FMT "//" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']" \
132 "//" XML_LRM_TAG_RESOURCE "[@" XML_ATTR_ID "='%s']" \
133 "/" XML_LRM_TAG_RSC_OP "[@" XML_LRM_ATTR_TASK "='%s']" \
134 "[@" XML_LRM_ATTR_INTERVAL "='%u']"
135
136 lrm_op_xpath = crm_strdup_printf(XPATH_FMT,
137 node->details->uname, xml_name,
138 conf_op_name,
139 conf_op_interval_ms);
140 lrm_op_xpathObj = xpath_search(rsc->cluster->input, lrm_op_xpath);
141
142 free(lrm_op_xpath);
143
144 if (lrm_op_xpathObj) {
145 int max2 = numXpathResults(lrm_op_xpathObj);
146 int lpc2 = 0;
147
148 for (lpc2 = 0; lpc2 < max2; lpc2++) {
149 xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
150 lpc2);
151
152 should_block = is_matched_failure(xml_name, pref,
153 lrm_op_xml);
154 if (should_block) {
155 break;
156 }
157 }
158 }
159 freeXpathObject(lrm_op_xpathObj);
160
161 if (should_block) {
162 break;
163 }
164 }
165 }
166 }
167
168 free(xml_name);
169 freeXpathObject(xpathObj);
170
171 return should_block;
172 }
173
174
175
176
177
178
179
180
181
182
183 static inline char *
184 rsc_fail_name(const pcmk_resource_t *rsc)
185 {
186 const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
187
188 return pcmk_is_set(rsc->flags, pcmk_rsc_unique)? strdup(name) : clone_strip(name);
189 }
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205 static int
206 generate_fail_regex(const char *prefix, const char *rsc_name,
207 gboolean is_legacy, gboolean is_unique, regex_t *re)
208 {
209 char *pattern;
210
211
212
213
214 const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
215
216
217
218
219
220
221
222 const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
223
224 pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
225 instance_pattern, op_pattern);
226 if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) {
227 free(pattern);
228 return EINVAL;
229 }
230
231 free(pattern);
232 return pcmk_rc_ok;
233 }
234
235
236
237
238
239
240
241
242
243
244
245
246
247 static int
248 generate_fail_regexes(const pcmk_resource_t *rsc,
249 regex_t *failcount_re, regex_t *lastfailure_re)
250 {
251 int rc = pcmk_rc_ok;
252 char *rsc_name = rsc_fail_name(rsc);
253 const char *version = crm_element_value(rsc->cluster->input,
254 XML_ATTR_CRM_VERSION);
255
256
257 gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
258
259 if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy,
260 pcmk_is_set(rsc->flags, pcmk_rsc_unique),
261 failcount_re) != pcmk_rc_ok) {
262 rc = EINVAL;
263
264 } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
265 is_legacy,
266 pcmk_is_set(rsc->flags, pcmk_rsc_unique),
267 lastfailure_re) != pcmk_rc_ok) {
268 rc = EINVAL;
269 regfree(failcount_re);
270 }
271
272 free(rsc_name);
273 return rc;
274 }
275
276
277 struct failcount_data {
278 const pcmk_node_t *node;
279 pcmk_resource_t *rsc;
280 uint32_t flags;
281 const xmlNode *xml_op;
282 regex_t failcount_re;
283 regex_t lastfailure_re;
284 int failcount;
285 time_t last_failure;
286 };
287
288
289
290
291
292
293
294
295
296 static void
297 update_failcount_for_attr(gpointer key, gpointer value, gpointer user_data)
298 {
299 struct failcount_data *fc_data = user_data;
300
301
302 if (regexec(&(fc_data->failcount_re), (const char *) key, 0, NULL, 0) == 0) {
303 fc_data->failcount = pcmk__add_scores(fc_data->failcount,
304 char2score(value));
305 pe_rsc_trace(fc_data->rsc, "Added %s (%s) to %s fail count (now %s)",
306 (const char *) key, (const char *) value, fc_data->rsc->id,
307 pcmk_readable_score(fc_data->failcount));
308 return;
309 }
310
311
312 if (regexec(&(fc_data->lastfailure_re), (const char *) key, 0, NULL,
313 0) == 0) {
314 long long last_ll;
315
316 if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) {
317 fc_data->last_failure = (time_t) QB_MAX(fc_data->last_failure,
318 last_ll);
319 }
320 }
321 }
322
323
324
325
326
327
328
329
330 static void
331 update_failcount_for_filler(gpointer data, gpointer user_data)
332 {
333 pcmk_resource_t *filler = data;
334 struct failcount_data *fc_data = user_data;
335 time_t filler_last_failure = 0;
336
337 fc_data->failcount += pe_get_failcount(fc_data->node, filler,
338 &filler_last_failure, fc_data->flags,
339 fc_data->xml_op);
340 fc_data->last_failure = QB_MAX(fc_data->last_failure, filler_last_failure);
341 }
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359 int
360 pe_get_failcount(const pcmk_node_t *node, pcmk_resource_t *rsc,
361 time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
362 {
363 struct failcount_data fc_data = {
364 .node = node,
365 .rsc = rsc,
366 .flags = flags,
367 .xml_op = xml_op,
368 .failcount = 0,
369 .last_failure = (time_t) 0,
370 };
371
372
373 CRM_CHECK(generate_fail_regexes(rsc, &fc_data.failcount_re,
374 &fc_data.lastfailure_re) == pcmk_rc_ok,
375 return 0);
376 g_hash_table_foreach(node->details->attrs, update_failcount_for_attr,
377 &fc_data);
378 regfree(&(fc_data.failcount_re));
379 regfree(&(fc_data.lastfailure_re));
380
381
382 if ((fc_data.failcount > 0) && (rsc->failure_timeout > 0)
383 && block_failure(node, rsc, xml_op)) {
384
385 pe_warn("Ignoring failure timeout %d for %s "
386 "because it conflicts with on-fail=block",
387 rsc->failure_timeout, rsc->id);
388 rsc->failure_timeout = 0;
389 }
390
391
392 if (pcmk_is_set(flags, pcmk__fc_effective) && (fc_data.failcount > 0)
393 && (fc_data.last_failure > 0) && (rsc->failure_timeout != 0)) {
394
395 time_t now = get_effective_time(rsc->cluster);
396
397 if (now > (fc_data.last_failure + rsc->failure_timeout)) {
398 pe_rsc_debug(rsc, "Failcount for %s on %s expired after %ds",
399 rsc->id, pe__node_name(node), rsc->failure_timeout);
400 fc_data.failcount = 0;
401 }
402 }
403
404
405
406
407
408
409
410
411
412
413
414 if (pcmk_is_set(flags, pcmk__fc_fillers) && (rsc->fillers != NULL)
415 && !pe_rsc_is_bundled(rsc)) {
416
417 g_list_foreach(rsc->fillers, update_failcount_for_filler, &fc_data);
418 if (fc_data.failcount > 0) {
419 pe_rsc_info(rsc,
420 "Container %s and the resources within it "
421 "have failed %s time%s on %s",
422 rsc->id, pcmk_readable_score(fc_data.failcount),
423 pcmk__plural_s(fc_data.failcount), pe__node_name(node));
424 }
425
426 } else if (fc_data.failcount > 0) {
427 pe_rsc_info(rsc, "%s has failed %s time%s on %s",
428 rsc->id, pcmk_readable_score(fc_data.failcount),
429 pcmk__plural_s(fc_data.failcount), pe__node_name(node));
430 }
431
432 if (last_failure != NULL) {
433 if ((fc_data.failcount > 0) && (fc_data.last_failure > 0)) {
434 *last_failure = fc_data.last_failure;
435 } else {
436 *last_failure = 0;
437 }
438 }
439 return fc_data.failcount;
440 }
441
442
443
444
445
446
447
448
449
450
451
452 pcmk_action_t *
453 pe__clear_failcount(pcmk_resource_t *rsc, const pcmk_node_t *node,
454 const char *reason, pcmk_scheduler_t *scheduler)
455 {
456 char *key = NULL;
457 pcmk_action_t *clear = NULL;
458
459 CRM_CHECK(rsc && node && reason && scheduler, return NULL);
460
461 key = pcmk__op_key(rsc->id, PCMK_ACTION_CLEAR_FAILCOUNT, 0);
462 clear = custom_action(rsc, key, PCMK_ACTION_CLEAR_FAILCOUNT, node, FALSE,
463 scheduler);
464 add_hash_param(clear->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
465 crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
466 rsc->id, pe__node_name(node), reason, clear->uuid);
467 return clear;
468 }