This source file includes the following definitions.
- is_matched_failure
- block_failure
- rsc_fail_name
- generate_fail_regex
- generate_fail_regexes
- update_failcount_for_attr
- update_launched_failcount
- pe_get_failcount
- pe__clear_failcount
1
2
3
4
5
6
7
8 #include <crm_internal.h>
9
10 #include <sys/types.h>
11 #include <regex.h>
12 #include <glib.h>
13
14 #include <crm/crm.h>
15 #include <crm/common/xml.h>
16 #include <crm/common/util.h>
17 #include <crm/pengine/internal.h>
18
19 static gboolean
20 is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
21 const xmlNode *lrm_op_xml)
22 {
23 gboolean matched = FALSE;
24 const char *conf_op_name = NULL;
25 const char *lrm_op_task = NULL;
26 const char *conf_op_interval_spec = NULL;
27 guint conf_op_interval_ms = 0;
28 guint lrm_op_interval_ms = 0;
29 const char *lrm_op_id = NULL;
30 char *last_failure_key = NULL;
31
32 if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
33 return FALSE;
34 }
35
36
37 conf_op_name = crm_element_value(conf_op_xml, PCMK_XA_NAME);
38 conf_op_interval_spec = crm_element_value(conf_op_xml, PCMK_META_INTERVAL);
39 pcmk_parse_interval_spec(conf_op_interval_spec, &conf_op_interval_ms);
40
41
42 lrm_op_task = crm_element_value(lrm_op_xml, PCMK_XA_OPERATION);
43 crm_element_value_ms(lrm_op_xml, PCMK_META_INTERVAL, &lrm_op_interval_ms);
44
45 if ((conf_op_interval_ms != lrm_op_interval_ms)
46 || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
47 return FALSE;
48 }
49
50 lrm_op_id = pcmk__xe_id(lrm_op_xml);
51 last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
52
53 if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
54 matched = TRUE;
55
56 } else {
57 char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
58 conf_op_interval_ms);
59
60 if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
61 int rc = 0;
62 int target_rc = pe__target_rc_from_xml(lrm_op_xml);
63
64 crm_element_value_int(lrm_op_xml, PCMK__XA_RC_CODE, &rc);
65 if (rc != target_rc) {
66 matched = TRUE;
67 }
68 }
69 free(expected_op_key);
70 }
71
72 free(last_failure_key);
73 return matched;
74 }
75
76 static gboolean
77 block_failure(const pcmk_node_t *node, pcmk_resource_t *rsc,
78 const xmlNode *xml_op)
79 {
80 char *xml_name = clone_strip(rsc->id);
81
82
83
84
85
86
87
88
89
90
91
92 char *xpath = crm_strdup_printf("//" PCMK_XE_PRIMITIVE
93 "[@" PCMK_XA_ID "='%s']"
94 "//" PCMK_XE_OP
95 "[@" PCMK_META_ON_FAIL
96 "='" PCMK_VALUE_BLOCK "']",
97 xml_name);
98
99 xmlXPathObject *xpathObj = xpath_search(rsc->priv->xml, xpath);
100 gboolean should_block = FALSE;
101
102 free(xpath);
103
104 if (xpathObj) {
105 int max = numXpathResults(xpathObj);
106 int lpc = 0;
107
108 for (lpc = 0; lpc < max; lpc++) {
109 xmlNode *pref = getXpathResult(xpathObj, lpc);
110
111 if (xml_op) {
112 should_block = is_matched_failure(xml_name, pref, xml_op);
113 if (should_block) {
114 break;
115 }
116
117 } else {
118 const char *conf_op_name = NULL;
119 const char *conf_op_interval_spec = NULL;
120 guint conf_op_interval_ms = 0;
121 char *lrm_op_xpath = NULL;
122 xmlXPathObject *lrm_op_xpathObj = NULL;
123
124
125 conf_op_name = crm_element_value(pref, PCMK_XA_NAME);
126 conf_op_interval_spec = crm_element_value(pref,
127 PCMK_META_INTERVAL);
128 pcmk_parse_interval_spec(conf_op_interval_spec,
129 &conf_op_interval_ms);
130
131 #define XPATH_FMT "//" PCMK__XE_NODE_STATE "[@" PCMK_XA_UNAME "='%s']" \
132 "//" PCMK__XE_LRM_RESOURCE "[@" PCMK_XA_ID "='%s']" \
133 "/" PCMK__XE_LRM_RSC_OP "[@" PCMK_XA_OPERATION "='%s']" \
134 "[@" PCMK_META_INTERVAL "='%u']"
135
136 lrm_op_xpath = crm_strdup_printf(XPATH_FMT,
137 node->priv->name, xml_name,
138 conf_op_name,
139 conf_op_interval_ms);
140 lrm_op_xpathObj = xpath_search(rsc->priv->scheduler->input,
141 lrm_op_xpath);
142
143 free(lrm_op_xpath);
144
145 if (lrm_op_xpathObj) {
146 int max2 = numXpathResults(lrm_op_xpathObj);
147 int lpc2 = 0;
148
149 for (lpc2 = 0; lpc2 < max2; lpc2++) {
150 xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
151 lpc2);
152
153 should_block = is_matched_failure(xml_name, pref,
154 lrm_op_xml);
155 if (should_block) {
156 break;
157 }
158 }
159 }
160 freeXpathObject(lrm_op_xpathObj);
161
162 if (should_block) {
163 break;
164 }
165 }
166 }
167 }
168
169 free(xml_name);
170 freeXpathObject(xpathObj);
171
172 return should_block;
173 }
174
175
176
177
178
179
180
181
182
183
184 static inline char *
185 rsc_fail_name(const pcmk_resource_t *rsc)
186 {
187 const char *name = pcmk__s(rsc->priv->history_id, rsc->id);
188
189 return pcmk_is_set(rsc->flags, pcmk__rsc_unique)? strdup(name) : clone_strip(name);
190 }
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205 static int
206 generate_fail_regex(const char *prefix, const char *rsc_name, bool is_unique,
207 regex_t *re)
208 {
209 char *pattern = NULL;
210 const char *op_pattern = "#.+_[0-9]+";
211
212
213
214
215
216 const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
217
218 pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
219 instance_pattern, op_pattern);
220 if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) {
221 free(pattern);
222 return EINVAL;
223 }
224
225 free(pattern);
226 return pcmk_rc_ok;
227 }
228
229
230
231
232
233
234
235
236
237
238
239
240
241 static int
242 generate_fail_regexes(const pcmk_resource_t *rsc, regex_t *failcount_re,
243 regex_t *lastfailure_re)
244 {
245 int rc = pcmk_rc_ok;
246 char *rsc_name = rsc_fail_name(rsc);
247
248 if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name,
249 pcmk_is_set(rsc->flags, pcmk__rsc_unique),
250 failcount_re) != pcmk_rc_ok) {
251 rc = EINVAL;
252
253 } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
254 pcmk_is_set(rsc->flags, pcmk__rsc_unique),
255 lastfailure_re) != pcmk_rc_ok) {
256 rc = EINVAL;
257 regfree(failcount_re);
258 }
259
260 free(rsc_name);
261 return rc;
262 }
263
264
265 struct failcount_data {
266 const pcmk_node_t *node;
267 pcmk_resource_t *rsc;
268 uint32_t flags;
269 const xmlNode *xml_op;
270 regex_t failcount_re;
271 regex_t lastfailure_re;
272 int failcount;
273 time_t last_failure;
274 };
275
276
277
278
279
280
281
282
283
284 static void
285 update_failcount_for_attr(gpointer key, gpointer value, gpointer user_data)
286 {
287 struct failcount_data *fc_data = user_data;
288
289
290 if (regexec(&(fc_data->failcount_re), (const char *) key, 0, NULL, 0) == 0) {
291 int score = 0;
292 int rc = pcmk_parse_score(value, &score, 0);
293
294 if (rc != pcmk_rc_ok) {
295 crm_warn("Ignoring %s for %s "
296 "because '%s' is not a valid fail count: %s",
297 (const char *) key, pcmk__node_name(fc_data->node),
298 value, pcmk_rc_str(rc));
299 return;
300 }
301 fc_data->failcount = pcmk__add_scores(fc_data->failcount, score);
302 pcmk__rsc_trace(fc_data->rsc, "Added %s (%s) to %s fail count (now %s)",
303 (const char *) key, (const char *) value,
304 fc_data->rsc->id,
305 pcmk_readable_score(fc_data->failcount));
306 return;
307 }
308
309
310 if (regexec(&(fc_data->lastfailure_re), (const char *) key, 0, NULL,
311 0) == 0) {
312 long long last_ll;
313 int rc = pcmk__scan_ll(value, &last_ll, 0LL);
314
315 if (rc != pcmk_rc_ok) {
316 crm_info("Ignoring invalid value '%s' for %s: %s",
317 (const char *) value, (const char *) key, pcmk_rc_str(rc));
318 return;
319 }
320 fc_data->last_failure = (time_t) QB_MAX(fc_data->last_failure, last_ll);
321 }
322 }
323
324
325
326
327
328
329
330
331 static void
332 update_launched_failcount(gpointer data, gpointer user_data)
333 {
334 pcmk_resource_t *launched = data;
335 struct failcount_data *fc_data = user_data;
336 time_t launched_last_failure = 0;
337
338 fc_data->failcount += pe_get_failcount(fc_data->node, launched,
339 &launched_last_failure,
340 fc_data->flags, fc_data->xml_op);
341 fc_data->last_failure = QB_MAX(fc_data->last_failure, launched_last_failure);
342 }
343
// Human-friendly string for a resource's failure expiration interval
#define readable_expiration(r)                                      \
    pcmk__readable_interval((r)->priv->failure_expiration_ms)
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363 int
364 pe_get_failcount(const pcmk_node_t *node, pcmk_resource_t *rsc,
365 time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
366 {
367 struct failcount_data fc_data = {
368 .node = node,
369 .rsc = rsc,
370 .flags = flags,
371 .xml_op = xml_op,
372 .failcount = 0,
373 .last_failure = (time_t) 0,
374 };
375
376
377 CRM_CHECK(generate_fail_regexes(rsc, &fc_data.failcount_re,
378 &fc_data.lastfailure_re) == pcmk_rc_ok,
379 return 0);
380 g_hash_table_foreach(node->priv->attrs, update_failcount_for_attr,
381 &fc_data);
382 regfree(&(fc_data.failcount_re));
383 regfree(&(fc_data.lastfailure_re));
384
385
386 if ((fc_data.failcount > 0) && (rsc->priv->failure_expiration_ms > 0)
387 && block_failure(node, rsc, xml_op)) {
388
389 pcmk__config_warn("Ignoring failure timeout (%s) for %s "
390 "because it conflicts with "
391 PCMK_META_ON_FAIL "=" PCMK_VALUE_BLOCK,
392 readable_expiration(rsc), rsc->id);
393 rsc->priv->failure_expiration_ms = 0;
394 }
395
396
397 if (pcmk_is_set(flags, pcmk__fc_effective) && (fc_data.failcount > 0)
398 && (fc_data.last_failure > 0)
399 && (rsc->priv->failure_expiration_ms > 0)) {
400
401 time_t now = get_effective_time(rsc->priv->scheduler);
402 const guint expiration = pcmk__timeout_ms2s(rsc->priv->failure_expiration_ms);
403
404 if (now > (fc_data.last_failure + expiration)) {
405 pcmk__rsc_debug(rsc, "Failcount for %s on %s expired after %s",
406 rsc->id, pcmk__node_name(node),
407 readable_expiration(rsc));
408 fc_data.failcount = 0;
409 }
410 }
411
412
413
414
415
416
417
418
419
420
421
422 if (pcmk_is_set(flags, pcmk__fc_launched)
423 && (rsc->priv->launched != NULL) && !pcmk__is_bundled(rsc)) {
424
425 g_list_foreach(rsc->priv->launched, update_launched_failcount,
426 &fc_data);
427 if (fc_data.failcount > 0) {
428 pcmk__rsc_info(rsc,
429 "Container %s and the resources within it "
430 "have failed %s time%s on %s",
431 rsc->id, pcmk_readable_score(fc_data.failcount),
432 pcmk__plural_s(fc_data.failcount),
433 pcmk__node_name(node));
434 }
435
436 } else if (fc_data.failcount > 0) {
437 pcmk__rsc_info(rsc, "%s has failed %s time%s on %s",
438 rsc->id, pcmk_readable_score(fc_data.failcount),
439 pcmk__plural_s(fc_data.failcount),
440 pcmk__node_name(node));
441 }
442
443 if (last_failure != NULL) {
444 if ((fc_data.failcount > 0) && (fc_data.last_failure > 0)) {
445 *last_failure = fc_data.last_failure;
446 } else {
447 *last_failure = 0;
448 }
449 }
450 return fc_data.failcount;
451 }
452
453
454
455
456
457
458
459
460
461
462
463 pcmk_action_t *
464 pe__clear_failcount(pcmk_resource_t *rsc, const pcmk_node_t *node,
465 const char *reason, pcmk_scheduler_t *scheduler)
466 {
467 char *key = NULL;
468 pcmk_action_t *clear = NULL;
469
470 CRM_CHECK(rsc && node && reason && scheduler, return NULL);
471
472 key = pcmk__op_key(rsc->id, PCMK_ACTION_CLEAR_FAILCOUNT, 0);
473 clear = custom_action(rsc, key, PCMK_ACTION_CLEAR_FAILCOUNT, node, FALSE,
474 scheduler);
475 pcmk__insert_meta(clear, PCMK__META_OP_NO_WAIT, PCMK_VALUE_TRUE);
476 crm_notice("Clearing failure of %s on %s because %s " QB_XS " %s",
477 rsc->id, pcmk__node_name(node), reason, clear->uuid);
478 return clear;
479 }