pacemaker 3.0.1-16e74fc4da
Scalable High-Availability cluster resource manager
Loading...
Searching...
No Matches
failcounts.c
Go to the documentation of this file.
/*
 * Copyright 2008-2025 the Pacemaker project contributors
 *
 * This source code is licensed under the GNU Lesser General Public License
 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
 */
7
8#include <crm_internal.h>
9
10#include <sys/types.h>
11#include <regex.h>
12
13#include <glib.h>
14#include <libxml/xpath.h> // xmlXPathObject, etc.
15
16#include <crm/crm.h>
17#include <crm/common/xml.h>
18#include <crm/common/util.h>
20
21static gboolean
22is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
23 const xmlNode *lrm_op_xml)
24{
25 gboolean matched = FALSE;
26 const char *conf_op_name = NULL;
27 const char *lrm_op_task = NULL;
28 const char *conf_op_interval_spec = NULL;
29 guint conf_op_interval_ms = 0;
30 guint lrm_op_interval_ms = 0;
31 const char *lrm_op_id = NULL;
32 char *last_failure_key = NULL;
33
34 if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
35 return FALSE;
36 }
37
38 // Get name and interval from configured op
39 conf_op_name = crm_element_value(conf_op_xml, PCMK_XA_NAME);
40 conf_op_interval_spec = crm_element_value(conf_op_xml, PCMK_META_INTERVAL);
41 pcmk_parse_interval_spec(conf_op_interval_spec, &conf_op_interval_ms);
42
43 // Get name and interval from op history entry
44 lrm_op_task = crm_element_value(lrm_op_xml, PCMK_XA_OPERATION);
45 crm_element_value_ms(lrm_op_xml, PCMK_META_INTERVAL, &lrm_op_interval_ms);
46
47 if ((conf_op_interval_ms != lrm_op_interval_ms)
48 || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
49 return FALSE;
50 }
51
52 lrm_op_id = pcmk__xe_id(lrm_op_xml);
53 last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
54
55 if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
56 matched = TRUE;
57
58 } else {
59 char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
60 conf_op_interval_ms);
61
62 if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
63 int rc = 0;
64 int target_rc = pe__target_rc_from_xml(lrm_op_xml);
65
67 if (rc != target_rc) {
68 matched = TRUE;
69 }
70 }
71 free(expected_op_key);
72 }
73
74 free(last_failure_key);
75 return matched;
76}
77
78static gboolean
79block_failure(const pcmk_node_t *node, pcmk_resource_t *rsc,
80 const xmlNode *xml_op)
81{
82 char *xml_name = clone_strip(rsc->id);
83
84 /* @TODO This xpath search occurs after template expansion, but it is unable
85 * to properly detect on-fail in id-ref, operation meta-attributes, or
86 * op_defaults, or evaluate rules.
87 *
88 * Also, PCMK_META_ON_FAIL defaults to PCMK_VALUE_BLOCK (in
89 * unpack_operation()) for stop actions when stonith is disabled.
90 *
91 * Ideally, we'd unpack the operation before this point, and pass in a
92 * meta-attributes table that takes all that into consideration.
93 */
94 char *xpath = crm_strdup_printf("//" PCMK_XE_PRIMITIVE
95 "[@" PCMK_XA_ID "='%s']"
96 "//" PCMK_XE_OP
98 "='" PCMK_VALUE_BLOCK "']",
99 xml_name);
100
101 xmlXPathObject *xpathObj = pcmk__xpath_search(rsc->priv->xml->doc, xpath);
102 gboolean should_block = FALSE;
103
104 free(xpath);
105
106 if (xpathObj) {
107 int max = pcmk__xpath_num_results(xpathObj);
108 int lpc = 0;
109
110 for (lpc = 0; lpc < max; lpc++) {
111 xmlNode *pref = pcmk__xpath_result(xpathObj, lpc);
112
113 if (xml_op) {
114 should_block = is_matched_failure(xml_name, pref, xml_op);
115 if (should_block) {
116 break;
117 }
118
119 } else {
120 const char *conf_op_name = NULL;
121 const char *conf_op_interval_spec = NULL;
122 guint conf_op_interval_ms = 0;
124 char *lrm_op_xpath = NULL;
125 xmlXPathObject *lrm_op_xpathObj = NULL;
126
127 // Get name and interval from configured op
128 conf_op_name = crm_element_value(pref, PCMK_XA_NAME);
129 conf_op_interval_spec = crm_element_value(pref,
131 pcmk_parse_interval_spec(conf_op_interval_spec,
132 &conf_op_interval_ms);
133
134#define XPATH_FMT "//" PCMK__XE_NODE_STATE "[@" PCMK_XA_UNAME "='%s']" \
135 "//" PCMK__XE_LRM_RESOURCE "[@" PCMK_XA_ID "='%s']" \
136 "/" PCMK__XE_LRM_RSC_OP "[@" PCMK_XA_OPERATION "='%s']" \
137 "[@" PCMK_META_INTERVAL "='%u']"
138
139 lrm_op_xpath = crm_strdup_printf(XPATH_FMT,
140 node->priv->name, xml_name,
141 conf_op_name,
142 conf_op_interval_ms);
143 lrm_op_xpathObj = pcmk__xpath_search(scheduler->input->doc,
144 lrm_op_xpath);
145
146 free(lrm_op_xpath);
147
148 if (lrm_op_xpathObj) {
149 int max2 = pcmk__xpath_num_results(lrm_op_xpathObj);
150 int lpc2 = 0;
151
152 for (lpc2 = 0; lpc2 < max2; lpc2++) {
153 xmlNode *lrm_op_xml = NULL;
154
155 lrm_op_xml = pcmk__xpath_result(lrm_op_xpathObj, lpc2);
156 should_block = is_matched_failure(xml_name, pref,
157 lrm_op_xml);
158 if (should_block) {
159 break;
160 }
161 }
162 }
163 xmlXPathFreeObject(lrm_op_xpathObj);
164
165 if (should_block) {
166 break;
167 }
168 }
169 }
170 }
171
172 free(xml_name);
173 xmlXPathFreeObject(xpathObj);
174
175 return should_block;
176}
177
187static inline char *
188rsc_fail_name(const pcmk_resource_t *rsc)
189{
190 const char *name = pcmk__s(rsc->priv->history_id, rsc->id);
191
192 return pcmk_is_set(rsc->flags, pcmk__rsc_unique)? strdup(name) : clone_strip(name);
193}
194
208static int
209generate_fail_regex(const char *prefix, const char *rsc_name, bool is_unique,
210 regex_t *re)
211{
212 char *pattern = NULL;
213 const char *op_pattern = "#.+_[0-9]+";
214
215 /* Ignore instance numbers for anything other than globally unique clones.
216 * Anonymous clone fail counts could contain an instance number if the
217 * clone was initially unique, failed, then was converted to anonymous.
218 */
219 const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
220
221 pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
222 instance_pattern, op_pattern);
223 if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) {
224 free(pattern);
225 return EINVAL;
226 }
227
228 free(pattern);
229 return pcmk_rc_ok;
230}
231
244static int
245generate_fail_regexes(const pcmk_resource_t *rsc, regex_t *failcount_re,
246 regex_t *lastfailure_re)
247{
248 int rc = pcmk_rc_ok;
249 char *rsc_name = rsc_fail_name(rsc);
250
251 if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name,
253 failcount_re) != pcmk_rc_ok) {
254 rc = EINVAL;
255
256 } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
258 lastfailure_re) != pcmk_rc_ok) {
259 rc = EINVAL;
260 regfree(failcount_re);
261 }
262
263 free(rsc_name);
264 return rc;
265}
266
267// Data for fail-count-related iterators
268struct failcount_data {
269 const pcmk_node_t *node;// Node to check for fail count
270 pcmk_resource_t *rsc; // Resource to check for fail count
271 uint32_t flags; // Fail count flags
272 const xmlNode *xml_op; // History entry for expiration purposes (or NULL)
273 regex_t failcount_re; // Fail count regular expression to match
274 regex_t lastfailure_re; // Last failure regular expression to match
275 int failcount; // Fail count so far
276 time_t last_failure; // Time of most recent failure so far
277};
278
287static void
288update_failcount_for_attr(gpointer key, gpointer value, gpointer user_data)
289{
290 struct failcount_data *fc_data = user_data;
291
292 // If this is a matching fail count attribute, update fail count
293 if (regexec(&(fc_data->failcount_re), (const char *) key, 0, NULL, 0) == 0) {
294 int score = 0;
295 int rc = pcmk_parse_score(value, &score, 0);
296
297 if (rc != pcmk_rc_ok) {
298 crm_warn("Ignoring %s for %s "
299 "because '%s' is not a valid fail count: %s",
300 (const char *) key, pcmk__node_name(fc_data->node),
301 value, pcmk_rc_str(rc));
302 return;
303 }
304 fc_data->failcount = pcmk__add_scores(fc_data->failcount, score);
305 pcmk__rsc_trace(fc_data->rsc, "Added %s (%s) to %s fail count (now %s)",
306 (const char *) key, (const char *) value,
307 fc_data->rsc->id,
308 pcmk_readable_score(fc_data->failcount));
309 return;
310 }
311
312 // If this is a matching last failure attribute, update last failure
313 if (regexec(&(fc_data->lastfailure_re), (const char *) key, 0, NULL,
314 0) == 0) {
315 long long last_ll;
316 int rc = pcmk__scan_ll(value, &last_ll, 0LL);
317
318 if (rc != pcmk_rc_ok) {
319 crm_info("Ignoring invalid value '%s' for %s: %s",
320 (const char *) value, (const char *) key, pcmk_rc_str(rc));
321 return;
322 }
323 fc_data->last_failure = (time_t) QB_MAX(fc_data->last_failure, last_ll);
324 }
325}
326
334static void
335update_launched_failcount(gpointer data, gpointer user_data)
336{
337 pcmk_resource_t *launched = data;
338 struct failcount_data *fc_data = user_data;
339 time_t launched_last_failure = 0;
340
341 fc_data->failcount += pe_get_failcount(fc_data->node, launched,
342 &launched_last_failure,
343 fc_data->flags, fc_data->xml_op);
344 fc_data->last_failure = QB_MAX(fc_data->last_failure, launched_last_failure);
345}
346
347#define readable_expiration(rsc) \
348 pcmk__readable_interval((rsc)->priv->failure_expiration_ms)
349
366int
368 time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
369{
370 struct failcount_data fc_data = {
371 .node = node,
372 .rsc = rsc,
373 .flags = flags,
374 .xml_op = xml_op,
375 .failcount = 0,
376 .last_failure = (time_t) 0,
377 };
378
379 // Calculate resource failcount as sum of all matching operation failcounts
380 CRM_CHECK(generate_fail_regexes(rsc, &fc_data.failcount_re,
381 &fc_data.lastfailure_re) == pcmk_rc_ok,
382 return 0);
383 g_hash_table_foreach(node->priv->attrs, update_failcount_for_attr,
384 &fc_data);
385 regfree(&(fc_data.failcount_re));
386 regfree(&(fc_data.lastfailure_re));
387
388 // If failure blocks the resource, disregard any failure timeout
389 if ((fc_data.failcount > 0) && (rsc->priv->failure_expiration_ms > 0)
390 && block_failure(node, rsc, xml_op)) {
391
392 pcmk__config_warn("Ignoring failure timeout (%s) for %s "
393 "because it conflicts with "
395 readable_expiration(rsc), rsc->id);
396 rsc->priv->failure_expiration_ms = 0;
397 }
398
399 // If all failures have expired, ignore fail count
400 if (pcmk_is_set(flags, pcmk__fc_effective) && (fc_data.failcount > 0)
401 && (fc_data.last_failure > 0)
402 && (rsc->priv->failure_expiration_ms > 0)) {
403
404 time_t now = pcmk__scheduler_epoch_time(rsc->priv->scheduler);
405 const guint expiration = pcmk__timeout_ms2s(rsc->priv->failure_expiration_ms);
406
407 if (now > (fc_data.last_failure + expiration)) {
408 pcmk__rsc_debug(rsc, "Failcount for %s on %s expired after %s",
409 rsc->id, pcmk__node_name(node),
411 fc_data.failcount = 0;
412 }
413 }
414
415 /* Add the fail count of any launched resources, except that we never want
416 * the fail counts of a bundle container's launched resources to count
417 * towards the container's fail count.
418 *
419 * Most importantly, a Pacemaker Remote connection to a bundle container
420 * is launched by the container, but can reside on a different node than the
421 * container itself. Counting its fail count on its node towards the
422 * container's fail count on that node could lead to attempting to stop the
423 * container on the wrong node.
424 */
426 && (rsc->priv->launched != NULL) && !pcmk__is_bundled(rsc)) {
427
428 g_list_foreach(rsc->priv->launched, update_launched_failcount,
429 &fc_data);
430 if (fc_data.failcount > 0) {
431 pcmk__rsc_info(rsc,
432 "Container %s and the resources within it "
433 "have failed %s time%s on %s",
434 rsc->id, pcmk_readable_score(fc_data.failcount),
435 pcmk__plural_s(fc_data.failcount),
436 pcmk__node_name(node));
437 }
438
439 } else if (fc_data.failcount > 0) {
440 pcmk__rsc_info(rsc, "%s has failed %s time%s on %s",
441 rsc->id, pcmk_readable_score(fc_data.failcount),
442 pcmk__plural_s(fc_data.failcount),
443 pcmk__node_name(node));
444 }
445
446 if (last_failure != NULL) {
447 if ((fc_data.failcount > 0) && (fc_data.last_failure > 0)) {
448 *last_failure = fc_data.last_failure;
449 } else {
450 *last_failure = 0;
451 }
452 }
453 return fc_data.failcount;
454}
455
468 const char *reason, pcmk_scheduler_t *scheduler)
469{
470 char *key = NULL;
471 pcmk_action_t *clear = NULL;
472
473 CRM_CHECK(rsc && node && reason && scheduler, return NULL);
474
476 clear = custom_action(rsc, key, PCMK_ACTION_CLEAR_FAILCOUNT, node, FALSE,
477 scheduler);
479 crm_notice("Clearing failure of %s on %s because %s " QB_XS " %s",
480 rsc->id, pcmk__node_name(node), reason, clear->uuid);
481 return clear;
482}
#define PCMK_ACTION_CLEAR_FAILCOUNT
Definition actions.h:37
char * pcmk__op_key(const char *rsc_id, const char *op_type, guint interval_ms)
Generate an operation key (RESOURCE_ACTION_INTERVAL)
Definition actions.c:225
const char * name
Definition cib.c:26
guint pcmk__timeout_ms2s(guint timeout_ms)
Definition utils.c:429
#define PCMK__LAST_FAILURE_PREFIX
Definition internal.h:300
#define PCMK__FAIL_COUNT_PREFIX
Definition internal.h:299
uint64_t flags
Definition remote.c:3
Utility functions.
#define pcmk_is_set(g, f)
Convenience alias for pcmk_all_flags_set(), to check single flag.
Definition util.h:80
char data[0]
Definition cpg.c:10
A dumping ground.
pcmk_action_t * pe__clear_failcount(pcmk_resource_t *rsc, const pcmk_node_t *node, const char *reason, pcmk_scheduler_t *scheduler)
Schedule a controller operation to clear a fail count.
Definition failcounts.c:467
#define readable_expiration(rsc)
Definition failcounts.c:347
int pe_get_failcount(const pcmk_node_t *node, pcmk_resource_t *rsc, time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
Definition failcounts.c:367
#define XPATH_FMT
@ pcmk__fc_launched
@ pcmk__fc_effective
#define crm_info(fmt, args...)
Definition logging.h:365
#define crm_warn(fmt, args...)
Definition logging.h:360
#define crm_notice(fmt, args...)
Definition logging.h:363
#define CRM_CHECK(expr, failure_action)
Definition logging.h:213
#define pcmk__config_warn(fmt...)
pcmk_scheduler_t * scheduler
#define pcmk__insert_meta(obj, name, value)
#define PCMK_META_INTERVAL
Definition options.h:92
#define PCMK_META_ON_FAIL
Definition options.h:99
#define PCMK_VALUE_TRUE
Definition options.h:219
#define PCMK_VALUE_BLOCK
Definition options.h:136
#define PCMK__META_OP_NO_WAIT
pcmk_action_t * custom_action(pcmk_resource_t *rsc, char *key, const char *task, const pcmk_node_t *on_node, gboolean optional, pcmk_scheduler_t *scheduler)
Create or update an action object.
char * clone_strip(const char *last_rsc_id)
Definition unpack.c:1954
int pe__target_rc_from_xml(const xmlNode *xml_op)
Definition unpack.c:4292
@ pcmk__rsc_unique
const char * pcmk_rc_str(int rc)
Get a user-friendly description of a return code.
Definition results.c:617
@ pcmk_rc_ok
Definition results.h:159
#define pcmk__rsc_info(rsc, fmt, args...)
#define pcmk__rsc_trace(rsc, fmt, args...)
time_t pcmk__scheduler_epoch_time(pcmk_scheduler_t *scheduler)
Definition scheduler.c:300
#define pcmk__rsc_debug(rsc, fmt, args...)
int pcmk_parse_score(const char *score_s, int *score, int default_score)
Parse an integer score from a string.
Definition scores.c:34
const char * pcmk_readable_score(int score)
Return a displayable static string for a score value.
Definition scores.c:102
int pcmk__add_scores(int score1, int score2)
Definition scores.c:159
int pcmk_parse_interval_spec(const char *input, guint *result_ms)
Parse milliseconds from a Pacemaker interval specification.
Definition strings.c:452
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1
#define pcmk__plural_s(i)
int pcmk__scan_ll(const char *text, long long *result, long long default_value)
Definition strings.c:92
@ pcmk__str_casei
pcmk_scheduler_t * scheduler
unsigned long long flags
Definition resources.h:69
pcmk__resource_private_t * priv
Definition resources.h:61
xmlNode * input
Definition scheduler.h:81
pcmk__node_private_t * priv
Definition nodes.h:85
Wrappers for and extensions to libxml2.
const char * crm_element_value(const xmlNode *data, const char *name)
Retrieve the value of an XML attribute.
int crm_element_value_int(const xmlNode *data, const char *name, int *dest)
Retrieve the integer value of an XML attribute.
int crm_element_value_ms(const xmlNode *data, const char *name, guint *dest)
Retrieve the millisecond value of an XML attribute.
#define PCMK_XA_OPERATION
Definition xml_names.h:349
#define PCMK_XA_ID
Definition xml_names.h:301
#define PCMK_XE_PRIMITIVE
Definition xml_names.h:164
#define PCMK_XA_NAME
Definition xml_names.h:330
#define PCMK_XE_OP
Definition xml_names.h:146
#define PCMK__XA_RC_CODE
xmlXPathObject * pcmk__xpath_search(xmlDoc *doc, const char *path)
Definition xpath.c:137
xmlNode * pcmk__xpath_result(xmlXPathObject *xpath_obj, int index)
Definition xpath.c:65