pacemaker  3.0.0-d8340737c4
Scalable High-Availability cluster resource manager
failcounts.c
Go to the documentation of this file.
1 /*
2  * Copyright 2008-2024 the Pacemaker project contributors
3  *
4  * This source code is licensed under the GNU Lesser General Public License
5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
6  */
7 
8 #include <crm_internal.h>
9 
10 #include <sys/types.h>
11 #include <regex.h>
12 #include <glib.h>
13 
14 #include <crm/crm.h>
15 #include <crm/common/xml.h>
16 #include <crm/common/util.h>
17 #include <crm/pengine/internal.h>
18 
19 static gboolean
20 is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
21  const xmlNode *lrm_op_xml)
22 {
23  gboolean matched = FALSE;
24  const char *conf_op_name = NULL;
25  const char *lrm_op_task = NULL;
26  const char *conf_op_interval_spec = NULL;
27  guint conf_op_interval_ms = 0;
28  guint lrm_op_interval_ms = 0;
29  const char *lrm_op_id = NULL;
30  char *last_failure_key = NULL;
31 
32  if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
33  return FALSE;
34  }
35 
36  // Get name and interval from configured op
37  conf_op_name = crm_element_value(conf_op_xml, PCMK_XA_NAME);
38  conf_op_interval_spec = crm_element_value(conf_op_xml, PCMK_META_INTERVAL);
39  pcmk_parse_interval_spec(conf_op_interval_spec, &conf_op_interval_ms);
40 
41  // Get name and interval from op history entry
42  lrm_op_task = crm_element_value(lrm_op_xml, PCMK_XA_OPERATION);
43  crm_element_value_ms(lrm_op_xml, PCMK_META_INTERVAL, &lrm_op_interval_ms);
44 
45  if ((conf_op_interval_ms != lrm_op_interval_ms)
46  || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
47  return FALSE;
48  }
49 
50  lrm_op_id = pcmk__xe_id(lrm_op_xml);
51  last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
52 
53  if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
54  matched = TRUE;
55 
56  } else {
57  char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
58  conf_op_interval_ms);
59 
60  if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
61  int rc = 0;
62  int target_rc = pe__target_rc_from_xml(lrm_op_xml);
63 
64  crm_element_value_int(lrm_op_xml, PCMK__XA_RC_CODE, &rc);
65  if (rc != target_rc) {
66  matched = TRUE;
67  }
68  }
69  free(expected_op_key);
70  }
71 
72  free(last_failure_key);
73  return matched;
74 }
75 
76 static gboolean
77 block_failure(const pcmk_node_t *node, pcmk_resource_t *rsc,
78  const xmlNode *xml_op)
79 {
80  char *xml_name = clone_strip(rsc->id);
81 
82  /* @TODO This xpath search occurs after template expansion, but it is unable
83  * to properly detect on-fail in id-ref, operation meta-attributes, or
84  * op_defaults, or evaluate rules.
85  *
86  * Also, PCMK_META_ON_FAIL defaults to PCMK_VALUE_BLOCK (in
87  * unpack_operation()) for stop actions when stonith is disabled.
88  *
89  * Ideally, we'd unpack the operation before this point, and pass in a
90  * meta-attributes table that takes all that into consideration.
91  */
92  char *xpath = crm_strdup_printf("//" PCMK_XE_PRIMITIVE
93  "[@" PCMK_XA_ID "='%s']"
94  "//" PCMK_XE_OP
96  "='" PCMK_VALUE_BLOCK "']",
97  xml_name);
98 
99  xmlXPathObject *xpathObj = xpath_search(rsc->priv->xml, xpath);
100  gboolean should_block = FALSE;
101 
102  free(xpath);
103 
104  if (xpathObj) {
105  int max = numXpathResults(xpathObj);
106  int lpc = 0;
107 
108  for (lpc = 0; lpc < max; lpc++) {
109  xmlNode *pref = getXpathResult(xpathObj, lpc);
110 
111  if (xml_op) {
112  should_block = is_matched_failure(xml_name, pref, xml_op);
113  if (should_block) {
114  break;
115  }
116 
117  } else {
118  const char *conf_op_name = NULL;
119  const char *conf_op_interval_spec = NULL;
120  guint conf_op_interval_ms = 0;
121  char *lrm_op_xpath = NULL;
122  xmlXPathObject *lrm_op_xpathObj = NULL;
123 
124  // Get name and interval from configured op
125  conf_op_name = crm_element_value(pref, PCMK_XA_NAME);
126  conf_op_interval_spec = crm_element_value(pref,
128  pcmk_parse_interval_spec(conf_op_interval_spec,
129  &conf_op_interval_ms);
130 
131 #define XPATH_FMT "//" PCMK__XE_NODE_STATE "[@" PCMK_XA_UNAME "='%s']" \
132  "//" PCMK__XE_LRM_RESOURCE "[@" PCMK_XA_ID "='%s']" \
133  "/" PCMK__XE_LRM_RSC_OP "[@" PCMK_XA_OPERATION "='%s']" \
134  "[@" PCMK_META_INTERVAL "='%u']"
135 
136  lrm_op_xpath = crm_strdup_printf(XPATH_FMT,
137  node->priv->name, xml_name,
138  conf_op_name,
139  conf_op_interval_ms);
140  lrm_op_xpathObj = xpath_search(rsc->priv->scheduler->input,
141  lrm_op_xpath);
142 
143  free(lrm_op_xpath);
144 
145  if (lrm_op_xpathObj) {
146  int max2 = numXpathResults(lrm_op_xpathObj);
147  int lpc2 = 0;
148 
149  for (lpc2 = 0; lpc2 < max2; lpc2++) {
150  xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
151  lpc2);
152 
153  should_block = is_matched_failure(xml_name, pref,
154  lrm_op_xml);
155  if (should_block) {
156  break;
157  }
158  }
159  }
160  freeXpathObject(lrm_op_xpathObj);
161 
162  if (should_block) {
163  break;
164  }
165  }
166  }
167  }
168 
169  free(xml_name);
170  freeXpathObject(xpathObj);
171 
172  return should_block;
173 }
174 
184 static inline char *
185 rsc_fail_name(const pcmk_resource_t *rsc)
186 {
187  const char *name = pcmk__s(rsc->priv->history_id, rsc->id);
188 
189  return pcmk_is_set(rsc->flags, pcmk__rsc_unique)? strdup(name) : clone_strip(name);
190 }
191 
205 static int
206 generate_fail_regex(const char *prefix, const char *rsc_name, bool is_unique,
207  regex_t *re)
208 {
209  char *pattern = NULL;
210  const char *op_pattern = "#.+_[0-9]+";
211 
212  /* Ignore instance numbers for anything other than globally unique clones.
213  * Anonymous clone fail counts could contain an instance number if the
214  * clone was initially unique, failed, then was converted to anonymous.
215  */
216  const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
217 
218  pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
219  instance_pattern, op_pattern);
220  if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) {
221  free(pattern);
222  return EINVAL;
223  }
224 
225  free(pattern);
226  return pcmk_rc_ok;
227 }
228 
241 static int
242 generate_fail_regexes(const pcmk_resource_t *rsc, regex_t *failcount_re,
243  regex_t *lastfailure_re)
244 {
245  int rc = pcmk_rc_ok;
246  char *rsc_name = rsc_fail_name(rsc);
247 
248  if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name,
250  failcount_re) != pcmk_rc_ok) {
251  rc = EINVAL;
252 
253  } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
255  lastfailure_re) != pcmk_rc_ok) {
256  rc = EINVAL;
257  regfree(failcount_re);
258  }
259 
260  free(rsc_name);
261  return rc;
262 }
263 
264 // Data for fail-count-related iterators
265 struct failcount_data {
266  const pcmk_node_t *node;// Node to check for fail count
267  pcmk_resource_t *rsc; // Resource to check for fail count
268  uint32_t flags; // Fail count flags
269  const xmlNode *xml_op; // History entry for expiration purposes (or NULL)
270  regex_t failcount_re; // Fail count regular expression to match
271  regex_t lastfailure_re; // Last failure regular expression to match
272  int failcount; // Fail count so far
273  time_t last_failure; // Time of most recent failure so far
274 };
275 
284 static void
285 update_failcount_for_attr(gpointer key, gpointer value, gpointer user_data)
286 {
287  struct failcount_data *fc_data = user_data;
288 
289  // If this is a matching fail count attribute, update fail count
290  if (regexec(&(fc_data->failcount_re), (const char *) key, 0, NULL, 0) == 0) {
291  int score = 0;
292  int rc = pcmk_parse_score(value, &score, 0);
293 
294  if (rc != pcmk_rc_ok) {
295  crm_warn("Ignoring %s for %s "
296  "because '%s' is not a valid fail count: %s",
297  (const char *) key, pcmk__node_name(fc_data->node),
298  value, pcmk_rc_str(rc));
299  return;
300  }
301  fc_data->failcount = pcmk__add_scores(fc_data->failcount, score);
302  pcmk__rsc_trace(fc_data->rsc, "Added %s (%s) to %s fail count (now %s)",
303  (const char *) key, (const char *) value,
304  fc_data->rsc->id,
305  pcmk_readable_score(fc_data->failcount));
306  return;
307  }
308 
309  // If this is a matching last failure attribute, update last failure
310  if (regexec(&(fc_data->lastfailure_re), (const char *) key, 0, NULL,
311  0) == 0) {
312  long long last_ll;
313  int rc = pcmk__scan_ll(value, &last_ll, 0LL);
314 
315  if (rc != pcmk_rc_ok) {
316  crm_info("Ignoring invalid value '%s' for %s: %s",
317  (const char *) value, (const char *) key, pcmk_rc_str(rc));
318  return;
319  }
320  fc_data->last_failure = (time_t) QB_MAX(fc_data->last_failure, last_ll);
321  }
322 }
323 
331 static void
332 update_launched_failcount(gpointer data, gpointer user_data)
333 {
334  pcmk_resource_t *launched = data;
335  struct failcount_data *fc_data = user_data;
336  time_t launched_last_failure = 0;
337 
338  fc_data->failcount += pe_get_failcount(fc_data->node, launched,
339  &launched_last_failure,
340  fc_data->flags, fc_data->xml_op);
341  fc_data->last_failure = QB_MAX(fc_data->last_failure, launched_last_failure);
342 }
343 
// Pass a resource's failure expiration (in ms) to pcmk__readable_interval()
#define readable_expiration(rsc) \
    pcmk__readable_interval((rsc)->priv->failure_expiration_ms)
346 
363 int
365  time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
366 {
367  struct failcount_data fc_data = {
368  .node = node,
369  .rsc = rsc,
370  .flags = flags,
371  .xml_op = xml_op,
372  .failcount = 0,
373  .last_failure = (time_t) 0,
374  };
375 
376  // Calculate resource failcount as sum of all matching operation failcounts
377  CRM_CHECK(generate_fail_regexes(rsc, &fc_data.failcount_re,
378  &fc_data.lastfailure_re) == pcmk_rc_ok,
379  return 0);
380  g_hash_table_foreach(node->priv->attrs, update_failcount_for_attr,
381  &fc_data);
382  regfree(&(fc_data.failcount_re));
383  regfree(&(fc_data.lastfailure_re));
384 
385  // If failure blocks the resource, disregard any failure timeout
386  if ((fc_data.failcount > 0) && (rsc->priv->failure_expiration_ms > 0)
387  && block_failure(node, rsc, xml_op)) {
388 
389  pcmk__config_warn("Ignoring failure timeout (%s) for %s "
390  "because it conflicts with "
392  readable_expiration(rsc), rsc->id);
393  rsc->priv->failure_expiration_ms = 0;
394  }
395 
396  // If all failures have expired, ignore fail count
397  if (pcmk_is_set(flags, pcmk__fc_effective) && (fc_data.failcount > 0)
398  && (fc_data.last_failure > 0)
399  && (rsc->priv->failure_expiration_ms > 0)) {
400 
401  time_t now = get_effective_time(rsc->priv->scheduler);
402  const guint expiration = pcmk__timeout_ms2s(rsc->priv->failure_expiration_ms);
403 
404  if (now > (fc_data.last_failure + expiration)) {
405  pcmk__rsc_debug(rsc, "Failcount for %s on %s expired after %s",
406  rsc->id, pcmk__node_name(node),
407  readable_expiration(rsc));
408  fc_data.failcount = 0;
409  }
410  }
411 
412  /* Add the fail count of any launched resources, except that we never want
413  * the fail counts of a bundle container's launched resources to count
414  * towards the container's fail count.
415  *
416  * Most importantly, a Pacemaker Remote connection to a bundle container
417  * is launched by the container, but can reside on a different node than the
418  * container itself. Counting its fail count on its node towards the
419  * container's fail count on that node could lead to attempting to stop the
420  * container on the wrong node.
421  */
423  && (rsc->priv->launched != NULL) && !pcmk__is_bundled(rsc)) {
424 
425  g_list_foreach(rsc->priv->launched, update_launched_failcount,
426  &fc_data);
427  if (fc_data.failcount > 0) {
428  pcmk__rsc_info(rsc,
429  "Container %s and the resources within it "
430  "have failed %s time%s on %s",
431  rsc->id, pcmk_readable_score(fc_data.failcount),
432  pcmk__plural_s(fc_data.failcount),
433  pcmk__node_name(node));
434  }
435 
436  } else if (fc_data.failcount > 0) {
437  pcmk__rsc_info(rsc, "%s has failed %s time%s on %s",
438  rsc->id, pcmk_readable_score(fc_data.failcount),
439  pcmk__plural_s(fc_data.failcount),
440  pcmk__node_name(node));
441  }
442 
443  if (last_failure != NULL) {
444  if ((fc_data.failcount > 0) && (fc_data.last_failure > 0)) {
445  *last_failure = fc_data.last_failure;
446  } else {
447  *last_failure = 0;
448  }
449  }
450  return fc_data.failcount;
451 }
452 
465  const char *reason, pcmk_scheduler_t *scheduler)
466 {
467  char *key = NULL;
468  pcmk_action_t *clear = NULL;
469 
470  CRM_CHECK(rsc && node && reason && scheduler, return NULL);
471 
473  clear = custom_action(rsc, key, PCMK_ACTION_CLEAR_FAILCOUNT, node, FALSE,
474  scheduler);
476  crm_notice("Clearing failure of %s on %s because %s " QB_XS " %s",
477  rsc->id, pcmk__node_name(node), reason, clear->uuid);
478  return clear;
479 }
#define CRM_CHECK(expr, failure_action)
Definition: logging.h:213
A dumping ground.
#define crm_notice(fmt, args...)
Definition: logging.h:365
#define PCMK_XA_NAME
Definition: xml_names.h:330
char data[0]
Definition: cpg.c:58
#define PCMK__XA_RC_CODE
const char * name
Definition: cib.c:26
const char * pcmk_readable_score(int score)
Return a displayable static string for a score value.
Definition: scores.c:102
#define PCMK_XE_PRIMITIVE
Definition: xml_names.h:164
#define pcmk__config_warn(fmt...)
#define pcmk__rsc_trace(rsc, fmt, args...)
#define XPATH_FMT
int pe__target_rc_from_xml(const xmlNode *xml_op)
Definition: unpack.c:4277
#define pcmk__rsc_info(rsc, fmt, args...)
#define pcmk__insert_meta(obj, name, value)
#define PCMK_VALUE_BLOCK
Definition: options.h:135
#define PCMK_ACTION_CLEAR_FAILCOUNT
Definition: actions.h:37
const char * pcmk_rc_str(int rc)
Get a user-friendly description of a return code.
Definition: results.c:609
#define PCMK_XA_OPERATION
Definition: xml_names.h:349
#define PCMK__LAST_FAILURE_PREFIX
Definition: internal.h:311
#define pcmk__rsc_debug(rsc, fmt, args...)
int pe_get_failcount(const pcmk_node_t *node, pcmk_resource_t *rsc, time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
Definition: failcounts.c:364
pcmk__node_private_t * priv
Definition: nodes.h:85
int pcmk__scan_ll(const char *text, long long *result, long long default_value)
Definition: strings.c:92
#define crm_warn(fmt, args...)
Definition: logging.h:362
int crm_element_value_ms(const xmlNode *data, const char *name, guint *dest)
Retrieve the millisecond value of an XML attribute.
Definition: xml_element.c:1322
pcmk_scheduler_t * scheduler
Utility functions.
char * clone_strip(const char *last_rsc_id)
Definition: unpack.c:1945
int pcmk_parse_interval_spec(const char *input, guint *result_ms)
Parse milliseconds from a Pacemaker interval specification.
Definition: strings.c:452
const char * crm_element_value(const xmlNode *data, const char *name)
Retrieve the value of an XML attribute.
Definition: xml_element.c:1168
#define pcmk_is_set(g, f)
Convenience alias for pcmk_all_flags_set(), to check single flag.
Definition: util.h:80
pcmk__resource_private_t * priv
Definition: resources.h:61
Wrappers for and extensions to libxml2.
#define PCMK__FAIL_COUNT_PREFIX
Definition: internal.h:310
#define PCMK_VALUE_TRUE
Definition: options.h:218
#define PCMK_XA_ID
Definition: xml_names.h:301
char * pcmk__op_key(const char *rsc_id, const char *op_type, guint interval_ms)
Generate an operation key (RESOURCE_ACTION_INTERVAL)
Definition: actions.c:195
int pcmk__add_scores(int score1, int score2)
Definition: scores.c:159
pcmk_action_t * custom_action(pcmk_resource_t *rsc, char *key, const char *task, const pcmk_node_t *on_node, gboolean optional, pcmk_scheduler_t *scheduler)
Create or update an action object.
Definition: pe_actions.c:1093
xmlXPathObjectPtr xpath_search(const xmlNode *xml_top, const char *path)
Definition: xpath.c:139
#define PCMK_XE_OP
Definition: xml_names.h:146
#define PCMK_META_INTERVAL
Definition: options.h:91
pcmk_scheduler_t * scheduler
#define PCMK__META_OP_NO_WAIT
xmlNode * input
Definition: scheduler.h:81
xmlNode * getXpathResult(xmlXPathObjectPtr xpathObj, int index)
Definition: xpath.c:58
#define PCMK_META_ON_FAIL
Definition: options.h:98
int crm_element_value_int(const xmlNode *data, const char *name, int *dest)
Retrieve the integer value of an XML attribute.
Definition: xml_element.c:1201
guint pcmk__timeout_ms2s(guint timeout_ms)
Definition: utils.c:425
GHashTable * attrs
int pcmk_parse_score(const char *score_s, int *score, int default_score)
Parse an integer score from a string.
Definition: scores.c:34
#define pcmk__plural_s(i)
unsigned long long flags
Definition: resources.h:69
pcmk_action_t * pe__clear_failcount(pcmk_resource_t *rsc, const pcmk_node_t *node, const char *reason, pcmk_scheduler_t *scheduler)
Schedule a controller operation to clear a fail count.
Definition: failcounts.c:464
#define readable_expiration(rsc)
Definition: failcounts.c:344
time_t get_effective_time(pcmk_scheduler_t *scheduler)
Definition: utils.c:402
void freeXpathObject(xmlXPathObjectPtr xpathObj)
Definition: xpath.c:39
#define crm_info(fmt, args...)
Definition: logging.h:367
uint64_t flags
Definition: remote.c:211
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1