pacemaker  2.1.7-0f7f88312f
Scalable High-Availability cluster resource manager
failcounts.c
Go to the documentation of this file.
1 /*
2  * Copyright 2008-2023 the Pacemaker project contributors
3  *
4  * This source code is licensed under the GNU Lesser General Public License
5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
6  */
7 
8 #include <crm_internal.h>
9 
10 #include <sys/types.h>
11 #include <regex.h>
12 #include <glib.h>
13 
14 #include <crm/crm.h>
15 #include <crm/msg_xml.h>
16 #include <crm/common/xml.h>
17 #include <crm/common/util.h>
18 #include <crm/pengine/internal.h>
19 
20 static gboolean
21 is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
22  const xmlNode *lrm_op_xml)
23 {
24  gboolean matched = FALSE;
25  const char *conf_op_name = NULL;
26  const char *lrm_op_task = NULL;
27  const char *conf_op_interval_spec = NULL;
28  guint conf_op_interval_ms = 0;
29  guint lrm_op_interval_ms = 0;
30  const char *lrm_op_id = NULL;
31  char *last_failure_key = NULL;
32 
33  if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
34  return FALSE;
35  }
36 
37  // Get name and interval from configured op
38  conf_op_name = crm_element_value(conf_op_xml, "name");
39  conf_op_interval_spec = crm_element_value(conf_op_xml,
41  conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
42 
43  // Get name and interval from op history entry
44  lrm_op_task = crm_element_value(lrm_op_xml, XML_LRM_ATTR_TASK);
46  &lrm_op_interval_ms);
47 
48  if ((conf_op_interval_ms != lrm_op_interval_ms)
49  || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
50  return FALSE;
51  }
52 
53  lrm_op_id = ID(lrm_op_xml);
54  last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
55 
56  if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
57  matched = TRUE;
58 
59  } else {
60  char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
61  conf_op_interval_ms);
62 
63  if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
64  int rc = 0;
65  int target_rc = pe__target_rc_from_xml(lrm_op_xml);
66 
67  crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc);
68  if (rc != target_rc) {
69  matched = TRUE;
70  }
71  }
72  free(expected_op_key);
73  }
74 
75  free(last_failure_key);
76  return matched;
77 }
78 
79 static gboolean
80 block_failure(const pcmk_node_t *node, pcmk_resource_t *rsc,
81  const xmlNode *xml_op)
82 {
83  char *xml_name = clone_strip(rsc->id);
84 
85  /* @TODO This xpath search occurs after template expansion, but it is unable
86  * to properly detect on-fail in id-ref, operation meta-attributes, or
87  * op_defaults, or evaluate rules.
88  *
89  * Also, on-fail defaults to block (in unpack_operation()) for stop actions
90  * when stonith is disabled.
91  *
92  * Ideally, we'd unpack the operation before this point, and pass in a
93  * meta-attributes table that takes all that into consideration.
94  */
95  char *xpath = crm_strdup_printf("//" XML_CIB_TAG_RESOURCE
96  "[@" XML_ATTR_ID "='%s']"
97  "//" XML_ATTR_OP
98  "[@" XML_OP_ATTR_ON_FAIL "='block']",
99  xml_name);
100 
101  xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
102  gboolean should_block = FALSE;
103 
104  free(xpath);
105 
106  if (xpathObj) {
107  int max = numXpathResults(xpathObj);
108  int lpc = 0;
109 
110  for (lpc = 0; lpc < max; lpc++) {
111  xmlNode *pref = getXpathResult(xpathObj, lpc);
112 
113  if (xml_op) {
114  should_block = is_matched_failure(xml_name, pref, xml_op);
115  if (should_block) {
116  break;
117  }
118 
119  } else {
120  const char *conf_op_name = NULL;
121  const char *conf_op_interval_spec = NULL;
122  guint conf_op_interval_ms = 0;
123  char *lrm_op_xpath = NULL;
124  xmlXPathObject *lrm_op_xpathObj = NULL;
125 
126  // Get name and interval from configured op
127  conf_op_name = crm_element_value(pref, "name");
128  conf_op_interval_spec = crm_element_value(pref, XML_LRM_ATTR_INTERVAL);
129  conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
130 
131 #define XPATH_FMT "//" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']" \
132  "//" XML_LRM_TAG_RESOURCE "[@" XML_ATTR_ID "='%s']" \
133  "/" XML_LRM_TAG_RSC_OP "[@" XML_LRM_ATTR_TASK "='%s']" \
134  "[@" XML_LRM_ATTR_INTERVAL "='%u']"
135 
136  lrm_op_xpath = crm_strdup_printf(XPATH_FMT,
137  node->details->uname, xml_name,
138  conf_op_name,
139  conf_op_interval_ms);
140  lrm_op_xpathObj = xpath_search(rsc->cluster->input, lrm_op_xpath);
141 
142  free(lrm_op_xpath);
143 
144  if (lrm_op_xpathObj) {
145  int max2 = numXpathResults(lrm_op_xpathObj);
146  int lpc2 = 0;
147 
148  for (lpc2 = 0; lpc2 < max2; lpc2++) {
149  xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
150  lpc2);
151 
152  should_block = is_matched_failure(xml_name, pref,
153  lrm_op_xml);
154  if (should_block) {
155  break;
156  }
157  }
158  }
159  freeXpathObject(lrm_op_xpathObj);
160 
161  if (should_block) {
162  break;
163  }
164  }
165  }
166  }
167 
168  free(xml_name);
169  freeXpathObject(xpathObj);
170 
171  return should_block;
172 }
173 
183 static inline char *
184 rsc_fail_name(const pcmk_resource_t *rsc)
185 {
186  const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
187 
188  return pcmk_is_set(rsc->flags, pcmk_rsc_unique)? strdup(name) : clone_strip(name);
189 }
190 
205 static int
206 generate_fail_regex(const char *prefix, const char *rsc_name,
207  gboolean is_legacy, gboolean is_unique, regex_t *re)
208 {
209  char *pattern;
210 
211  /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than
212  * per-operation.
213  */
214  const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
215 
216  /* Ignore instance numbers for anything other than globally unique clones.
217  * Anonymous clone fail counts could contain an instance number if the
218  * clone was initially unique, failed, then was converted to anonymous.
219  * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained
220  * clone instance numbers.
221  */
222  const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
223 
224  pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
225  instance_pattern, op_pattern);
226  if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) {
227  free(pattern);
228  return EINVAL;
229  }
230 
231  free(pattern);
232  return pcmk_rc_ok;
233 }
234 
247 static int
248 generate_fail_regexes(const pcmk_resource_t *rsc,
249  regex_t *failcount_re, regex_t *lastfailure_re)
250 {
251  int rc = pcmk_rc_ok;
252  char *rsc_name = rsc_fail_name(rsc);
253  const char *version = crm_element_value(rsc->cluster->input,
255 
256  // @COMPAT Pacemaker <= 1.1.16 used a single fail count per resource
257  gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
258 
259  if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy,
261  failcount_re) != pcmk_rc_ok) {
262  rc = EINVAL;
263 
264  } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
265  is_legacy,
267  lastfailure_re) != pcmk_rc_ok) {
268  rc = EINVAL;
269  regfree(failcount_re);
270  }
271 
272  free(rsc_name);
273  return rc;
274 }
275 
276 // Data for fail-count-related iterators
277 struct failcount_data {
278  const pcmk_node_t *node;// Node to check for fail count
279  pcmk_resource_t *rsc; // Resource to check for fail count
280  uint32_t flags; // Fail count flags
281  const xmlNode *xml_op; // History entry for expiration purposes (or NULL)
282  regex_t failcount_re; // Fail count regular expression to match
283  regex_t lastfailure_re; // Last failure regular expression to match
284  int failcount; // Fail count so far
285  time_t last_failure; // Time of most recent failure so far
286 };
287 
296 static void
297 update_failcount_for_attr(gpointer key, gpointer value, gpointer user_data)
298 {
299  struct failcount_data *fc_data = user_data;
300 
301  // If this is a matching fail count attribute, update fail count
302  if (regexec(&(fc_data->failcount_re), (const char *) key, 0, NULL, 0) == 0) {
303  fc_data->failcount = pcmk__add_scores(fc_data->failcount,
304  char2score(value));
305  pe_rsc_trace(fc_data->rsc, "Added %s (%s) to %s fail count (now %s)",
306  (const char *) key, (const char *) value, fc_data->rsc->id,
307  pcmk_readable_score(fc_data->failcount));
308  return;
309  }
310 
311  // If this is a matching last failure attribute, update last failure
312  if (regexec(&(fc_data->lastfailure_re), (const char *) key, 0, NULL,
313  0) == 0) {
314  long long last_ll;
315 
316  if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) {
317  fc_data->last_failure = (time_t) QB_MAX(fc_data->last_failure,
318  last_ll);
319  }
320  }
321 }
322 
330 static void
331 update_failcount_for_filler(gpointer data, gpointer user_data)
332 {
333  pcmk_resource_t *filler = data;
334  struct failcount_data *fc_data = user_data;
335  time_t filler_last_failure = 0;
336 
337  fc_data->failcount += pe_get_failcount(fc_data->node, filler,
338  &filler_last_failure, fc_data->flags,
339  fc_data->xml_op);
340  fc_data->last_failure = QB_MAX(fc_data->last_failure, filler_last_failure);
341 }
342 
359 int
361  time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
362 {
363  struct failcount_data fc_data = {
364  .node = node,
365  .rsc = rsc,
366  .flags = flags,
367  .xml_op = xml_op,
368  .failcount = 0,
369  .last_failure = (time_t) 0,
370  };
371 
372  // Calculate resource failcount as sum of all matching operation failcounts
373  CRM_CHECK(generate_fail_regexes(rsc, &fc_data.failcount_re,
374  &fc_data.lastfailure_re) == pcmk_rc_ok,
375  return 0);
376  g_hash_table_foreach(node->details->attrs, update_failcount_for_attr,
377  &fc_data);
378  regfree(&(fc_data.failcount_re));
379  regfree(&(fc_data.lastfailure_re));
380 
381  // If failure blocks the resource, disregard any failure timeout
382  if ((fc_data.failcount > 0) && (rsc->failure_timeout > 0)
383  && block_failure(node, rsc, xml_op)) {
384 
385  pe_warn("Ignoring failure timeout %d for %s "
386  "because it conflicts with on-fail=block",
387  rsc->failure_timeout, rsc->id);
388  rsc->failure_timeout = 0;
389  }
390 
391  // If all failures have expired, ignore fail count
392  if (pcmk_is_set(flags, pcmk__fc_effective) && (fc_data.failcount > 0)
393  && (fc_data.last_failure > 0) && (rsc->failure_timeout != 0)) {
394 
395  time_t now = get_effective_time(rsc->cluster);
396 
397  if (now > (fc_data.last_failure + rsc->failure_timeout)) {
398  pe_rsc_debug(rsc, "Failcount for %s on %s expired after %ds",
399  rsc->id, pe__node_name(node), rsc->failure_timeout);
400  fc_data.failcount = 0;
401  }
402  }
403 
404  /* Add the fail count of any filler resources, except that we never want the
405  * fail counts of a bundle container's fillers to count towards the
406  * container's fail count.
407  *
408  * Most importantly, a Pacemaker Remote connection to a bundle container
409  * is a filler of the container, but can reside on a different node than the
410  * container itself. Counting its fail count on its node towards the
411  * container's fail count on that node could lead to attempting to stop the
412  * container on the wrong node.
413  */
414  if (pcmk_is_set(flags, pcmk__fc_fillers) && (rsc->fillers != NULL)
415  && !pe_rsc_is_bundled(rsc)) {
416 
417  g_list_foreach(rsc->fillers, update_failcount_for_filler, &fc_data);
418  if (fc_data.failcount > 0) {
419  pe_rsc_info(rsc,
420  "Container %s and the resources within it "
421  "have failed %s time%s on %s",
422  rsc->id, pcmk_readable_score(fc_data.failcount),
423  pcmk__plural_s(fc_data.failcount), pe__node_name(node));
424  }
425 
426  } else if (fc_data.failcount > 0) {
427  pe_rsc_info(rsc, "%s has failed %s time%s on %s",
428  rsc->id, pcmk_readable_score(fc_data.failcount),
429  pcmk__plural_s(fc_data.failcount), pe__node_name(node));
430  }
431 
432  if (last_failure != NULL) {
433  if ((fc_data.failcount > 0) && (fc_data.last_failure > 0)) {
434  *last_failure = fc_data.last_failure;
435  } else {
436  *last_failure = 0;
437  }
438  }
439  return fc_data.failcount;
440 }
441 
454  const char *reason, pcmk_scheduler_t *scheduler)
455 {
456  char *key = NULL;
457  pcmk_action_t *clear = NULL;
458 
459  CRM_CHECK(rsc && node && reason && scheduler, return NULL);
460 
462  clear = custom_action(rsc, key, PCMK_ACTION_CLEAR_FAILCOUNT, node, FALSE,
463  scheduler);
465  crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
466  rsc->id, pe__node_name(node), reason, clear->uuid);
467  return clear;
468 }
#define CRM_CHECK(expr, failure_action)
Definition: logging.h:238
A dumping ground.
#define crm_notice(fmt, args...)
Definition: logging.h:383
pcmk_scheduler_t * cluster
Cluster that resource is part of.
Definition: resources.h:412
GHashTable * attrs
Node attributes.
Definition: nodes.h:115
#define pe_rsc_debug(rsc, fmt, args...)
Definition: internal.h:36
char data[0]
Definition: cpg.c:55
const char * pcmk_readable_score(int score)
Return a displayable static string for a score value.
Definition: scores.c:86
const char * name
Definition: cib.c:26
#define XPATH_FMT
int pe__target_rc_from_xml(const xmlNode *xml_op)
Definition: unpack.c:4275
xmlNode * xml
Resource configuration (possibly expanded from template)
Definition: resources.h:404
Implementation of pcmk_action_t.
Definition: actions.h:390
int char2score(const char *score)
Get the integer value of a score string.
Definition: scores.c:36
#define XML_LRM_ATTR_INTERVAL
Definition: msg_xml.h:300
#define PCMK_ACTION_CLEAR_FAILCOUNT
Definition: actions.h:46
#define XML_OP_ATTR_ON_FAIL
Definition: msg_xml.h:268
int crm_element_value_int(const xmlNode *data, const char *name, int *dest)
Retrieve the integer value of an XML attribute.
Definition: nvpair.c:483
Implementation of pcmk_scheduler_t.
Definition: scheduler.h:172
#define PCMK__LAST_FAILURE_PREFIX
Definition: internal.h:297
int pe_get_failcount(const pcmk_node_t *node, pcmk_resource_t *rsc, time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
Definition: failcounts.c:360
#define XML_LRM_ATTR_TASK
Definition: msg_xml.h:306
#define pe_warn(fmt...)
Definition: internal.h:44
int pcmk__scan_ll(const char *text, long long *result, long long default_value)
Definition: strings.c:97
#define XML_ATTR_OP
Definition: msg_xml.h:161
int crm_element_value_ms(const xmlNode *data, const char *name, guint *dest)
Retrieve the millisecond value of an XML attribute.
Definition: nvpair.c:540
Implementation of pcmk_resource_t.
Definition: resources.h:399
Utility functions.
#define XML_ATTR_ID
Definition: msg_xml.h:156
const char * crm_element_value(const xmlNode *data, const char *name)
Retrieve the value of an XML attribute.
Definition: nvpair.c:447
#define XML_CIB_TAG_RESOURCE
Definition: msg_xml.h:235
#define XML_BOOLEAN_TRUE
Definition: msg_xml.h:167
int failure_timeout
Failure timeout.
Definition: resources.h:425
char * clone_strip(const char *last_rsc_id)
Definition: unpack.c:1865
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1, 2)
GHashTable * meta
Meta-attributes relevant to action.
Definition: actions.h:414
#define pcmk_is_set(g, f)
Convenience alias for pcmk_all_flags_set(), to check single flag.
Definition: util.h:99
struct pe_node_shared_s * details
Basic node information.
Definition: nodes.h:134
unsigned long long flags
Group of enum pcmk_rsc_flags.
Definition: resources.h:429
const char * uname
Node name in cluster.
Definition: nodes.h:68
Wrappers for and extensions to libxml2.
#define XML_ATTR_TE_NOWAIT
Definition: msg_xml.h:418
char * clone_name
Resource instance ID in history.
Definition: resources.h:401
#define PCMK__FAIL_COUNT_PREFIX
Definition: internal.h:296
char * uuid
Action key.
Definition: actions.h:404
char * pcmk__op_key(const char *rsc_id, const char *op_type, guint interval_ms)
Generate an operation key (RESOURCE_ACTION_INTERVAL)
Definition: actions.c:42
Implementation of pcmk_node_t.
Definition: nodes.h:130
xmlNode * input
CIB XML.
Definition: scheduler.h:175
pcmk_action_t * custom_action(pcmk_resource_t *rsc, char *key, const char *task, const pcmk_node_t *on_node, gboolean optional, pcmk_scheduler_t *scheduler)
Create or update an action object.
Definition: pe_actions.c:1117
GList * fillers
Resources contained by this one, if any.
Definition: resources.h:481
#define CRM_XS
Definition: logging.h:56
xmlXPathObjectPtr xpath_search(const xmlNode *xml_top, const char *path)
Definition: xpath.c:139
void add_hash_param(GHashTable *hash, const char *name, const char *value)
Definition: common.c:508
pcmk_scheduler_t * scheduler
guint crm_parse_interval_spec(const char *input)
Parse milliseconds from a Pacemaker interval specification.
Definition: utils.c:271
xmlNode * getXpathResult(xmlXPathObjectPtr xpathObj, int index)
Definition: xpath.c:58
int compare_version(const char *version1, const char *version2)
Definition: utils.c:189
#define XML_LRM_ATTR_INTERVAL_MS
Definition: msg_xml.h:304
#define XML_ATTR_CRM_VERSION
Definition: msg_xml.h:140
#define pcmk__plural_s(i)
#define XML_LRM_ATTR_RC
Definition: msg_xml.h:317
int pcmk__add_scores(int score1, int score2)
Definition: scores.c:116
#define pe_rsc_trace(rsc, fmt, args...)
Definition: internal.h:37
#define ID(x)
Definition: msg_xml.h:474
pcmk_action_t * pe__clear_failcount(pcmk_resource_t *rsc, const pcmk_node_t *node, const char *reason, pcmk_scheduler_t *scheduler)
Schedule a controller operation to clear a fail count.
Definition: failcounts.c:453
time_t get_effective_time(pcmk_scheduler_t *scheduler)
Definition: utils.c:396
void freeXpathObject(xmlXPathObjectPtr xpathObj)
Definition: xpath.c:39
uint32_t version
Definition: remote.c:213
uint64_t flags
Definition: remote.c:215
Whether resource is not an anonymous clone instance.
Definition: resources.h:118
#define pe_rsc_info(rsc, fmt, args...)
Definition: internal.h:35
char * id
Resource ID in configuration.
Definition: resources.h:400