pacemaker  2.1.6-802a72226b
Scalable High-Availability cluster resource manager
failcounts.c
Go to the documentation of this file.
1 /*
2  * Copyright 2008-2023 the Pacemaker project contributors
3  *
4  * This source code is licensed under the GNU Lesser General Public License
5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
6  */
7 
8 #include <crm_internal.h>
9 
10 #include <sys/types.h>
11 #include <regex.h>
12 #include <glib.h>
13 
14 #include <crm/crm.h>
15 #include <crm/msg_xml.h>
16 #include <crm/common/xml.h>
17 #include <crm/common/util.h>
18 #include <crm/pengine/internal.h>
19 
20 static gboolean
21 is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
22  const xmlNode *lrm_op_xml)
23 {
24  gboolean matched = FALSE;
25  const char *conf_op_name = NULL;
26  const char *lrm_op_task = NULL;
27  const char *conf_op_interval_spec = NULL;
28  guint conf_op_interval_ms = 0;
29  guint lrm_op_interval_ms = 0;
30  const char *lrm_op_id = NULL;
31  char *last_failure_key = NULL;
32 
33  if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
34  return FALSE;
35  }
36 
37  // Get name and interval from configured op
38  conf_op_name = crm_element_value(conf_op_xml, "name");
39  conf_op_interval_spec = crm_element_value(conf_op_xml,
41  conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
42 
43  // Get name and interval from op history entry
44  lrm_op_task = crm_element_value(lrm_op_xml, XML_LRM_ATTR_TASK);
46  &lrm_op_interval_ms);
47 
48  if ((conf_op_interval_ms != lrm_op_interval_ms)
49  || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
50  return FALSE;
51  }
52 
53  lrm_op_id = ID(lrm_op_xml);
54  last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
55 
56  if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
57  matched = TRUE;
58 
59  } else {
60  char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
61  conf_op_interval_ms);
62 
63  if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
64  int rc = 0;
65  int target_rc = pe__target_rc_from_xml(lrm_op_xml);
66 
67  crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc);
68  if (rc != target_rc) {
69  matched = TRUE;
70  }
71  }
72  free(expected_op_key);
73  }
74 
75  free(last_failure_key);
76  return matched;
77 }
78 
79 static gboolean
80 block_failure(const pe_node_t *node, pe_resource_t *rsc, const xmlNode *xml_op)
81 {
82  char *xml_name = clone_strip(rsc->id);
83 
84  /* @TODO This xpath search occurs after template expansion, but it is unable
85  * to properly detect on-fail in id-ref, operation meta-attributes, or
86  * op_defaults, or evaluate rules.
87  *
88  * Also, on-fail defaults to block (in unpack_operation()) for stop actions
89  * when stonith is disabled.
90  *
91  * Ideally, we'd unpack the operation before this point, and pass in a
92  * meta-attributes table that takes all that into consideration.
93  */
94  char *xpath = crm_strdup_printf("//" XML_CIB_TAG_RESOURCE
95  "[@" XML_ATTR_ID "='%s']"
96  "//" XML_ATTR_OP
97  "[@" XML_OP_ATTR_ON_FAIL "='block']",
98  xml_name);
99 
100  xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
101  gboolean should_block = FALSE;
102 
103  free(xpath);
104 
105  if (xpathObj) {
106  int max = numXpathResults(xpathObj);
107  int lpc = 0;
108 
109  for (lpc = 0; lpc < max; lpc++) {
110  xmlNode *pref = getXpathResult(xpathObj, lpc);
111 
112  if (xml_op) {
113  should_block = is_matched_failure(xml_name, pref, xml_op);
114  if (should_block) {
115  break;
116  }
117 
118  } else {
119  const char *conf_op_name = NULL;
120  const char *conf_op_interval_spec = NULL;
121  guint conf_op_interval_ms = 0;
122  char *lrm_op_xpath = NULL;
123  xmlXPathObject *lrm_op_xpathObj = NULL;
124 
125  // Get name and interval from configured op
126  conf_op_name = crm_element_value(pref, "name");
127  conf_op_interval_spec = crm_element_value(pref, XML_LRM_ATTR_INTERVAL);
128  conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
129 
130 #define XPATH_FMT "//" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']" \
131  "//" XML_LRM_TAG_RESOURCE "[@" XML_ATTR_ID "='%s']" \
132  "/" XML_LRM_TAG_RSC_OP "[@" XML_LRM_ATTR_TASK "='%s']" \
133  "[@" XML_LRM_ATTR_INTERVAL "='%u']"
134 
135  lrm_op_xpath = crm_strdup_printf(XPATH_FMT,
136  node->details->uname, xml_name,
137  conf_op_name,
138  conf_op_interval_ms);
139  lrm_op_xpathObj = xpath_search(rsc->cluster->input, lrm_op_xpath);
140 
141  free(lrm_op_xpath);
142 
143  if (lrm_op_xpathObj) {
144  int max2 = numXpathResults(lrm_op_xpathObj);
145  int lpc2 = 0;
146 
147  for (lpc2 = 0; lpc2 < max2; lpc2++) {
148  xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
149  lpc2);
150 
151  should_block = is_matched_failure(xml_name, pref,
152  lrm_op_xml);
153  if (should_block) {
154  break;
155  }
156  }
157  }
158  freeXpathObject(lrm_op_xpathObj);
159 
160  if (should_block) {
161  break;
162  }
163  }
164  }
165  }
166 
167  free(xml_name);
168  freeXpathObject(xpathObj);
169 
170  return should_block;
171 }
172 
182 static inline char *
183 rsc_fail_name(const pe_resource_t *rsc)
184 {
185  const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
186 
187  return pcmk_is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name);
188 }
189 
204 static int
205 generate_fail_regex(const char *prefix, const char *rsc_name,
206  gboolean is_legacy, gboolean is_unique, regex_t *re)
207 {
208  char *pattern;
209 
210  /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than
211  * per-operation.
212  */
213  const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
214 
215  /* Ignore instance numbers for anything other than globally unique clones.
216  * Anonymous clone fail counts could contain an instance number if the
217  * clone was initially unique, failed, then was converted to anonymous.
218  * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained
219  * clone instance numbers.
220  */
221  const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
222 
223  pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
224  instance_pattern, op_pattern);
225  if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) {
226  free(pattern);
227  return EINVAL;
228  }
229 
230  free(pattern);
231  return pcmk_rc_ok;
232 }
233 
247 static int
248 generate_fail_regexes(const pe_resource_t *rsc,
249  const pe_working_set_t *data_set,
250  regex_t *failcount_re, regex_t *lastfailure_re)
251 {
252  char *rsc_name = rsc_fail_name(rsc);
254  gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
255  int rc = pcmk_rc_ok;
256 
257  if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy,
259  failcount_re) != pcmk_rc_ok) {
260  rc = EINVAL;
261 
262  } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
263  is_legacy,
265  lastfailure_re) != pcmk_rc_ok) {
266  rc = EINVAL;
267  regfree(failcount_re);
268  }
269 
270  free(rsc_name);
271  return rc;
272 }
273 
274 int
276  time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
277 {
278  char *key = NULL;
279  const char *value = NULL;
280  regex_t failcount_re, lastfailure_re;
281  int failcount = 0;
282  time_t last = 0;
283  GHashTableIter iter;
284 
285  CRM_CHECK(generate_fail_regexes(rsc, rsc->cluster, &failcount_re,
286  &lastfailure_re) == pcmk_rc_ok,
287  return 0);
288 
289  /* Resource fail count is sum of all matching operation fail counts */
290  g_hash_table_iter_init(&iter, node->details->attrs);
291  while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
292  if (regexec(&failcount_re, key, 0, NULL, 0) == 0) {
293  failcount = pcmk__add_scores(failcount, char2score(value));
294  crm_trace("Added %s (%s) to %s fail count (now %s)",
295  key, value, rsc->id, pcmk_readable_score(failcount));
296  } else if (regexec(&lastfailure_re, key, 0, NULL, 0) == 0) {
297  long long last_ll;
298 
299  if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) {
300  last = (time_t) QB_MAX(last, last_ll);
301  }
302  }
303  }
304 
305  regfree(&failcount_re);
306  regfree(&lastfailure_re);
307 
308  if ((failcount > 0) && (last > 0) && (last_failure != NULL)) {
309  *last_failure = last;
310  }
311 
312  /* If failure blocks the resource, disregard any failure timeout */
313  if ((failcount > 0) && rsc->failure_timeout
314  && block_failure(node, rsc, xml_op)) {
315 
316  pe_warn("Ignoring failure timeout %d for %s because it conflicts with on-fail=block",
317  rsc->failure_timeout, rsc->id);
318  rsc->failure_timeout = 0;
319  }
320 
321  /* If all failures have expired, ignore fail count */
322  if (pcmk_is_set(flags, pe_fc_effective) && (failcount > 0) && (last > 0)
323  && rsc->failure_timeout) {
324 
325  time_t now = get_effective_time(rsc->cluster);
326 
327  if (now > (last + rsc->failure_timeout)) {
328  crm_debug("Failcount for %s on %s expired after %ds",
329  rsc->id, pe__node_name(node), rsc->failure_timeout);
330  failcount = 0;
331  }
332  }
333 
334  /* We never want the fail counts of a bundle container's fillers to
335  * count towards the container's fail count.
336  *
337  * Most importantly, a Pacemaker Remote connection to a bundle container
338  * is a filler of the container, but can reside on a different node than the
339  * container itself. Counting its fail count on its node towards the
340  * container's fail count on that node could lead to attempting to stop the
341  * container on the wrong node.
342  */
343 
344  if (pcmk_is_set(flags, pe_fc_fillers) && rsc->fillers
345  && !pe_rsc_is_bundled(rsc)) {
346 
347  GList *gIter = NULL;
348 
349  for (gIter = rsc->fillers; gIter != NULL; gIter = gIter->next) {
350  pe_resource_t *filler = (pe_resource_t *) gIter->data;
351  time_t filler_last_failure = 0;
352 
353  failcount += pe_get_failcount(node, filler, &filler_last_failure,
354  flags, xml_op);
355 
356  if (last_failure && filler_last_failure > *last_failure) {
357  *last_failure = filler_last_failure;
358  }
359  }
360 
361  if (failcount > 0) {
362  crm_info("Container %s and the resources within it "
363  "have failed %s time%s on %s",
364  rsc->id, pcmk_readable_score(failcount),
365  pcmk__plural_s(failcount), pe__node_name(node));
366  }
367 
368  } else if (failcount > 0) {
369  crm_info("%s has failed %s time%s on %s",
370  rsc->id, pcmk_readable_score(failcount),
371  pcmk__plural_s(failcount), pe__node_name(node));
372  }
373 
374  return failcount;
375 }
376 
387 pe_action_t *
389  const char *reason, pe_working_set_t *data_set)
390 {
391  char *key = NULL;
392  pe_action_t *clear = NULL;
393 
394  CRM_CHECK(rsc && node && reason && data_set, return NULL);
395 
396  key = pcmk__op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
397  clear = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE,
398  data_set);
400  crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
401  rsc->id, pe__node_name(node), reason, clear->uuid);
402  return clear;
403 }
#define CRM_CHECK(expr, failure_action)
Definition: logging.h:235
A dumping ground.
#define crm_notice(fmt, args...)
Definition: logging.h:379
GHashTable * attrs
Definition: pe_types.h:257
const char * pcmk_readable_score(int score)
Return a displayable static string for a score value.
Definition: scores.c:86
const char * name
Definition: cib.c:24
#define XPATH_FMT
int pe__target_rc_from_xml(const xmlNode *xml_op)
Definition: unpack.c:4011
xmlNode * xml
Definition: pe_types.h:349
int char2score(const char *score)
Get the integer value of a score string.
Definition: scores.c:36
#define pe_rsc_unique
Definition: pe_types.h:278
#define XML_LRM_ATTR_INTERVAL
Definition: msg_xml.h:309
time_t get_effective_time(pe_working_set_t *data_set)
Definition: utils.c:434
#define XML_OP_ATTR_ON_FAIL
Definition: msg_xml.h:270
int crm_element_value_int(const xmlNode *data, const char *name, int *dest)
Retrieve the integer value of an XML attribute.
Definition: nvpair.c:532
#define PCMK__LAST_FAILURE_PREFIX
Definition: internal.h:314
#define XML_LRM_ATTR_TASK
Definition: msg_xml.h:315
#define CRM_OP_CLEAR_FAILCOUNT
Definition: crm.h:153
#define pe_warn(fmt...)
Definition: internal.h:57
int pcmk__scan_ll(const char *text, long long *result, long long default_value)
Definition: strings.c:97
#define XML_ATTR_OP
Definition: msg_xml.h:153
int crm_element_value_ms(const xmlNode *data, const char *name, guint *dest)
Retrieve the millisecond value of an XML attribute.
Definition: nvpair.c:589
#define crm_debug(fmt, args...)
Definition: logging.h:382
Utility functions.
#define XML_ATTR_ID
Definition: msg_xml.h:147
const char * crm_element_value(const xmlNode *data, const char *name)
Retrieve the value of an XML attribute.
Definition: nvpair.c:496
#define XML_CIB_TAG_RESOURCE
Definition: msg_xml.h:230
#define XML_BOOLEAN_TRUE
Definition: msg_xml.h:159
int failure_timeout
Definition: pe_types.h:368
char * clone_strip(const char *last_rsc_id)
Definition: unpack.c:1658
#define crm_trace(fmt, args...)
Definition: logging.h:383
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1
GHashTable * meta
Definition: pe_types.h:447
#define pcmk_is_set(g, f)
Convenience alias for pcmk_all_flags_set(), to check single flag.
Definition: util.h:121
struct pe_node_shared_s * details
Definition: pe_types.h:268
int pe_get_failcount(const pe_node_t *node, pe_resource_t *rsc, time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
Definition: failcounts.c:275
unsigned long long flags
Definition: pe_types.h:373
const char * uname
Definition: pe_types.h:232
pe_working_set_t * data_set
Wrappers for and extensions to libxml2.
#define XML_ATTR_TE_NOWAIT
Definition: msg_xml.h:427
char * clone_name
Definition: pe_types.h:348
#define PCMK__FAIL_COUNT_PREFIX
Definition: internal.h:313
char * uuid
Definition: pe_types.h:438
xmlNode * input
Definition: pe_types.h:160
char * pcmk__op_key(const char *rsc_id, const char *op_type, guint interval_ms)
Generate an operation key (RESOURCE_ACTION_INTERVAL)
Definition: operations.c:42
GList * fillers
Definition: pe_types.h:413
#define CRM_XS
Definition: logging.h:55
pe_action_t * pe__clear_failcount(pe_resource_t *rsc, const pe_node_t *node, const char *reason, pe_working_set_t *data_set)
Schedule a controller operation to clear a fail count.
Definition: failcounts.c:388
void add_hash_param(GHashTable *hash, const char *name, const char *value)
Definition: common.c:500
xmlXPathObjectPtr xpath_search(xmlNode *xml_top, const char *path)
Definition: xpath.c:139
char guint crm_parse_interval_spec(const char *input)
Parse milliseconds from a Pacemaker interval specification.
Definition: utils.c:271
xmlNode * getXpathResult(xmlXPathObjectPtr xpathObj, int index)
Definition: xpath.c:58
int compare_version(const char *version1, const char *version2)
Definition: utils.c:189
#define XML_LRM_ATTR_INTERVAL_MS
Definition: msg_xml.h:313
#define XML_ATTR_CRM_VERSION
Definition: msg_xml.h:131
#define pcmk__plural_s(i)
#define XML_LRM_ATTR_RC
Definition: msg_xml.h:326
pe_working_set_t * cluster
Definition: pe_types.h:353
int pcmk__add_scores(int score1, int score2)
Definition: scores.c:116
#define ID(x)
Definition: msg_xml.h:480
pe_action_t * custom_action(pe_resource_t *rsc, char *key, const char *task, const pe_node_t *on_node, gboolean optional, gboolean foo, pe_working_set_t *data_set)
Create or update an action object.
Definition: pe_actions.c:942
void freeXpathObject(xmlXPathObjectPtr xpathObj)
Definition: xpath.c:39
#define crm_info(fmt, args...)
Definition: logging.h:380
uint32_t version
Definition: remote.c:213
uint64_t flags
Definition: remote.c:215
char * id
Definition: pe_types.h:347