pacemaker  2.0.2-debe490
Scalable High-Availability cluster resource manager
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
failcounts.c
Go to the documentation of this file.
1 /*
2  * Copyright 2008-2018 Andrew Beekhof <andrew@beekhof.net>
3  *
4  * This source code is licensed under the GNU Lesser General Public License
5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
6  */
7 
8 #include <crm_internal.h>
9 
10 #include <sys/types.h>
11 #include <regex.h>
12 #include <glib.h>
13 
14 #include <crm/crm.h>
15 #include <crm/msg_xml.h>
16 #include <crm/common/xml.h>
17 #include <crm/common/util.h>
18 #include <crm/pengine/internal.h>
19 
20 static gboolean
21 is_matched_failure(const char *rsc_id, xmlNode *conf_op_xml,
22  xmlNode *lrm_op_xml)
23 {
24  gboolean matched = FALSE;
25  const char *conf_op_name = NULL;
26  const char *lrm_op_task = NULL;
27  const char *conf_op_interval_spec = NULL;
28  guint conf_op_interval_ms = 0;
29  guint lrm_op_interval_ms = 0;
30  const char *lrm_op_id = NULL;
31  char *last_failure_key = NULL;
32 
33  if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
34  return FALSE;
35  }
36 
37  // Get name and interval from configured op
38  conf_op_name = crm_element_value(conf_op_xml, "name");
39  conf_op_interval_spec = crm_element_value(conf_op_xml,
41  conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
42 
43  // Get name and interval from op history entry
44  lrm_op_task = crm_element_value(lrm_op_xml, XML_LRM_ATTR_TASK);
46  &lrm_op_interval_ms);
47 
48  if ((conf_op_interval_ms != lrm_op_interval_ms)
49  || safe_str_neq(conf_op_name, lrm_op_task)) {
50  return FALSE;
51  }
52 
53  lrm_op_id = ID(lrm_op_xml);
54  last_failure_key = generate_op_key(rsc_id, "last_failure", 0);
55 
56  if (safe_str_eq(last_failure_key, lrm_op_id)) {
57  matched = TRUE;
58 
59  } else {
60  char *expected_op_key = generate_op_key(rsc_id, conf_op_name,
61  conf_op_interval_ms);
62 
63  if (safe_str_eq(expected_op_key, lrm_op_id)) {
64  int rc = 0;
65  int target_rc = get_target_rc(lrm_op_xml);
66 
67  crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc);
68  if (rc != target_rc) {
69  matched = TRUE;
70  }
71  }
72  free(expected_op_key);
73  }
74 
75  free(last_failure_key);
76  return matched;
77 }
78 
79 static gboolean
80 block_failure(node_t *node, resource_t *rsc, xmlNode *xml_op,
81  pe_working_set_t *data_set)
82 {
83  char *xml_name = clone_strip(rsc->id);
84 
85  /* @TODO This xpath search occurs after template expansion, but it is unable
86  * to properly detect on-fail in id-ref, operation meta-attributes, or
87  * op_defaults, or evaluate rules.
88  *
89  * Also, on-fail defaults to block (in unpack_operation()) for stop actions
90  * when stonith is disabled.
91  *
92  * Ideally, we'd unpack the operation before this point, and pass in a
93  * meta-attributes table that takes all that into consideration.
94  */
95  char *xpath = crm_strdup_printf("//primitive[@id='%s']//op[@on-fail='block']",
96  xml_name);
97 
98  xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
99  gboolean should_block = FALSE;
100 
101  free(xpath);
102 
103  if (xpathObj) {
104  int max = numXpathResults(xpathObj);
105  int lpc = 0;
106 
107  for (lpc = 0; lpc < max; lpc++) {
108  xmlNode *pref = getXpathResult(xpathObj, lpc);
109 
110  if (xml_op) {
111  should_block = is_matched_failure(xml_name, pref, xml_op);
112  if (should_block) {
113  break;
114  }
115 
116  } else {
117  const char *conf_op_name = NULL;
118  const char *conf_op_interval_spec = NULL;
119  guint conf_op_interval_ms = 0;
120  char *lrm_op_xpath = NULL;
121  xmlXPathObject *lrm_op_xpathObj = NULL;
122 
123  // Get name and interval from configured op
124  conf_op_name = crm_element_value(pref, "name");
125  conf_op_interval_spec = crm_element_value(pref, XML_LRM_ATTR_INTERVAL);
126  conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
127 
128  lrm_op_xpath = crm_strdup_printf("//node_state[@uname='%s']"
129  "//lrm_resource[@id='%s']"
130  "/lrm_rsc_op[@operation='%s'][@interval='%u']",
131  node->details->uname, xml_name,
132  conf_op_name, conf_op_interval_ms);
133  lrm_op_xpathObj = xpath_search(data_set->input, lrm_op_xpath);
134 
135  free(lrm_op_xpath);
136 
137  if (lrm_op_xpathObj) {
138  int max2 = numXpathResults(lrm_op_xpathObj);
139  int lpc2 = 0;
140 
141  for (lpc2 = 0; lpc2 < max2; lpc2++) {
142  xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
143  lpc2);
144 
145  should_block = is_matched_failure(xml_name, pref,
146  lrm_op_xml);
147  if (should_block) {
148  break;
149  }
150  }
151  }
152  freeXpathObject(lrm_op_xpathObj);
153 
154  if (should_block) {
155  break;
156  }
157  }
158  }
159  }
160 
161  free(xml_name);
162  freeXpathObject(xpathObj);
163 
164  return should_block;
165 }
166 
176 static inline char *
177 rsc_fail_name(resource_t *rsc)
178 {
179  const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
180 
181  return is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name);
182 }
183 
197 static void
198 generate_fail_regex(const char *prefix, const char *rsc_name,
199  gboolean is_legacy, gboolean is_unique, regex_t *re)
200 {
201  char *pattern;
202 
203  /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than
204  * per-operation.
205  */
206  const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
207 
208  /* Ignore instance numbers for anything other than globally unique clones.
209  * Anonymous clone fail counts could contain an instance number if the
210  * clone was initially unique, failed, then was converted to anonymous.
211  * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained
212  * clone instance numbers.
213  */
214  const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
215 
216  pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
217  instance_pattern, op_pattern);
218  CRM_LOG_ASSERT(regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) == 0);
219  free(pattern);
220 }
221 
233 static void
234 generate_fail_regexes(resource_t *rsc, pe_working_set_t *data_set,
235  regex_t *failcount_re, regex_t *lastfailure_re)
236 {
237  char *rsc_name = rsc_fail_name(rsc);
238  const char *version = crm_element_value(data_set->input, XML_ATTR_CRM_VERSION);
239  gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
240 
241  generate_fail_regex(CRM_FAIL_COUNT_PREFIX, rsc_name, is_legacy,
242  is_set(rsc->flags, pe_rsc_unique), failcount_re);
243 
244  generate_fail_regex(CRM_LAST_FAILURE_PREFIX, rsc_name, is_legacy,
245  is_set(rsc->flags, pe_rsc_unique), lastfailure_re);
246 
247  free(rsc_name);
248 }
249 
250 int
251 pe_get_failcount(node_t *node, resource_t *rsc, time_t *last_failure,
252  uint32_t flags, xmlNode *xml_op, pe_working_set_t *data_set)
253 {
254  char *key = NULL;
255  const char *value = NULL;
256  regex_t failcount_re, lastfailure_re;
257  int failcount = 0;
258  time_t last = 0;
259  GHashTableIter iter;
260 
261  generate_fail_regexes(rsc, data_set, &failcount_re, &lastfailure_re);
262 
263  /* Resource fail count is sum of all matching operation fail counts */
264  g_hash_table_iter_init(&iter, node->details->attrs);
265  while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
266  if (regexec(&failcount_re, key, 0, NULL, 0) == 0) {
267  failcount = merge_weights(failcount, char2score(value));
268  } else if (regexec(&lastfailure_re, key, 0, NULL, 0) == 0) {
269  last = QB_MAX(last, crm_int_helper(value, NULL));
270  }
271  }
272 
273  regfree(&failcount_re);
274  regfree(&lastfailure_re);
275 
276  if ((failcount > 0) && (last > 0) && (last_failure != NULL)) {
277  *last_failure = last;
278  }
279 
280  /* If failure blocks the resource, disregard any failure timeout */
281  if ((failcount > 0) && rsc->failure_timeout
282  && block_failure(node, rsc, xml_op, data_set)) {
283 
284  pe_warn("Ignoring failure timeout %d for %s because it conflicts with on-fail=block",
285  rsc->failure_timeout, rsc->id);
286  rsc->failure_timeout = 0;
287  }
288 
289  /* If all failures have expired, ignore fail count */
290  if (is_set(flags, pe_fc_effective) && (failcount > 0) && (last > 0)
291  && rsc->failure_timeout) {
292 
293  time_t now = get_effective_time(data_set);
294 
295  if (now > (last + rsc->failure_timeout)) {
296  crm_debug("Failcount for %s on %s expired after %ds",
297  rsc->id, node->details->uname, rsc->failure_timeout);
298  failcount = 0;
299  }
300  }
301 
302  /* We never want the fail counts of a bundle container's fillers to
303  * count towards the container's fail count.
304  *
305  * Most importantly, a Pacemaker Remote connection to a bundle container
306  * is a filler of the container, but can reside on a different node than the
307  * container itself. Counting its fail count on its node towards the
308  * container's fail count on that node could lead to attempting to stop the
309  * container on the wrong node.
310  */
311 
312  if (is_set(flags, pe_fc_fillers) && rsc->fillers
313  && !pe_rsc_is_bundled(rsc)) {
314 
315  GListPtr gIter = NULL;
316 
317  for (gIter = rsc->fillers; gIter != NULL; gIter = gIter->next) {
318  resource_t *filler = (resource_t *) gIter->data;
319  time_t filler_last_failure = 0;
320 
321  failcount += pe_get_failcount(node, filler, &filler_last_failure,
322  flags, xml_op, data_set);
323 
324  if (last_failure && filler_last_failure > *last_failure) {
325  *last_failure = filler_last_failure;
326  }
327  }
328 
329  if (failcount > 0) {
330  char *score = score2char(failcount);
331 
332  crm_info("Container %s and the resources within it have failed %s times on %s",
333  rsc->id, score, node->details->uname);
334  free(score);
335  }
336 
337  } else if (failcount > 0) {
338  char *score = score2char(failcount);
339 
340  crm_info("%s has failed %s times on %s",
341  rsc->id, score, node->details->uname);
342  free(score);
343  }
344 
345 
346  return failcount;
347 }
348 
359 pe_action_t *
361  const char *reason, pe_working_set_t *data_set)
362 {
363  char *key = NULL;
364  action_t *clear = NULL;
365 
366  CRM_CHECK(rsc && node && reason && data_set, return NULL);
367 
368  key = generate_op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
369  clear = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE,
370  data_set);
372  crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
373  rsc->id, node->details->uname, reason, clear->uuid);
374  return clear;
375 }
#define CRM_CHECK(expr, failure_action)
Definition: logging.h:156
A dumping ground.
#define crm_notice(fmt, args...)
Definition: logging.h:242
GHashTable * attrs
Definition: pe_types.h:204
gboolean safe_str_neq(const char *a, const char *b)
Definition: strings.c:157
#define CRM_LAST_FAILURE_PREFIX
Definition: internal.h:133
xmlNode * xml
Definition: pe_types.h:286
long long crm_int_helper(const char *text, char **end_text)
Definition: strings.c:34
int char2score(const char *score)
Definition: utils.c:199
#define pe_rsc_unique
Definition: pe_types.h:223
#define XML_LRM_ATTR_INTERVAL
Definition: msg_xml.h:254
GListPtr fillers
Definition: pe_types.h:344
time_t get_effective_time(pe_working_set_t *data_set)
Definition: utils.c:1725
#define CRM_LOG_ASSERT(expr)
Definition: logging.h:142
guint crm_parse_interval_spec(const char *input)
Definition: utils.c:542
int get_target_rc(xmlNode *xml_op)
Definition: unpack.c:3016
int crm_element_value_int(const xmlNode *data, const char *name, int *dest)
Retrieve the integer value of an XML attribute.
Definition: nvpair.c:459
#define XML_LRM_ATTR_TASK
Definition: msg_xml.h:260
#define CRM_OP_CLEAR_FAILCOUNT
Definition: crm.h:129
#define pe_warn(fmt...)
Definition: internal.h:21
int crm_element_value_ms(const xmlNode *data, const char *name, guint *dest)
Retrieve the millisecond value of an XML attribute.
Definition: nvpair.c:484
#define crm_debug(fmt, args...)
Definition: logging.h:245
#define CRM_FAIL_COUNT_PREFIX
Definition: internal.h:132
Utility functions.
const char * crm_element_value(const xmlNode *data, const char *name)
Retrieve the value of an XML attribute.
Definition: nvpair.c:423
#define XML_BOOLEAN_TRUE
Definition: msg_xml.h:107
int failure_timeout
Definition: pe_types.h:306
char * clone_strip(const char *last_rsc_id)
Definition: unpack.c:1512
GHashTable * meta
Definition: pe_types.h:379
struct pe_node_shared_s * details
Definition: pe_types.h:213
unsigned long long flags
Definition: pe_types.h:311
const char * uname
Definition: pe_types.h:179
Wrappers for and extensions to libxml2.
#define XML_ATTR_TE_NOWAIT
Definition: msg_xml.h:359
char * clone_name
Definition: pe_types.h:285
action_t * custom_action(resource_t *rsc, char *key, const char *task, node_t *on_node, gboolean optional, gboolean foo, pe_working_set_t *data_set)
Definition: utils.c:441
char * uuid
Definition: pe_types.h:370
xmlNode * input
Definition: pe_types.h:113
#define CRM_XS
Definition: logging.h:34
void add_hash_param(GHashTable *hash, const char *name, const char *value)
Definition: common.c:406
xmlXPathObjectPtr xpath_search(xmlNode *xml_top, const char *path)
Definition: xpath.c:145
pe_action_t * pe__clear_failcount(pe_resource_t *rsc, pe_node_t *node, const char *reason, pe_working_set_t *data_set)
Schedule a controller operation to clear a fail count.
Definition: failcounts.c:360
xmlNode * getXpathResult(xmlXPathObjectPtr xpathObj, int index)
Definition: xpath.c:64
int compare_version(const char *version1, const char *version2)
Definition: utils.c:461
#define XML_LRM_ATTR_INTERVAL_MS
Definition: msg_xml.h:258
int merge_weights(int w1, int w2)
Definition: common.c:369
#define XML_ATTR_CRM_VERSION
Definition: msg_xml.h:79
int pe_get_failcount(node_t *node, resource_t *rsc, time_t *last_failure, uint32_t flags, xmlNode *xml_op, pe_working_set_t *data_set)
Definition: failcounts.c:251
#define XML_LRM_ATTR_RC
Definition: msg_xml.h:271
#define ID(x)
Definition: msg_xml.h:414
#define safe_str_eq(a, b)
Definition: util.h:59
void freeXpathObject(xmlXPathObjectPtr xpathObj)
Definition: xpath.c:45
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
GList * GListPtr
Definition: crm.h:192
#define crm_info(fmt, args...)
Definition: logging.h:243
char * generate_op_key(const char *rsc_id, const char *op_type, guint interval_ms)
Generate an operation key.
Definition: operations.c:39
uint32_t version
Definition: remote.c:146
uint64_t flags
Definition: remote.c:148
char * score2char(int score)
Definition: utils.c:251
char * id
Definition: pe_types.h:284