root/lib/pengine/failcounts.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. is_matched_failure
  2. block_failure
  3. rsc_fail_name
  4. generate_fail_regex
  5. generate_fail_regexes
  6. pe_get_failcount
  7. pe__clear_failcount

   1 /*
   2  * Copyright 2008-2023 the Pacemaker project contributors
   3  *
   4  * This source code is licensed under the GNU Lesser General Public License
   5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   6  */
   7 
   8 #include <crm_internal.h>
   9 
  10 #include <sys/types.h>
  11 #include <regex.h>
  12 #include <glib.h>
  13 
  14 #include <crm/crm.h>
  15 #include <crm/msg_xml.h>
  16 #include <crm/common/xml.h>
  17 #include <crm/common/util.h>
  18 #include <crm/pengine/internal.h>
  19 
  20 static gboolean
  21 is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
     /* [previous][next][first][last][top][bottom][index][help] */
  22                    const xmlNode *lrm_op_xml)
  23 {
  24     gboolean matched = FALSE;
  25     const char *conf_op_name = NULL;
  26     const char *lrm_op_task = NULL;
  27     const char *conf_op_interval_spec = NULL;
  28     guint conf_op_interval_ms = 0;
  29     guint lrm_op_interval_ms = 0;
  30     const char *lrm_op_id = NULL;
  31     char *last_failure_key = NULL;
  32 
  33     if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
  34         return FALSE;
  35     }
  36 
  37     // Get name and interval from configured op
  38     conf_op_name = crm_element_value(conf_op_xml, "name");
  39     conf_op_interval_spec = crm_element_value(conf_op_xml,
  40                                               XML_LRM_ATTR_INTERVAL);
  41     conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
  42 
  43     // Get name and interval from op history entry
  44     lrm_op_task = crm_element_value(lrm_op_xml, XML_LRM_ATTR_TASK);
  45     crm_element_value_ms(lrm_op_xml, XML_LRM_ATTR_INTERVAL_MS,
  46                          &lrm_op_interval_ms);
  47 
  48     if ((conf_op_interval_ms != lrm_op_interval_ms)
  49         || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
  50         return FALSE;
  51     }
  52 
  53     lrm_op_id = ID(lrm_op_xml);
  54     last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
  55 
  56     if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
  57         matched = TRUE;
  58 
  59     } else {
  60         char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
  61                                                 conf_op_interval_ms);
  62 
  63         if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
  64             int rc = 0;
  65             int target_rc = pe__target_rc_from_xml(lrm_op_xml);
  66 
  67             crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc);
  68             if (rc != target_rc) {
  69                 matched = TRUE;
  70             }
  71         }
  72         free(expected_op_key);
  73     }
  74 
  75     free(last_failure_key);
  76     return matched;
  77 }
  78 
  79 static gboolean
  80 block_failure(const pe_node_t *node, pe_resource_t *rsc, const xmlNode *xml_op)
     /* [previous][next][first][last][top][bottom][index][help] */
  81 {
  82     char *xml_name = clone_strip(rsc->id);
  83 
  84     /* @TODO This xpath search occurs after template expansion, but it is unable
  85      * to properly detect on-fail in id-ref, operation meta-attributes, or
  86      * op_defaults, or evaluate rules.
  87      *
  88      * Also, on-fail defaults to block (in unpack_operation()) for stop actions
  89      * when stonith is disabled.
  90      *
  91      * Ideally, we'd unpack the operation before this point, and pass in a
  92      * meta-attributes table that takes all that into consideration.
  93      */
  94     char *xpath = crm_strdup_printf("//" XML_CIB_TAG_RESOURCE
  95                                     "[@" XML_ATTR_ID "='%s']"
  96                                     "//" XML_ATTR_OP
  97                                     "[@" XML_OP_ATTR_ON_FAIL "='block']",
  98                                     xml_name);
  99 
 100     xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
 101     gboolean should_block = FALSE;
 102 
 103     free(xpath);
 104 
 105     if (xpathObj) {
 106         int max = numXpathResults(xpathObj);
 107         int lpc = 0;
 108 
 109         for (lpc = 0; lpc < max; lpc++) {
 110             xmlNode *pref = getXpathResult(xpathObj, lpc);
 111 
 112             if (xml_op) {
 113                 should_block = is_matched_failure(xml_name, pref, xml_op);
 114                 if (should_block) {
 115                     break;
 116                 }
 117 
 118             } else {
 119                 const char *conf_op_name = NULL;
 120                 const char *conf_op_interval_spec = NULL;
 121                 guint conf_op_interval_ms = 0;
 122                 char *lrm_op_xpath = NULL;
 123                 xmlXPathObject *lrm_op_xpathObj = NULL;
 124 
 125                 // Get name and interval from configured op
 126                 conf_op_name = crm_element_value(pref, "name");
 127                 conf_op_interval_spec = crm_element_value(pref, XML_LRM_ATTR_INTERVAL);
 128                 conf_op_interval_ms = crm_parse_interval_spec(conf_op_interval_spec);
 129 
 130 #define XPATH_FMT "//" XML_CIB_TAG_STATE "[@" XML_ATTR_UNAME "='%s']"       \
 131                   "//" XML_LRM_TAG_RESOURCE "[@" XML_ATTR_ID "='%s']"       \
 132                   "/" XML_LRM_TAG_RSC_OP "[@" XML_LRM_ATTR_TASK "='%s']"    \
 133                   "[@" XML_LRM_ATTR_INTERVAL "='%u']"
 134 
 135                 lrm_op_xpath = crm_strdup_printf(XPATH_FMT,
 136                                                  node->details->uname, xml_name,
 137                                                  conf_op_name,
 138                                                  conf_op_interval_ms);
 139                 lrm_op_xpathObj = xpath_search(rsc->cluster->input, lrm_op_xpath);
 140 
 141                 free(lrm_op_xpath);
 142 
 143                 if (lrm_op_xpathObj) {
 144                     int max2 = numXpathResults(lrm_op_xpathObj);
 145                     int lpc2 = 0;
 146 
 147                     for (lpc2 = 0; lpc2 < max2; lpc2++) {
 148                         xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
 149                                                              lpc2);
 150 
 151                         should_block = is_matched_failure(xml_name, pref,
 152                                                           lrm_op_xml);
 153                         if (should_block) {
 154                             break;
 155                         }
 156                     }
 157                 }
 158                 freeXpathObject(lrm_op_xpathObj);
 159 
 160                 if (should_block) {
 161                     break;
 162                 }
 163             }
 164         }
 165     }
 166 
 167     free(xml_name);
 168     freeXpathObject(xpathObj);
 169 
 170     return should_block;
 171 }
 172 
 173 /*!
 174  * \internal
 175  * \brief Get resource name as used in failure-related node attributes
 176  *
 177  * \param[in] rsc  Resource to check
 178  *
 179  * \return Newly allocated string containing resource's fail name
 180  * \note The caller is responsible for freeing the result.
 181  */
 182 static inline char *
 183 rsc_fail_name(const pe_resource_t *rsc)
     /* [previous][next][first][last][top][bottom][index][help] */
 184 {
 185     const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
 186 
 187     return pcmk_is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name);
 188 }
 189 
 190 /*!
 191  * \internal
 192  * \brief Compile regular expression to match a failure-related node attribute
 193  *
 194  * \param[in]  prefix    Attribute prefix to match
 195  * \param[in]  rsc_name  Resource name to match as used in failure attributes
 196  * \param[in]  is_legacy Whether DC uses per-resource fail counts
 197  * \param[in]  is_unique Whether the resource is a globally unique clone
 198  * \param[out] re        Where to store resulting regular expression
 199  *
 200  * \return Standard Pacemaker return code
 201  * \note Fail attributes are named like PREFIX-RESOURCE#OP_INTERVAL.
 202  *       The caller is responsible for freeing re with regfree().
 203  */
 204 static int
 205 generate_fail_regex(const char *prefix, const char *rsc_name,
     /* [previous][next][first][last][top][bottom][index][help] */
 206                     gboolean is_legacy, gboolean is_unique, regex_t *re)
 207 {
 208     char *pattern;
 209 
 210     /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than
 211      * per-operation.
 212      */
 213     const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
 214 
 215     /* Ignore instance numbers for anything other than globally unique clones.
 216      * Anonymous clone fail counts could contain an instance number if the
 217      * clone was initially unique, failed, then was converted to anonymous.
 218      * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained
 219      * clone instance numbers.
 220      */
 221     const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
 222 
 223     pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
 224                                 instance_pattern, op_pattern);
 225     if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) {
 226         free(pattern);
 227         return EINVAL;
 228     }
 229 
 230     free(pattern);
 231     return pcmk_rc_ok;
 232 }
 233 
 234 /*!
 235  * \internal
 236  * \brief Compile regular expressions to match failure-related node attributes
 237  *
 238  * \param[in]  rsc             Resource being checked for failures
 239  * \param[in]  data_set        Data set (for CRM feature set version)
 240  * \param[out] failcount_re    Storage for regular expression for fail count
 241  * \param[out] lastfailure_re  Storage for regular expression for last failure
 242  *
 243  * \return Standard Pacemaker return code
 244  * \note On success, the caller is responsible for freeing the expressions with
 245  *       regfree().
 246  */
 247 static int
 248 generate_fail_regexes(const pe_resource_t *rsc,
     /* [previous][next][first][last][top][bottom][index][help] */
 249                       const pe_working_set_t *data_set,
 250                       regex_t *failcount_re, regex_t *lastfailure_re)
 251 {
 252     char *rsc_name = rsc_fail_name(rsc);
 253     const char *version = crm_element_value(data_set->input, XML_ATTR_CRM_VERSION);
 254     gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
 255     int rc = pcmk_rc_ok;
 256 
 257     if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name, is_legacy,
 258                             pcmk_is_set(rsc->flags, pe_rsc_unique),
 259                             failcount_re) != pcmk_rc_ok) {
 260         rc = EINVAL;
 261 
 262     } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
 263                                    is_legacy,
 264                                    pcmk_is_set(rsc->flags, pe_rsc_unique),
 265                                    lastfailure_re) != pcmk_rc_ok) {
 266         rc = EINVAL;
 267         regfree(failcount_re);
 268     }
 269 
 270     free(rsc_name);
 271     return rc;
 272 }
 273 
 274 int
 275 pe_get_failcount(const pe_node_t *node, pe_resource_t *rsc,
     /* [previous][next][first][last][top][bottom][index][help] */
 276                  time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
 277 {
 278     char *key = NULL;
 279     const char *value = NULL;
 280     regex_t failcount_re, lastfailure_re;
 281     int failcount = 0;
 282     time_t last = 0;
 283     GHashTableIter iter;
 284 
 285     CRM_CHECK(generate_fail_regexes(rsc, rsc->cluster, &failcount_re,
 286                                     &lastfailure_re) == pcmk_rc_ok,
 287               return 0);
 288 
 289     /* Resource fail count is sum of all matching operation fail counts */
 290     g_hash_table_iter_init(&iter, node->details->attrs);
 291     while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
 292         if (regexec(&failcount_re, key, 0, NULL, 0) == 0) {
 293             failcount = pcmk__add_scores(failcount, char2score(value));
 294             crm_trace("Added %s (%s) to %s fail count (now %s)",
 295                       key, value, rsc->id, pcmk_readable_score(failcount));
 296         } else if (regexec(&lastfailure_re, key, 0, NULL, 0) == 0) {
 297             long long last_ll;
 298 
 299             if (pcmk__scan_ll(value, &last_ll, 0LL) == pcmk_rc_ok) {
 300                 last = (time_t) QB_MAX(last, last_ll);
 301             }
 302         }
 303     }
 304 
 305     regfree(&failcount_re);
 306     regfree(&lastfailure_re);
 307 
 308     if ((failcount > 0) && (last > 0) && (last_failure != NULL)) {
 309         *last_failure = last;
 310     }
 311 
 312     /* If failure blocks the resource, disregard any failure timeout */
 313     if ((failcount > 0) && rsc->failure_timeout
 314         && block_failure(node, rsc, xml_op)) {
 315 
 316         pe_warn("Ignoring failure timeout %d for %s because it conflicts with on-fail=block",
 317                 rsc->failure_timeout, rsc->id);
 318         rsc->failure_timeout = 0;
 319     }
 320 
 321     /* If all failures have expired, ignore fail count */
 322     if (pcmk_is_set(flags, pe_fc_effective) && (failcount > 0) && (last > 0)
 323         && rsc->failure_timeout) {
 324 
 325         time_t now = get_effective_time(rsc->cluster);
 326 
 327         if (now > (last + rsc->failure_timeout)) {
 328             crm_debug("Failcount for %s on %s expired after %ds",
 329                       rsc->id, pe__node_name(node), rsc->failure_timeout);
 330             failcount = 0;
 331         }
 332     }
 333 
 334     /* We never want the fail counts of a bundle container's fillers to
 335      * count towards the container's fail count.
 336      *
 337      * Most importantly, a Pacemaker Remote connection to a bundle container
 338      * is a filler of the container, but can reside on a different node than the
 339      * container itself. Counting its fail count on its node towards the
 340      * container's fail count on that node could lead to attempting to stop the
 341      * container on the wrong node.
 342      */
 343 
 344     if (pcmk_is_set(flags, pe_fc_fillers) && rsc->fillers
 345         && !pe_rsc_is_bundled(rsc)) {
 346 
 347         GList *gIter = NULL;
 348 
 349         for (gIter = rsc->fillers; gIter != NULL; gIter = gIter->next) {
 350             pe_resource_t *filler = (pe_resource_t *) gIter->data;
 351             time_t filler_last_failure = 0;
 352 
 353             failcount += pe_get_failcount(node, filler, &filler_last_failure,
 354                                           flags, xml_op);
 355 
 356             if (last_failure && filler_last_failure > *last_failure) {
 357                 *last_failure = filler_last_failure;
 358             }
 359         }
 360 
 361         if (failcount > 0) {
 362             crm_info("Container %s and the resources within it "
 363                      "have failed %s time%s on %s",
 364                      rsc->id, pcmk_readable_score(failcount),
 365                      pcmk__plural_s(failcount), pe__node_name(node));
 366         }
 367 
 368     } else if (failcount > 0) {
 369         crm_info("%s has failed %s time%s on %s",
 370                  rsc->id, pcmk_readable_score(failcount),
 371                  pcmk__plural_s(failcount), pe__node_name(node));
 372     }
 373 
 374     return failcount;
 375 }
 376 
 377 /*!
 378  * \brief Schedule a controller operation to clear a fail count
 379  *
 380  * \param[in,out] rsc       Resource with failure
 381  * \param[in]     node      Node failure occurred on
 382  * \param[in]     reason    Readable description why needed (for logging)
 383  * \param[in,out] data_set  Working set for cluster
 384  *
 385  * \return Scheduled action
 386  */
 387 pe_action_t *
 388 pe__clear_failcount(pe_resource_t *rsc, const pe_node_t *node,
     /* [previous][next][first][last][top][bottom][index][help] */
 389                     const char *reason, pe_working_set_t *data_set)
 390 {
 391     char *key = NULL;
 392     pe_action_t *clear = NULL;
 393 
 394     CRM_CHECK(rsc && node && reason && data_set, return NULL);
 395 
 396     key = pcmk__op_key(rsc->id, CRM_OP_CLEAR_FAILCOUNT, 0);
 397     clear = custom_action(rsc, key, CRM_OP_CLEAR_FAILCOUNT, node, FALSE, TRUE,
 398                           data_set);
 399     add_hash_param(clear->meta, XML_ATTR_TE_NOWAIT, XML_BOOLEAN_TRUE);
 400     crm_notice("Clearing failure of %s on %s because %s " CRM_XS " %s",
 401                rsc->id, pe__node_name(node), reason, clear->uuid);
 402     return clear;
 403 }

/* [previous][next][first][last][top][bottom][index][help] */