root/lib/pengine/failcounts.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. is_matched_failure
  2. block_failure
  3. rsc_fail_name
  4. generate_fail_regex
  5. generate_fail_regexes
  6. update_failcount_for_attr
  7. update_launched_failcount
  8. pe_get_failcount
  9. pe__clear_failcount

   1 /*
   2  * Copyright 2008-2025 the Pacemaker project contributors
   3  *
   4  * This source code is licensed under the GNU Lesser General Public License
   5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   6  */
   7 
#include <crm_internal.h>

#include <stdlib.h>                 // malloc(), free()
#include <string.h>                 // strlen(), strchr()
#include <sys/types.h>
#include <regex.h>

#include <glib.h>
#include <libxml/xpath.h>           // xmlXPathObject, etc.

#include <crm/crm.h>
#include <crm/common/xml.h>
#include <crm/common/util.h>
#include <crm/pengine/internal.h>
  20 
  21 static gboolean
  22 is_matched_failure(const char *rsc_id, const xmlNode *conf_op_xml,
     /* [previous][next][first][last][top][bottom][index][help] */
  23                    const xmlNode *lrm_op_xml)
  24 {
  25     gboolean matched = FALSE;
  26     const char *conf_op_name = NULL;
  27     const char *lrm_op_task = NULL;
  28     const char *conf_op_interval_spec = NULL;
  29     guint conf_op_interval_ms = 0;
  30     guint lrm_op_interval_ms = 0;
  31     const char *lrm_op_id = NULL;
  32     char *last_failure_key = NULL;
  33 
  34     if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
  35         return FALSE;
  36     }
  37 
  38     // Get name and interval from configured op
  39     conf_op_name = crm_element_value(conf_op_xml, PCMK_XA_NAME);
  40     conf_op_interval_spec = crm_element_value(conf_op_xml, PCMK_META_INTERVAL);
  41     pcmk_parse_interval_spec(conf_op_interval_spec, &conf_op_interval_ms);
  42 
  43     // Get name and interval from op history entry
  44     lrm_op_task = crm_element_value(lrm_op_xml, PCMK_XA_OPERATION);
  45     crm_element_value_ms(lrm_op_xml, PCMK_META_INTERVAL, &lrm_op_interval_ms);
  46 
  47     if ((conf_op_interval_ms != lrm_op_interval_ms)
  48         || !pcmk__str_eq(conf_op_name, lrm_op_task, pcmk__str_casei)) {
  49         return FALSE;
  50     }
  51 
  52     lrm_op_id = pcmk__xe_id(lrm_op_xml);
  53     last_failure_key = pcmk__op_key(rsc_id, "last_failure", 0);
  54 
  55     if (pcmk__str_eq(last_failure_key, lrm_op_id, pcmk__str_casei)) {
  56         matched = TRUE;
  57 
  58     } else {
  59         char *expected_op_key = pcmk__op_key(rsc_id, conf_op_name,
  60                                                 conf_op_interval_ms);
  61 
  62         if (pcmk__str_eq(expected_op_key, lrm_op_id, pcmk__str_casei)) {
  63             int rc = 0;
  64             int target_rc = pe__target_rc_from_xml(lrm_op_xml);
  65 
  66             crm_element_value_int(lrm_op_xml, PCMK__XA_RC_CODE, &rc);
  67             if (rc != target_rc) {
  68                 matched = TRUE;
  69             }
  70         }
  71         free(expected_op_key);
  72     }
  73 
  74     free(last_failure_key);
  75     return matched;
  76 }
  77 
  78 static gboolean
  79 block_failure(const pcmk_node_t *node, pcmk_resource_t *rsc,
     /* [previous][next][first][last][top][bottom][index][help] */
  80               const xmlNode *xml_op)
  81 {
  82     char *xml_name = clone_strip(rsc->id);
  83 
  84     /* @TODO This xpath search occurs after template expansion, but it is unable
  85      * to properly detect on-fail in id-ref, operation meta-attributes, or
  86      * op_defaults, or evaluate rules.
  87      *
  88      * Also, PCMK_META_ON_FAIL defaults to PCMK_VALUE_BLOCK (in
  89      * unpack_operation()) for stop actions when stonith is disabled.
  90      *
  91      * Ideally, we'd unpack the operation before this point, and pass in a
  92      * meta-attributes table that takes all that into consideration.
  93      */
  94     char *xpath = crm_strdup_printf("//" PCMK_XE_PRIMITIVE
  95                                     "[@" PCMK_XA_ID "='%s']"
  96                                     "//" PCMK_XE_OP
  97                                     "[@" PCMK_META_ON_FAIL
  98                                         "='" PCMK_VALUE_BLOCK "']",
  99                                     xml_name);
 100 
 101     xmlXPathObject *xpathObj = pcmk__xpath_search(rsc->priv->xml->doc, xpath);
 102     gboolean should_block = FALSE;
 103 
 104     free(xpath);
 105 
 106     if (xpathObj) {
 107         int max = pcmk__xpath_num_results(xpathObj);
 108         int lpc = 0;
 109 
 110         for (lpc = 0; lpc < max; lpc++) {
 111             xmlNode *pref = pcmk__xpath_result(xpathObj, lpc);
 112 
 113             if (xml_op) {
 114                 should_block = is_matched_failure(xml_name, pref, xml_op);
 115                 if (should_block) {
 116                     break;
 117                 }
 118 
 119             } else {
 120                 const char *conf_op_name = NULL;
 121                 const char *conf_op_interval_spec = NULL;
 122                 guint conf_op_interval_ms = 0;
 123                 pcmk_scheduler_t *scheduler = rsc->priv->scheduler;
 124                 char *lrm_op_xpath = NULL;
 125                 xmlXPathObject *lrm_op_xpathObj = NULL;
 126 
 127                 // Get name and interval from configured op
 128                 conf_op_name = crm_element_value(pref, PCMK_XA_NAME);
 129                 conf_op_interval_spec = crm_element_value(pref,
 130                                                           PCMK_META_INTERVAL);
 131                 pcmk_parse_interval_spec(conf_op_interval_spec,
 132                                          &conf_op_interval_ms);
 133 
 134 #define XPATH_FMT "//" PCMK__XE_NODE_STATE "[@" PCMK_XA_UNAME "='%s']"      \
 135                   "//" PCMK__XE_LRM_RESOURCE "[@" PCMK_XA_ID "='%s']"       \
 136                   "/" PCMK__XE_LRM_RSC_OP "[@" PCMK_XA_OPERATION "='%s']"   \
 137                   "[@" PCMK_META_INTERVAL "='%u']"
 138 
 139                 lrm_op_xpath = crm_strdup_printf(XPATH_FMT,
 140                                                  node->priv->name, xml_name,
 141                                                  conf_op_name,
 142                                                  conf_op_interval_ms);
 143                 lrm_op_xpathObj = pcmk__xpath_search(scheduler->input->doc,
 144                                                      lrm_op_xpath);
 145 
 146                 free(lrm_op_xpath);
 147 
 148                 if (lrm_op_xpathObj) {
 149                     int max2 = pcmk__xpath_num_results(lrm_op_xpathObj);
 150                     int lpc2 = 0;
 151 
 152                     for (lpc2 = 0; lpc2 < max2; lpc2++) {
 153                         xmlNode *lrm_op_xml = NULL;
 154 
 155                         lrm_op_xml = pcmk__xpath_result(lrm_op_xpathObj, lpc2);
 156                         should_block = is_matched_failure(xml_name, pref,
 157                                                           lrm_op_xml);
 158                         if (should_block) {
 159                             break;
 160                         }
 161                     }
 162                 }
 163                 xmlXPathFreeObject(lrm_op_xpathObj);
 164 
 165                 if (should_block) {
 166                     break;
 167                 }
 168             }
 169         }
 170     }
 171 
 172     free(xml_name);
 173     xmlXPathFreeObject(xpathObj);
 174 
 175     return should_block;
 176 }
 177 
 178 /*!
 179  * \internal
 180  * \brief Get resource name as used in failure-related node attributes
 181  *
 182  * \param[in] rsc  Resource to check
 183  *
 184  * \return Newly allocated string containing resource's fail name
 185  * \note The caller is responsible for freeing the result.
 186  */
 187 static inline char *
 188 rsc_fail_name(const pcmk_resource_t *rsc)
     /* [previous][next][first][last][top][bottom][index][help] */
 189 {
 190     const char *name = pcmk__s(rsc->priv->history_id, rsc->id);
 191 
 192     return pcmk_is_set(rsc->flags, pcmk__rsc_unique)? strdup(name) : clone_strip(name);
 193 }
 194 
 195 /*!
 196  * \internal
 197  * \brief Compile regular expression to match a failure-related node attribute
 198  *
 199  * \param[in]  prefix    Attribute prefix to match
 200  * \param[in]  rsc_name  Resource name to match as used in failure attributes
 201  * \param[in]  is_unique Whether the resource is a globally unique clone
 202  * \param[out] re        Where to store resulting regular expression
 203  *
 204  * \return Standard Pacemaker return code
 205  * \note Fail attributes are named like PREFIX-RESOURCE#OP_INTERVAL.
 206  *       The caller is responsible for freeing re with regfree().
 207  */
 208 static int
 209 generate_fail_regex(const char *prefix, const char *rsc_name, bool is_unique,
     /* [previous][next][first][last][top][bottom][index][help] */
 210                     regex_t *re)
 211 {
 212     char *pattern = NULL;
 213     const char *op_pattern = "#.+_[0-9]+";
 214 
 215     /* Ignore instance numbers for anything other than globally unique clones.
 216      * Anonymous clone fail counts could contain an instance number if the
 217      * clone was initially unique, failed, then was converted to anonymous.
 218      */
 219     const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
 220 
 221     pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
 222                                 instance_pattern, op_pattern);
 223     if (regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) != 0) {
 224         free(pattern);
 225         return EINVAL;
 226     }
 227 
 228     free(pattern);
 229     return pcmk_rc_ok;
 230 }
 231 
 232 /*!
 233  * \internal
 234  * \brief Compile regular expressions to match failure-related node attributes
 235  *
 236  * \param[in]  rsc             Resource being checked for failures
 237  * \param[out] failcount_re    Storage for regular expression for fail count
 238  * \param[out] lastfailure_re  Storage for regular expression for last failure
 239  *
 240  * \return Standard Pacemaker return code
 241  * \note On success, the caller is responsible for freeing the expressions with
 242  *       regfree().
 243  */
 244 static int
 245 generate_fail_regexes(const pcmk_resource_t *rsc, regex_t *failcount_re,
     /* [previous][next][first][last][top][bottom][index][help] */
 246                       regex_t *lastfailure_re)
 247 {
 248     int rc = pcmk_rc_ok;
 249     char *rsc_name = rsc_fail_name(rsc);
 250 
 251     if (generate_fail_regex(PCMK__FAIL_COUNT_PREFIX, rsc_name,
 252                             pcmk_is_set(rsc->flags, pcmk__rsc_unique),
 253                             failcount_re) != pcmk_rc_ok) {
 254         rc = EINVAL;
 255 
 256     } else if (generate_fail_regex(PCMK__LAST_FAILURE_PREFIX, rsc_name,
 257                                    pcmk_is_set(rsc->flags, pcmk__rsc_unique),
 258                                    lastfailure_re) != pcmk_rc_ok) {
 259         rc = EINVAL;
 260         regfree(failcount_re);
 261     }
 262 
 263     free(rsc_name);
 264     return rc;
 265 }
 266 
 267 // Data for fail-count-related iterators
 268 struct failcount_data {
 269     const pcmk_node_t *node;// Node to check for fail count
 270     pcmk_resource_t *rsc;     // Resource to check for fail count
 271     uint32_t flags;         // Fail count flags
 272     const xmlNode *xml_op;  // History entry for expiration purposes (or NULL)
 273     regex_t failcount_re;   // Fail count regular expression to match
 274     regex_t lastfailure_re; // Last failure regular expression to match
 275     int failcount;          // Fail count so far
 276     time_t last_failure;    // Time of most recent failure so far
 277 };
 278 
 279 /*!
 280  * \internal
 281  * \brief Update fail count and last failure appropriately for a node attribute
 282  *
 283  * \param[in] key        Node attribute name
 284  * \param[in] value      Node attribute value
 285  * \param[in] user_data  Fail count data to update
 286  */
 287 static void
 288 update_failcount_for_attr(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 289 {
 290     struct failcount_data *fc_data = user_data;
 291 
 292     // If this is a matching fail count attribute, update fail count
 293     if (regexec(&(fc_data->failcount_re), (const char *) key, 0, NULL, 0) == 0) {
 294         int score = 0;
 295         int rc = pcmk_parse_score(value, &score, 0);
 296 
 297         if (rc != pcmk_rc_ok) {
 298             crm_warn("Ignoring %s for %s "
 299                      "because '%s' is not a valid fail count: %s",
 300                      (const char *) key, pcmk__node_name(fc_data->node),
 301                      value, pcmk_rc_str(rc));
 302             return;
 303         }
 304         fc_data->failcount = pcmk__add_scores(fc_data->failcount, score);
 305         pcmk__rsc_trace(fc_data->rsc, "Added %s (%s) to %s fail count (now %s)",
 306                         (const char *) key, (const char *) value,
 307                         fc_data->rsc->id,
 308                         pcmk_readable_score(fc_data->failcount));
 309         return;
 310     }
 311 
 312     // If this is a matching last failure attribute, update last failure
 313     if (regexec(&(fc_data->lastfailure_re), (const char *) key, 0, NULL,
 314                 0) == 0) {
 315         long long last_ll;
 316         int rc = pcmk__scan_ll(value, &last_ll, 0LL);
 317 
 318         if (rc != pcmk_rc_ok) {
 319             crm_info("Ignoring invalid value '%s' for %s: %s",
 320                      (const char *) value, (const char *) key, pcmk_rc_str(rc));
 321             return;
 322         }
 323         fc_data->last_failure = (time_t) QB_MAX(fc_data->last_failure, last_ll);
 324     }
 325 }
 326 
 327 /*!
 328  * \internal
 329  * \brief Update fail count and last failure appropriately for launched resource
 330  *
 331  * \param[in] data       Launched resource
 332  * \param[in] user_data  Fail count data to update
 333  */
 334 static void
 335 update_launched_failcount(gpointer data, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 336 {
 337     pcmk_resource_t *launched = data;
 338     struct failcount_data *fc_data = user_data;
 339     time_t launched_last_failure = 0;
 340 
 341     fc_data->failcount += pe_get_failcount(fc_data->node, launched,
 342                                            &launched_last_failure,
 343                                            fc_data->flags, fc_data->xml_op);
 344     fc_data->last_failure = QB_MAX(fc_data->last_failure, launched_last_failure);
 345 }
 346 
 347 #define readable_expiration(rsc)    \
 348     pcmk__readable_interval((rsc)->priv->failure_expiration_ms)
 349 
 350 /*!
 351  * \internal
 352  * \brief Get a resource's fail count on a node
 353  *
 354  * \param[in]     node          Node to check
 355  * \param[in,out] rsc           Resource to check
 356  * \param[out]    last_failure  If not NULL, where to set time of most recent
 357  *                              failure of \p rsc on \p node
 358  * \param[in]     flags         Group of enum pcmk__fc_flags
 359  * \param[in]     xml_op        If not NULL, consider only the action in this
 360  *                              history entry when determining whether on-fail
 361  *                              is configured as "blocked", otherwise consider
 362  *                              all actions configured for \p rsc
 363  *
 364  * \return Fail count for \p rsc on \p node according to \p flags
 365  */
 366 int
 367 pe_get_failcount(const pcmk_node_t *node, pcmk_resource_t *rsc,
     /* [previous][next][first][last][top][bottom][index][help] */
 368                  time_t *last_failure, uint32_t flags, const xmlNode *xml_op)
 369 {
 370     struct failcount_data fc_data = {
 371         .node = node,
 372         .rsc = rsc,
 373         .flags = flags,
 374         .xml_op = xml_op,
 375         .failcount = 0,
 376         .last_failure = (time_t) 0,
 377     };
 378 
 379     // Calculate resource failcount as sum of all matching operation failcounts
 380     CRM_CHECK(generate_fail_regexes(rsc, &fc_data.failcount_re,
 381                                     &fc_data.lastfailure_re) == pcmk_rc_ok,
 382               return 0);
 383     g_hash_table_foreach(node->priv->attrs, update_failcount_for_attr,
 384                          &fc_data);
 385     regfree(&(fc_data.failcount_re));
 386     regfree(&(fc_data.lastfailure_re));
 387 
 388     // If failure blocks the resource, disregard any failure timeout
 389     if ((fc_data.failcount > 0) && (rsc->priv->failure_expiration_ms > 0)
 390         && block_failure(node, rsc, xml_op)) {
 391 
 392         pcmk__config_warn("Ignoring failure timeout (%s) for %s "
 393                           "because it conflicts with "
 394                           PCMK_META_ON_FAIL "=" PCMK_VALUE_BLOCK,
 395                           readable_expiration(rsc), rsc->id);
 396         rsc->priv->failure_expiration_ms = 0;
 397     }
 398 
 399     // If all failures have expired, ignore fail count
 400     if (pcmk_is_set(flags, pcmk__fc_effective) && (fc_data.failcount > 0)
 401         && (fc_data.last_failure > 0)
 402         && (rsc->priv->failure_expiration_ms > 0)) {
 403 
 404         time_t now = pcmk__scheduler_epoch_time(rsc->priv->scheduler);
 405         const guint expiration = pcmk__timeout_ms2s(rsc->priv->failure_expiration_ms);
 406 
 407         if (now > (fc_data.last_failure + expiration)) {
 408             pcmk__rsc_debug(rsc, "Failcount for %s on %s expired after %s",
 409                             rsc->id, pcmk__node_name(node),
 410                             readable_expiration(rsc));
 411             fc_data.failcount = 0;
 412         }
 413     }
 414 
 415     /* Add the fail count of any launched resources, except that we never want
 416      * the fail counts of a bundle container's launched resources to count
 417      * towards the container's fail count.
 418      *
 419      * Most importantly, a Pacemaker Remote connection to a bundle container
 420      * is launched by the container, but can reside on a different node than the
 421      * container itself. Counting its fail count on its node towards the
 422      * container's fail count on that node could lead to attempting to stop the
 423      * container on the wrong node.
 424      */
 425     if (pcmk_is_set(flags, pcmk__fc_launched)
 426         && (rsc->priv->launched != NULL) && !pcmk__is_bundled(rsc)) {
 427 
 428         g_list_foreach(rsc->priv->launched, update_launched_failcount,
 429                        &fc_data);
 430         if (fc_data.failcount > 0) {
 431             pcmk__rsc_info(rsc,
 432                            "Container %s and the resources within it "
 433                            "have failed %s time%s on %s",
 434                            rsc->id, pcmk_readable_score(fc_data.failcount),
 435                            pcmk__plural_s(fc_data.failcount),
 436                            pcmk__node_name(node));
 437         }
 438 
 439     } else if (fc_data.failcount > 0) {
 440         pcmk__rsc_info(rsc, "%s has failed %s time%s on %s",
 441                        rsc->id, pcmk_readable_score(fc_data.failcount),
 442                        pcmk__plural_s(fc_data.failcount),
 443                        pcmk__node_name(node));
 444     }
 445 
 446     if (last_failure != NULL) {
 447         if ((fc_data.failcount > 0) && (fc_data.last_failure > 0)) {
 448             *last_failure = fc_data.last_failure;
 449         } else  {
 450             *last_failure = 0;
 451         }
 452     }
 453     return fc_data.failcount;
 454 }
 455 
 456 /*!
 457  * \brief Schedule a controller operation to clear a fail count
 458  *
 459  * \param[in,out] rsc        Resource with failure
 460  * \param[in]     node       Node failure occurred on
 461  * \param[in]     reason     Readable description why needed (for logging)
 462  * \param[in,out] scheduler  Scheduler data cluster
 463  *
 464  * \return Scheduled action
 465  */
 466 pcmk_action_t *
 467 pe__clear_failcount(pcmk_resource_t *rsc, const pcmk_node_t *node,
     /* [previous][next][first][last][top][bottom][index][help] */
 468                     const char *reason, pcmk_scheduler_t *scheduler)
 469 {
 470     char *key = NULL;
 471     pcmk_action_t *clear = NULL;
 472 
 473     CRM_CHECK(rsc && node && reason && scheduler, return NULL);
 474 
 475     key = pcmk__op_key(rsc->id, PCMK_ACTION_CLEAR_FAILCOUNT, 0);
 476     clear = custom_action(rsc, key, PCMK_ACTION_CLEAR_FAILCOUNT, node, FALSE,
 477                           scheduler);
 478     pcmk__insert_meta(clear, PCMK__META_OP_NO_WAIT, PCMK_VALUE_TRUE);
 479     crm_notice("Clearing failure of %s on %s because %s " QB_XS " %s",
 480                rsc->id, pcmk__node_name(node), reason, clear->uuid);
 481     return clear;
 482 }

/* [previous][next][first][last][top][bottom][index][help] */