root/lib/pengine/failcounts.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. is_matched_failure
  2. block_failure
  3. rsc_fail_name
  4. generate_fail_regex
  5. generate_fail_regexes
  6. pe_get_failcount

   1 /*
   2  * Copyright (C) 2008-2017 Andrew Beekhof <andrew@beekhof.net>
   3  *
   4  * This source code is licensed under the GNU Lesser General Public License
   5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   6  */
   7 
   8 #include <crm_internal.h>
   9 
  10 #include <sys/types.h>
  11 #include <regex.h>
  12 #include <glib.h>
  13 
  14 #include <crm/crm.h>
  15 #include <crm/msg_xml.h>
  16 #include <crm/common/xml.h>
  17 #include <crm/common/util.h>
  18 #include <crm/pengine/internal.h>
  19 
  20 static gboolean
  21 is_matched_failure(const char *rsc_id, xmlNode *conf_op_xml,
     /* [previous][next][first][last][top][bottom][index][help] */
  22                    xmlNode *lrm_op_xml)
  23 {
  24     gboolean matched = FALSE;
  25     const char *conf_op_name = NULL;
  26     int conf_op_interval = 0;
  27     const char *lrm_op_task = NULL;
  28     int lrm_op_interval = 0;
  29     const char *lrm_op_id = NULL;
  30     char *last_failure_key = NULL;
  31 
  32     if (rsc_id == NULL || conf_op_xml == NULL || lrm_op_xml == NULL) {
  33         return FALSE;
  34     }
  35 
  36     conf_op_name = crm_element_value(conf_op_xml, "name");
  37     conf_op_interval = crm_get_msec(crm_element_value(conf_op_xml, "interval"));
  38     lrm_op_task = crm_element_value(lrm_op_xml, XML_LRM_ATTR_TASK);
  39     crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_INTERVAL, &lrm_op_interval);
  40 
  41     if (safe_str_eq(conf_op_name, lrm_op_task) == FALSE
  42         || conf_op_interval != lrm_op_interval) {
  43         return FALSE;
  44     }
  45 
  46     lrm_op_id = ID(lrm_op_xml);
  47     last_failure_key = generate_op_key(rsc_id, "last_failure", 0);
  48 
  49     if (safe_str_eq(last_failure_key, lrm_op_id)) {
  50         matched = TRUE;
  51 
  52     } else {
  53         char *expected_op_key = generate_op_key(rsc_id, conf_op_name,
  54                                                 conf_op_interval);
  55 
  56         if (safe_str_eq(expected_op_key, lrm_op_id)) {
  57             int rc = 0;
  58             int target_rc = get_target_rc(lrm_op_xml);
  59 
  60             crm_element_value_int(lrm_op_xml, XML_LRM_ATTR_RC, &rc);
  61             if (rc != target_rc) {
  62                 matched = TRUE;
  63             }
  64         }
  65         free(expected_op_key);
  66     }
  67 
  68     free(last_failure_key);
  69     return matched;
  70 }
  71 
  72 static gboolean
  73 block_failure(node_t *node, resource_t *rsc, xmlNode *xml_op,
     /* [previous][next][first][last][top][bottom][index][help] */
  74               pe_working_set_t *data_set)
  75 {
  76     char *xml_name = clone_strip(rsc->id);
  77     char *xpath = crm_strdup_printf("//primitive[@id='%s']//op[@on-fail='block']",
  78                                     xml_name);
  79     xmlXPathObject *xpathObj = xpath_search(rsc->xml, xpath);
  80     gboolean should_block = FALSE;
  81 
  82     free(xpath);
  83 
  84 #if 0
  85     /* A good idea? */
  86     if (rsc->container == NULL && is_not_set(data_set->flags, pe_flag_stonith_enabled)) {
  87         /* In this case, stop on-fail defaults to block in unpack_operation() */
  88         return TRUE;
  89     }
  90 #endif
  91 
  92     if (xpathObj) {
  93         int max = numXpathResults(xpathObj);
  94         int lpc = 0;
  95 
  96         for (lpc = 0; lpc < max; lpc++) {
  97             xmlNode *pref = getXpathResult(xpathObj, lpc);
  98 
  99             if (xml_op) {
 100                 should_block = is_matched_failure(xml_name, pref, xml_op);
 101                 if (should_block) {
 102                     break;
 103                 }
 104 
 105             } else {
 106                 const char *conf_op_name = NULL;
 107                 int conf_op_interval = 0;
 108                 char *lrm_op_xpath = NULL;
 109                 xmlXPathObject *lrm_op_xpathObj = NULL;
 110 
 111                 conf_op_name = crm_element_value(pref, "name");
 112                 conf_op_interval = crm_get_msec(crm_element_value(pref, "interval"));
 113 
 114                 lrm_op_xpath = crm_strdup_printf("//node_state[@uname='%s']"
 115                                                "//lrm_resource[@id='%s']"
 116                                                "/lrm_rsc_op[@operation='%s'][@interval='%d']",
 117                                                node->details->uname, xml_name,
 118                                                conf_op_name, conf_op_interval);
 119                 lrm_op_xpathObj = xpath_search(data_set->input, lrm_op_xpath);
 120 
 121                 free(lrm_op_xpath);
 122 
 123                 if (lrm_op_xpathObj) {
 124                     int max2 = numXpathResults(lrm_op_xpathObj);
 125                     int lpc2 = 0;
 126 
 127                     for (lpc2 = 0; lpc2 < max2; lpc2++) {
 128                         xmlNode *lrm_op_xml = getXpathResult(lrm_op_xpathObj,
 129                                                              lpc2);
 130 
 131                         should_block = is_matched_failure(xml_name, pref,
 132                                                           lrm_op_xml);
 133                         if (should_block) {
 134                             break;
 135                         }
 136                     }
 137                 }
 138                 freeXpathObject(lrm_op_xpathObj);
 139 
 140                 if (should_block) {
 141                     break;
 142                 }
 143             }
 144         }
 145     }
 146 
 147     free(xml_name);
 148     freeXpathObject(xpathObj);
 149 
 150     return should_block;
 151 }
 152 
 153 /*!
 154  * \internal
 155  * \brief Get resource name as used in failure-related node attributes
 156  *
 157  * \param[in] rsc  Resource to check
 158  *
 159  * \return Newly allocated string containing resource's fail name
 160  * \note The caller is responsible for freeing the result.
 161  */
 162 static inline char *
 163 rsc_fail_name(resource_t *rsc)
     /* [previous][next][first][last][top][bottom][index][help] */
 164 {
 165     const char *name = (rsc->clone_name? rsc->clone_name : rsc->id);
 166 
 167     return is_set(rsc->flags, pe_rsc_unique)? strdup(name) : clone_strip(name);
 168 }
 169 
 170 /*!
 171  * \internal
 172  * \brief Compile regular expression to match a failure-related node attribute
 173  *
 174  * \param[in]  prefix    Attribute prefix to match
 175  * \param[in]  rsc_name  Resource name to match as used in failure attributes
 176  * \param[in]  is_legacy Whether DC uses per-resource fail counts
 177  * \param[in]  is_unique Whether the resource is a globally unique clone
 178  * \param[out] re        Where to store resulting regular expression
 179  *
 180  * \note Fail attributes are named like PREFIX-RESOURCE#OP_INTERVAL.
 181  *       The caller is responsible for freeing re with regfree().
 182  */
 183 static void
 184 generate_fail_regex(const char *prefix, const char *rsc_name,
     /* [previous][next][first][last][top][bottom][index][help] */
 185                     gboolean is_legacy, gboolean is_unique, regex_t *re)
 186 {
 187     char *pattern;
 188 
 189     /* @COMPAT DC < 1.1.17: Fail counts used to be per-resource rather than
 190      * per-operation.
 191      */
 192     const char *op_pattern = (is_legacy? "" : "#.+_[0-9]+");
 193 
 194     /* Ignore instance numbers for anything other than globally unique clones.
 195      * Anonymous clone fail counts could contain an instance number if the
 196      * clone was initially unique, failed, then was converted to anonymous.
 197      * @COMPAT Also, before 1.1.8, anonymous clone fail counts always contained
 198      * clone instance numbers.
 199      */
 200     const char *instance_pattern = (is_unique? "" : "(:[0-9]+)?");
 201 
 202     pattern = crm_strdup_printf("^%s-%s%s%s$", prefix, rsc_name,
 203                                 instance_pattern, op_pattern);
 204     CRM_LOG_ASSERT(regcomp(re, pattern, REG_EXTENDED|REG_NOSUB) == 0);
 205     free(pattern);
 206 }
 207 
 208 /*!
 209  * \internal
 210  * \brief Compile regular expressions to match failure-related node attributes
 211  *
 212  * \param[in]  rsc             Resource being checked for failures
 213  * \param[in]  data_set        Data set (for CRM feature set version)
 214  * \param[out] failcount_re    Storage for regular expression for fail count
 215  * \param[out] lastfailure_re  Storage for regular expression for last failure
 216  *
 217  * \note The caller is responsible for freeing the expressions with regfree().
 218  */
 219 static void
 220 generate_fail_regexes(resource_t *rsc, pe_working_set_t *data_set,
     /* [previous][next][first][last][top][bottom][index][help] */
 221                       regex_t *failcount_re, regex_t *lastfailure_re)
 222 {
 223     char *rsc_name = rsc_fail_name(rsc);
 224     const char *version = crm_element_value(data_set->input, XML_ATTR_CRM_VERSION);
 225     gboolean is_legacy = (compare_version(version, "3.0.13") < 0);
 226 
 227     generate_fail_regex(CRM_FAIL_COUNT_PREFIX, rsc_name, is_legacy,
 228                         is_set(rsc->flags, pe_rsc_unique), failcount_re);
 229 
 230     generate_fail_regex(CRM_LAST_FAILURE_PREFIX, rsc_name, is_legacy,
 231                         is_set(rsc->flags, pe_rsc_unique), lastfailure_re);
 232 
 233     free(rsc_name);
 234 }
 235 
 236 int
 237 pe_get_failcount(node_t *node, resource_t *rsc, time_t *last_failure,
     /* [previous][next][first][last][top][bottom][index][help] */
 238                  uint32_t flags, xmlNode *xml_op, pe_working_set_t *data_set)
 239 {
 240     char *key = NULL;
 241     const char *value = NULL;
 242     regex_t failcount_re, lastfailure_re;
 243     int failcount = 0;
 244     time_t last = 0;
 245     GHashTableIter iter;
 246 
 247     generate_fail_regexes(rsc, data_set, &failcount_re, &lastfailure_re);
 248 
 249     /* Resource fail count is sum of all matching operation fail counts */
 250     g_hash_table_iter_init(&iter, node->details->attrs);
 251     while (g_hash_table_iter_next(&iter, (gpointer *) &key, (gpointer *) &value)) {
 252         if (regexec(&failcount_re, key, 0, NULL, 0) == 0) {
 253             failcount = merge_weights(failcount, char2score(value));
 254         } else if (regexec(&lastfailure_re, key, 0, NULL, 0) == 0) {
 255             last = QB_MAX(last, crm_int_helper(value, NULL));
 256         }
 257     }
 258 
 259     regfree(&failcount_re);
 260     regfree(&lastfailure_re);
 261 
 262     if ((failcount > 0) && (last > 0) && (last_failure != NULL)) {
 263         *last_failure = last;
 264     }
 265 
 266     /* If failure blocks the resource, disregard any failure timeout */
 267     if ((failcount > 0) && rsc->failure_timeout
 268         && block_failure(node, rsc, xml_op, data_set)) {
 269 
 270         pe_warn("Ignoring failure timeout %d for %s because it conflicts with on-fail=block",
 271                 rsc->id, rsc->failure_timeout);
 272         rsc->failure_timeout = 0;
 273     }
 274 
 275     /* If all failures have expired, ignore fail count */
 276     if (is_set(flags, pe_fc_effective) && (failcount > 0) && (last > 0)
 277         && rsc->failure_timeout) {
 278 
 279         time_t now = get_effective_time(data_set);
 280 
 281         if (now > (last + rsc->failure_timeout)) {
 282             crm_debug("Failcount for %s on %s expired after %ds",
 283                       rsc->id, node->details->uname, rsc->failure_timeout);
 284             failcount = 0;
 285         }
 286     }
 287 
 288     if (is_set(flags, pe_fc_fillers) && rsc->fillers) {
 289         GListPtr gIter = NULL;
 290 
 291         for (gIter = rsc->fillers; gIter != NULL; gIter = gIter->next) {
 292             resource_t *filler = (resource_t *) gIter->data;
 293             time_t filler_last_failure = 0;
 294 
 295             failcount += pe_get_failcount(node, filler, &filler_last_failure,
 296                                           flags, xml_op, data_set);
 297 
 298             if (last_failure && filler_last_failure > *last_failure) {
 299                 *last_failure = filler_last_failure;
 300             }
 301         }
 302 
 303         if (failcount > 0) {
 304             char *score = score2char(failcount);
 305 
 306             crm_info("Container %s and the resources within it have failed %s times on %s",
 307                      rsc->id, score, node->details->uname);
 308             free(score);
 309         }
 310 
 311     } else if (failcount > 0) {
 312         char *score = score2char(failcount);
 313 
 314         crm_info("%s has failed %s times on %s",
 315                  rsc->id, score, node->details->uname);
 316         free(score);
 317     }
 318 
 319 
 320     return failcount;
 321 }

/* [previous][next][first][last][top][bottom][index][help] */