root/daemons/controld/controld_te_events.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. fail_incompletable_actions
  2. update_failcount
  3. controld_get_action
  4. get_cancel_action
  5. confirm_cancel_action
  6. match_down_event
  7. process_graph_event

   1 /*
   2  * Copyright 2004-2021 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sys/param.h>
  13 #include <crm/crm.h>
  14 #include <crm/cib.h>
  15 #include <crm/msg_xml.h>
  16 #include <crm/common/xml.h>
  17 
  18 #include <pacemaker-controld.h>
  19 
  20 char *failed_stop_offset = NULL;
  21 char *failed_start_offset = NULL;
  22 
  23 gboolean
  24 fail_incompletable_actions(crm_graph_t * graph, const char *down_node)
     /* [previous][next][first][last][top][bottom][index][help] */
  25 {
  26     const char *target_uuid = NULL;
  27     const char *router = NULL;
  28     const char *router_uuid = NULL;
  29     xmlNode *last_action = NULL;
  30 
  31     GList *gIter = NULL;
  32     GList *gIter2 = NULL;
  33 
  34     if (graph == NULL || graph->complete) {
  35         return FALSE;
  36     }
  37 
  38     gIter = graph->synapses;
  39     for (; gIter != NULL; gIter = gIter->next) {
  40         synapse_t *synapse = (synapse_t *) gIter->data;
  41 
  42         if (synapse->confirmed || synapse->failed) {
  43             /* We've already been here */
  44             continue;
  45         }
  46 
  47         gIter2 = synapse->actions;
  48         for (; gIter2 != NULL; gIter2 = gIter2->next) {
  49             crm_action_t *action = (crm_action_t *) gIter2->data;
  50 
  51             if (action->type == action_type_pseudo || action->confirmed) {
  52                 continue;
  53             } else if (action->type == action_type_crm) {
  54                 const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
  55 
  56                 if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
  57                     continue;
  58                 }
  59             }
  60 
  61             target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
  62             router = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
  63             if (router) {
  64                 crm_node_t *node = crm_get_peer(0, router);
  65                 if (node) {
  66                     router_uuid = node->uuid;
  67                 }
  68             }
  69 
  70             if (pcmk__str_eq(target_uuid, down_node, pcmk__str_casei) || pcmk__str_eq(router_uuid, down_node, pcmk__str_casei)) {
  71                 action->failed = TRUE;
  72                 synapse->failed = TRUE;
  73                 last_action = action->xml;
  74                 stop_te_timer(action->timer);
  75                 update_graph(graph, action);
  76 
  77                 if (synapse->executed) {
  78                     crm_notice("Action %d (%s) was pending on %s (offline)",
  79                                action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node);
  80                 } else {
  81                     crm_info("Action %d (%s) is scheduled for %s (offline)",
  82                              action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node);
  83                 }
  84             }
  85         }
  86     }
  87 
  88     if (last_action != NULL) {
  89         crm_info("Node %s shutdown resulted in un-runnable actions", down_node);
  90         abort_transition(INFINITY, tg_restart, "Node failure", last_action);
  91         return TRUE;
  92     }
  93 
  94     return FALSE;
  95 }
  96 
  97 /*!
  98  * \internal
  99  * \brief Update failure-related node attributes if warranted
 100  *
 101  * \param[in] event            XML describing operation that (maybe) failed
 102  * \param[in] event_node_uuid  Node that event occurred on
 103  * \param[in] rc               Actual operation return code
 104  * \param[in] target_rc        Expected operation return code
 105  * \param[in] do_update        If TRUE, do update regardless of operation type
 106  * \param[in] ignore_failures  If TRUE, update last failure but not fail count
 107  *
 108  * \return TRUE if this was not a direct nack, success or lrm status refresh
 109  */
 110 static gboolean
 111 update_failcount(xmlNode * event, const char *event_node_uuid, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 112                  int target_rc, gboolean do_update, gboolean ignore_failures)
 113 {
 114     guint interval_ms = 0;
 115 
 116     char *task = NULL;
 117     char *rsc_id = NULL;
 118 
 119     const char *value = NULL;
 120     const char *id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
 121     const char *on_uname = crm_peer_uname(event_node_uuid);
 122     const char *origin = crm_element_value(event, XML_ATTR_ORIGIN);
 123 
 124     // Nothing needs to be done for success or status refresh
 125     if (rc == target_rc) {
 126         return FALSE;
 127     } else if (pcmk__str_eq(origin, "build_active_RAs", pcmk__str_casei)) {
 128         crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh",
 129                   id, rc, on_uname);
 130         return FALSE;
 131     }
 132 
 133     /* Sanity check */
 134     CRM_CHECK(on_uname != NULL, return TRUE);
 135     CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval_ms),
 136               crm_err("Couldn't parse: %s", ID(event)); goto bail);
 137 
 138     /* Decide whether update is necessary and what value to use */
 139     if ((interval_ms > 0) || pcmk__str_eq(task, CRMD_ACTION_PROMOTE, pcmk__str_casei)
 140         || pcmk__str_eq(task, CRMD_ACTION_DEMOTE, pcmk__str_casei)) {
 141         do_update = TRUE;
 142 
 143     } else if (pcmk__str_eq(task, CRMD_ACTION_START, pcmk__str_casei)) {
 144         do_update = TRUE;
 145         if (failed_start_offset == NULL) {
 146             failed_start_offset = strdup(CRM_INFINITY_S);
 147         }
 148         value = failed_start_offset;
 149 
 150     } else if (pcmk__str_eq(task, CRMD_ACTION_STOP, pcmk__str_casei)) {
 151         do_update = TRUE;
 152         if (failed_stop_offset == NULL) {
 153             failed_stop_offset = strdup(CRM_INFINITY_S);
 154         }
 155         value = failed_stop_offset;
 156     }
 157 
 158     /* Fail count will be either incremented or set to infinity */
 159     if (!pcmk_str_is_infinity(value)) {
 160         value = XML_NVPAIR_ATTR_VALUE "++";
 161     }
 162 
 163     if (do_update) {
 164         char *now = pcmk__ttoa(time(NULL));
 165         char *attr_name = NULL;
 166         gboolean is_remote_node = FALSE;
 167 
 168         if (g_hash_table_lookup(crm_remote_peer_cache, event_node_uuid)) {
 169             is_remote_node = TRUE;
 170         }
 171 
 172         crm_info("Updating %s for %s on %s after failed %s: rc=%d (update=%s, time=%s)",
 173                  (ignore_failures? "last failure" : "failcount"),
 174                  rsc_id, on_uname, task, rc, value, now);
 175 
 176         /* Update the fail count, if we're not ignoring failures */
 177         if (!ignore_failures) {
 178             attr_name = pcmk__failcount_name(rsc_id, task, interval_ms);
 179             update_attrd(on_uname, attr_name, value, NULL, is_remote_node);
 180             free(attr_name);
 181         }
 182 
 183         /* Update the last failure time (even if we're ignoring failures,
 184          * so that failure can still be detected and shown, e.g. by crm_mon)
 185          */
 186         attr_name = pcmk__lastfailure_name(rsc_id, task, interval_ms);
 187         update_attrd(on_uname, attr_name, now, NULL, is_remote_node);
 188         free(attr_name);
 189 
 190         free(now);
 191     }
 192 
 193   bail:
 194     free(rsc_id);
 195     free(task);
 196     return TRUE;
 197 }
 198 
 199 crm_action_t *
 200 controld_get_action(int id)
     /* [previous][next][first][last][top][bottom][index][help] */
 201 {
 202     for (GList *item = transition_graph->synapses; item; item = item->next) {
 203         synapse_t *synapse = (synapse_t *) item->data;
 204 
 205         for (GList *item2 = synapse->actions; item2; item2 = item2->next) {
 206             crm_action_t *action = (crm_action_t *) item2->data;
 207 
 208             if (action->id == id) {
 209                 return action;
 210             }
 211         }
 212     }
 213     return NULL;
 214 }
 215 
 216 crm_action_t *
 217 get_cancel_action(const char *id, const char *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 218 {
 219     GList *gIter = NULL;
 220     GList *gIter2 = NULL;
 221 
 222     gIter = transition_graph->synapses;
 223     for (; gIter != NULL; gIter = gIter->next) {
 224         synapse_t *synapse = (synapse_t *) gIter->data;
 225 
 226         gIter2 = synapse->actions;
 227         for (; gIter2 != NULL; gIter2 = gIter2->next) {
 228             const char *task = NULL;
 229             const char *target = NULL;
 230             crm_action_t *action = (crm_action_t *) gIter2->data;
 231 
 232             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
 233             if (!pcmk__str_eq(CRMD_ACTION_CANCEL, task, pcmk__str_casei)) {
 234                 continue;
 235             }
 236 
 237             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY);
 238             if (!pcmk__str_eq(task, id, pcmk__str_casei)) {
 239                 crm_trace("Wrong key %s for %s on %s", task, id, node);
 240                 continue;
 241             }
 242 
 243             target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 244             if (node && !pcmk__str_eq(target, node, pcmk__str_casei)) {
 245                 crm_trace("Wrong node %s for %s on %s", target, id, node);
 246                 continue;
 247             }
 248 
 249             crm_trace("Found %s on %s", id, node);
 250             return action;
 251         }
 252     }
 253 
 254     return NULL;
 255 }
 256 
 257 bool
 258 confirm_cancel_action(const char *id, const char *node_id)
     /* [previous][next][first][last][top][bottom][index][help] */
 259 {
 260     const char *op_key = NULL;
 261     const char *node_name = NULL;
 262     crm_action_t *cancel = get_cancel_action(id, node_id);
 263 
 264     if (cancel == NULL) {
 265         return FALSE;
 266     }
 267     op_key = crm_element_value(cancel->xml, XML_LRM_ATTR_TASK_KEY);
 268     node_name = crm_element_value(cancel->xml, XML_LRM_ATTR_TARGET);
 269 
 270     stop_te_timer(cancel->timer);
 271     te_action_confirmed(cancel, transition_graph);
 272 
 273     crm_info("Cancellation of %s on %s confirmed (action %d)",
 274              op_key, node_name, cancel->id);
 275     return TRUE;
 276 }
 277 
 278 /* downed nodes are listed like: <downed> <node id="UUID1" /> ... </downed> */
 279 #define XPATH_DOWNED "//" XML_GRAPH_TAG_DOWNED \
 280                      "/" XML_CIB_TAG_NODE "[@" XML_ATTR_UUID "='%s']"
 281 
 282 /*!
 283  * \brief Find a transition event that would have made a specified node down
 284  *
 285  * \param[in] target  UUID of node to match
 286  *
 287  * \return Matching event if found, NULL otherwise
 288  */
 289 crm_action_t *
 290 match_down_event(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 291 {
 292     crm_action_t *match = NULL;
 293     xmlXPathObjectPtr xpath_ret = NULL;
 294     GList *gIter, *gIter2;
 295 
 296     char *xpath = crm_strdup_printf(XPATH_DOWNED, target);
 297 
 298     for (gIter = transition_graph->synapses;
 299          gIter != NULL && match == NULL;
 300          gIter = gIter->next) {
 301 
 302         for (gIter2 = ((synapse_t*)gIter->data)->actions;
 303              gIter2 != NULL && match == NULL;
 304              gIter2 = gIter2->next) {
 305 
 306             match = (crm_action_t*)gIter2->data;
 307             if (match->executed) {
 308                 xpath_ret = xpath_search(match->xml, xpath);
 309                 if (numXpathResults(xpath_ret) < 1) {
 310                     match = NULL;
 311                 }
 312                 freeXpathObject(xpath_ret);
 313             } else {
 314                 // Only actions that were actually started can match
 315                 match = NULL;
 316             }
 317         }
 318     }
 319 
 320     free(xpath);
 321 
 322     if (match != NULL) {
 323         crm_debug("Shutdown action %d (%s) found for node %s", match->id,
 324                   crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY), target);
 325     } else {
 326         crm_debug("No reason to expect node %s to be down", target);
 327     }
 328     return match;
 329 }
 330 
 331 void
 332 process_graph_event(xmlNode *event, const char *event_node)
     /* [previous][next][first][last][top][bottom][index][help] */
 333 {
 334     int rc = -1;                // Actual result
 335     int target_rc = -1;         // Expected result
 336     int status = -1;            // Executor status
 337     int callid = -1;            // Executor call ID
 338     int transition_num = -1;    // Transition number
 339     int action_num = -1;        // Action number within transition
 340     char *update_te_uuid = NULL;
 341     bool ignore_failures = FALSE;
 342     const char *id = NULL;
 343     const char *desc = NULL;
 344     const char *magic = NULL;
 345     const char *uname = NULL;
 346 
 347     CRM_ASSERT(event != NULL);
 348 
 349 /*
 350 <lrm_rsc_op id="rsc_east-05_last_0" operation_key="rsc_east-05_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" transition-key="9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" transition-magic="0:7;9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" call-id="17" rc-code="7" op-status="0" interval="0" last-rc-change="1355361636" exec-time="128" queue-time="0" op-digest="c81f5f40b1c9e859c992e800b1aa6972"/>
 351 */
 352 
 353     magic = crm_element_value(event, XML_ATTR_TRANSITION_KEY);
 354     if (magic == NULL) {
 355         /* non-change */
 356         return;
 357     }
 358 
 359     crm_element_value_int(event, XML_LRM_ATTR_OPSTATUS, &status);
 360     if (status == PCMK_LRM_OP_PENDING) {
 361         return;
 362     }
 363 
 364     id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
 365     crm_element_value_int(event, XML_LRM_ATTR_RC, &rc);
 366     crm_element_value_int(event, XML_LRM_ATTR_CALLID, &callid);
 367 
 368     rc = pcmk__effective_rc(rc);
 369 
 370     if (decode_transition_key(magic, &update_te_uuid, &transition_num,
 371                               &action_num, &target_rc) == FALSE) {
 372         // decode_transition_key() already logged the bad key
 373         crm_err("Can't process action %s result: Incompatible versions? "
 374                 CRM_XS " call-id=%d", id, callid);
 375         abort_transition(INFINITY, tg_restart, "Bad event", event);
 376         return;
 377     }
 378 
 379     if (transition_num == -1) {
 380         // E.g. crm_resource --fail
 381         desc = "initiated outside of the cluster";
 382         abort_transition(INFINITY, tg_restart, "Unexpected event", event);
 383 
 384     } else if ((action_num < 0) || !pcmk__str_eq(update_te_uuid, te_uuid, pcmk__str_none)) {
 385         desc = "initiated by a different DC";
 386         abort_transition(INFINITY, tg_restart, "Foreign event", event);
 387 
 388     } else if ((transition_graph->id != transition_num)
 389                || (transition_graph->complete)) {
 390 
 391         // Action is not from currently active transition
 392 
 393         guint interval_ms = 0;
 394 
 395         if (parse_op_key(id, NULL, NULL, &interval_ms)
 396             && (interval_ms != 0)) {
 397             /* Recurring actions have the transition number they were first
 398              * scheduled in.
 399              */
 400 
 401             if (status == PCMK_LRM_OP_CANCELLED) {
 402                 confirm_cancel_action(id, get_node_id(event));
 403                 goto bail;
 404             }
 405 
 406             desc = "arrived after initial scheduling";
 407             abort_transition(INFINITY, tg_restart, "Change in recurring result",
 408                              event);
 409 
 410         } else if (transition_graph->id != transition_num) {
 411             desc = "arrived really late";
 412             abort_transition(INFINITY, tg_restart, "Old event", event);
 413         } else {
 414             desc = "arrived late";
 415             abort_transition(INFINITY, tg_restart, "Inactive graph", event);
 416         }
 417 
 418     } else {
 419         // Event is result of an action from currently active transition
 420         crm_action_t *action = controld_get_action(action_num);
 421 
 422         if (action == NULL) {
 423             // Should never happen
 424             desc = "unknown";
 425             abort_transition(INFINITY, tg_restart, "Unknown event", event);
 426 
 427         } else if (action->confirmed == TRUE) {
 428             /* Nothing further needs to be done if the action has already been
 429              * confirmed. This can happen e.g. when processing both an
 430              * "xxx_last_0" or "xxx_last_failure_0" record as well as the main
 431              * history record, which would otherwise result in incorrectly
 432              * bumping the fail count twice.
 433              */
 434             crm_log_xml_debug(event, "Event already confirmed:");
 435             goto bail;
 436 
 437         } else {
 438             /* An action result needs to be confirmed.
 439              * (This is the only case where desc == NULL.)
 440              */
 441 
 442             if (pcmk__str_eq(crm_meta_value(action->params, XML_OP_ATTR_ON_FAIL), "ignore", pcmk__str_casei)) {
 443                 ignore_failures = TRUE;
 444 
 445             } else if (rc != target_rc) {
 446                 action->failed = TRUE;
 447             }
 448 
 449             stop_te_timer(action->timer);
 450             te_action_confirmed(action, transition_graph);
 451 
 452             if (action->failed) {
 453                 abort_transition(action->synapse->priority + 1, tg_restart,
 454                                  "Event failed", event);
 455             }
 456         }
 457     }
 458 
 459     if (id == NULL) {
 460         id = "unknown action";
 461     }
 462     uname = crm_element_value(event, XML_LRM_ATTR_TARGET);
 463     if (uname == NULL) {
 464         uname = "unknown node";
 465     }
 466 
 467     if (status == PCMK_LRM_OP_INVALID) {
 468         // We couldn't attempt the action
 469         crm_info("Transition %d action %d (%s on %s): %s",
 470                  transition_num, action_num, id, uname,
 471                  services_lrm_status_str(status));
 472 
 473     } else if (desc && update_failcount(event, event_node, rc, target_rc,
 474                                         (transition_num == -1), FALSE)) {
 475         crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' "
 476                    CRM_XS " target-rc=%d rc=%d call-id=%d event='%s'",
 477                    transition_num, action_num, id, uname,
 478                    services_ocf_exitcode_str(target_rc),
 479                    services_ocf_exitcode_str(rc),
 480                    target_rc, rc, callid, desc);
 481 
 482     } else if (desc) {
 483         crm_info("Transition %d action %d (%s on %s): %s "
 484                  CRM_XS " rc=%d target-rc=%d call-id=%d",
 485                  transition_num, action_num, id, uname,
 486                  desc, rc, target_rc, callid);
 487 
 488     } else if (rc == target_rc) {
 489         crm_info("Transition %d action %d (%s on %s) confirmed: %s "
 490                  CRM_XS " rc=%d call-id=%d",
 491                  transition_num, action_num, id, uname,
 492                  services_ocf_exitcode_str(rc), rc, callid);
 493 
 494     } else {
 495         update_failcount(event, event_node, rc, target_rc,
 496                          (transition_num == -1), ignore_failures);
 497         crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' "
 498                    CRM_XS " target-rc=%d rc=%d call-id=%d",
 499                    transition_num, action_num, id, uname,
 500                    services_ocf_exitcode_str(target_rc),
 501                    services_ocf_exitcode_str(rc),
 502                    target_rc, rc, callid);
 503     }
 504 
 505   bail:
 506     free(update_te_uuid);
 507 }

/* [previous][next][first][last][top][bottom][index][help] */