root/daemons/controld/controld_te_events.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. fail_incompletable_actions
  2. update_failcount
  3. controld_get_action
  4. get_cancel_action
  5. confirm_cancel_action
  6. match_down_event
  7. process_graph_event

   1 /*
   2  * Copyright 2004-2022 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sys/param.h>
  13 #include <crm/crm.h>
  14 #include <crm/cib.h>
  15 #include <crm/msg_xml.h>
  16 #include <crm/common/xml.h>
  17 
  18 #include <pacemaker-controld.h>
  19 
  20 #include <crm/common/attrd_internal.h>
  21 #include <crm/common/ipc_attrd_internal.h>
  22 
  23 char *failed_stop_offset = NULL;    // Candidate fail-count value after a failed stop; update_failcount() sets it to CRM_INFINITY_S if still NULL and uses it only when it is an infinity string
  24 char *failed_start_offset = NULL;   // Candidate fail-count value after a failed start; update_failcount() sets it to CRM_INFINITY_S if still NULL and uses it only when it is an infinity string
  25 
  26 gboolean
  27 fail_incompletable_actions(pcmk__graph_t *graph, const char *down_node)
     /* [previous][next][first][last][top][bottom][index][help] */
  28 {
  29     const char *target_uuid = NULL;
  30     const char *router = NULL;
  31     const char *router_uuid = NULL;
  32     xmlNode *last_action = NULL;
  33 
  34     GList *gIter = NULL;
  35     GList *gIter2 = NULL;
  36 
  37     if (graph == NULL || graph->complete) {
  38         return FALSE;
  39     }
  40 
  41     gIter = graph->synapses;
  42     for (; gIter != NULL; gIter = gIter->next) {
  43         pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) gIter->data;
  44 
  45         if (pcmk_any_flags_set(synapse->flags, pcmk__synapse_confirmed|pcmk__synapse_failed)) {
  46             /* We've already been here */
  47             continue;
  48         }
  49 
  50         gIter2 = synapse->actions;
  51         for (; gIter2 != NULL; gIter2 = gIter2->next) {
  52             pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data;
  53 
  54             if ((action->type == pcmk__pseudo_graph_action)
  55                 || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
  56                 continue;
  57             } else if (action->type == pcmk__cluster_graph_action) {
  58                 const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
  59 
  60                 if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
  61                     continue;
  62                 }
  63             }
  64 
  65             target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
  66             router = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
  67             if (router) {
  68                 crm_node_t *node = crm_get_peer(0, router);
  69                 if (node) {
  70                     router_uuid = node->uuid;
  71                 }
  72             }
  73 
  74             if (pcmk__str_eq(target_uuid, down_node, pcmk__str_casei) || pcmk__str_eq(router_uuid, down_node, pcmk__str_casei)) {
  75                 pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
  76                 pcmk__set_synapse_flags(synapse, pcmk__synapse_failed);
  77                 last_action = action->xml;
  78                 stop_te_timer(action);
  79                 pcmk__update_graph(graph, action);
  80 
  81                 if (pcmk_is_set(synapse->flags, pcmk__synapse_executed)) {
  82                     crm_notice("Action %d (%s) was pending on %s (offline)",
  83                                action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node);
  84                 } else {
  85                     crm_info("Action %d (%s) is scheduled for %s (offline)",
  86                              action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node);
  87                 }
  88             }
  89         }
  90     }
  91 
  92     if (last_action != NULL) {
  93         crm_info("Node %s shutdown resulted in un-runnable actions", down_node);
  94         abort_transition(INFINITY, pcmk__graph_restart, "Node failure",
  95                          last_action);
  96         return TRUE;
  97     }
  98 
  99     return FALSE;
 100 }
 101 
 102 /*!
 103  * \internal
 104  * \brief Update failure-related node attributes if warranted
 105  *
 106  * \param[in] event            XML describing operation that (maybe) failed
 107  * \param[in] event_node_uuid  Node that event occurred on
 108  * \param[in] rc               Actual operation return code
 109  * \param[in] target_rc        Expected operation return code
 110  * \param[in] do_update        If TRUE, do update regardless of operation type
 111  * \param[in] ignore_failures  If TRUE, update last failure but not fail count
 112  *
 113  * \return TRUE if this was not a direct nack, success or lrm status refresh
 114  */
 115 static gboolean
 116 update_failcount(const xmlNode *event, const char *event_node_uuid, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 117                  int target_rc, gboolean do_update, gboolean ignore_failures)
 118 {
 119     guint interval_ms = 0;
 120 
 121     char *task = NULL;
 122     char *rsc_id = NULL;
 123 
 124     const char *value = NULL;
 125     const char *id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
 126     const char *on_uname = crm_peer_uname(event_node_uuid);
 127     const char *origin = crm_element_value(event, XML_ATTR_ORIGIN);
 128 
 129     // Nothing needs to be done for success or status refresh
 130     if (rc == target_rc) {
 131         return FALSE;
 132     } else if (pcmk__str_eq(origin, "build_active_RAs", pcmk__str_casei)) {
 133         crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh",
 134                   id, rc, on_uname);
 135         return FALSE;
 136     }
 137 
 138     /* Sanity check */
 139     CRM_CHECK(on_uname != NULL, return TRUE);
 140     CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval_ms),
 141               crm_err("Couldn't parse: %s", ID(event)); goto bail);
 142 
 143     /* Decide whether update is necessary and what value to use */
 144     if ((interval_ms > 0) || pcmk__str_eq(task, CRMD_ACTION_PROMOTE, pcmk__str_casei)
 145         || pcmk__str_eq(task, CRMD_ACTION_DEMOTE, pcmk__str_casei)) {
 146         do_update = TRUE;
 147 
 148     } else if (pcmk__str_eq(task, CRMD_ACTION_START, pcmk__str_casei)) {
 149         do_update = TRUE;
 150         if (failed_start_offset == NULL) {
 151             failed_start_offset = strdup(CRM_INFINITY_S);
 152         }
 153         value = failed_start_offset;
 154 
 155     } else if (pcmk__str_eq(task, CRMD_ACTION_STOP, pcmk__str_casei)) {
 156         do_update = TRUE;
 157         if (failed_stop_offset == NULL) {
 158             failed_stop_offset = strdup(CRM_INFINITY_S);
 159         }
 160         value = failed_stop_offset;
 161     }
 162 
 163     /* Fail count will be either incremented or set to infinity */
 164     if (!pcmk_str_is_infinity(value)) {
 165         value = XML_NVPAIR_ATTR_VALUE "++";
 166     }
 167 
 168     if (do_update) {
 169         pcmk__attrd_query_pair_t *fail_pair = NULL;
 170         pcmk__attrd_query_pair_t *last_pair = NULL;
 171         char *fail_name = NULL;
 172         char *last_name = NULL;
 173         GList *attrs = NULL;
 174 
 175         uint32_t opts = pcmk__node_attr_none;
 176 
 177         char *now = pcmk__ttoa(time(NULL));
 178 
 179         if (g_hash_table_lookup(crm_remote_peer_cache, event_node_uuid)) {
 180             opts |= pcmk__node_attr_remote;
 181         }
 182 
 183         crm_info("Updating %s for %s on %s after failed %s: rc=%d (update=%s, time=%s)",
 184                  (ignore_failures? "last failure" : "failcount"),
 185                  rsc_id, on_uname, task, rc, value, now);
 186 
 187         /* Update the fail count, if we're not ignoring failures */
 188         if (!ignore_failures) {
 189             fail_pair = calloc(1, sizeof(pcmk__attrd_query_pair_t));
 190             CRM_ASSERT(fail_pair != NULL);
 191 
 192             fail_name = pcmk__failcount_name(rsc_id, task, interval_ms);
 193             fail_pair->name = fail_name;
 194             fail_pair->value = value;
 195             fail_pair->node = on_uname;
 196 
 197             attrs = g_list_prepend(attrs, fail_pair);
 198         }
 199 
 200         /* Update the last failure time (even if we're ignoring failures,
 201          * so that failure can still be detected and shown, e.g. by crm_mon)
 202          */
 203         last_pair = calloc(1, sizeof(pcmk__attrd_query_pair_t));
 204         CRM_ASSERT(last_pair != NULL);
 205 
 206         last_name = pcmk__lastfailure_name(rsc_id, task, interval_ms);
 207         last_pair->name = last_name;
 208         last_pair->value = now;
 209         last_pair->node = on_uname;
 210 
 211         attrs = g_list_prepend(attrs, last_pair);
 212 
 213         update_attrd_list(attrs, opts);
 214 
 215         if (!ignore_failures) {
 216             free(fail_name);
 217             free(fail_pair);
 218         }
 219 
 220         free(last_name);
 221         free(last_pair);
 222         g_list_free(attrs);
 223 
 224         free(now);
 225     }
 226 
 227   bail:
 228     free(rsc_id);
 229     free(task);
 230     return TRUE;
 231 }
 232 
 233 pcmk__graph_action_t *
 234 controld_get_action(int id)
     /* [previous][next][first][last][top][bottom][index][help] */
 235 {
 236     for (GList *item = transition_graph->synapses; item; item = item->next) {
 237         pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) item->data;
 238 
 239         for (GList *item2 = synapse->actions; item2; item2 = item2->next) {
 240             pcmk__graph_action_t *action = (pcmk__graph_action_t *) item2->data;
 241 
 242             if (action->id == id) {
 243                 return action;
 244             }
 245         }
 246     }
 247     return NULL;
 248 }
 249 
 250 pcmk__graph_action_t *
 251 get_cancel_action(const char *id, const char *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 252 {
 253     GList *gIter = NULL;
 254     GList *gIter2 = NULL;
 255 
 256     gIter = transition_graph->synapses;
 257     for (; gIter != NULL; gIter = gIter->next) {
 258         pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) gIter->data;
 259 
 260         gIter2 = synapse->actions;
 261         for (; gIter2 != NULL; gIter2 = gIter2->next) {
 262             const char *task = NULL;
 263             const char *target = NULL;
 264             pcmk__graph_action_t *action = (pcmk__graph_action_t *) gIter2->data;
 265 
 266             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
 267             if (!pcmk__str_eq(CRMD_ACTION_CANCEL, task, pcmk__str_casei)) {
 268                 continue;
 269             }
 270 
 271             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY);
 272             if (!pcmk__str_eq(task, id, pcmk__str_casei)) {
 273                 crm_trace("Wrong key %s for %s on %s", task, id, node);
 274                 continue;
 275             }
 276 
 277             target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 278             if (node && !pcmk__str_eq(target, node, pcmk__str_casei)) {
 279                 crm_trace("Wrong node %s for %s on %s", target, id, node);
 280                 continue;
 281             }
 282 
 283             crm_trace("Found %s on %s", id, node);
 284             return action;
 285         }
 286     }
 287 
 288     return NULL;
 289 }
 290 
 291 bool
 292 confirm_cancel_action(const char *id, const char *node_id)
     /* [previous][next][first][last][top][bottom][index][help] */
 293 {
 294     const char *op_key = NULL;
 295     const char *node_name = NULL;
 296     pcmk__graph_action_t *cancel = get_cancel_action(id, node_id);
 297 
 298     if (cancel == NULL) {
 299         return FALSE;
 300     }
 301     op_key = crm_element_value(cancel->xml, XML_LRM_ATTR_TASK_KEY);
 302     node_name = crm_element_value(cancel->xml, XML_LRM_ATTR_TARGET);
 303 
 304     stop_te_timer(cancel);
 305     te_action_confirmed(cancel, transition_graph);
 306 
 307     crm_info("Cancellation of %s on %s confirmed (action %d)",
 308              op_key, node_name, cancel->id);
 309     return TRUE;
 310 }
 311 
 312 /* downed nodes are listed like: <downed> <node id="UUID1" /> ... </downed> */
 313 #define XPATH_DOWNED "//" XML_GRAPH_TAG_DOWNED \
 314                      "/" XML_CIB_TAG_NODE "[@" XML_ATTR_UUID "='%s']"
 315 
 316 /*!
 317  * \brief Find a transition event that would have made a specified node down
 318  *
 319  * \param[in] target  UUID of node to match
 320  *
 321  * \return Matching event if found, NULL otherwise
 322  */
 323 pcmk__graph_action_t *
 324 match_down_event(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 325 {
 326     pcmk__graph_action_t *match = NULL;
 327     xmlXPathObjectPtr xpath_ret = NULL;
 328     GList *gIter, *gIter2;
 329 
 330     char *xpath = crm_strdup_printf(XPATH_DOWNED, target);
 331 
 332     for (gIter = transition_graph->synapses;
 333          gIter != NULL && match == NULL;
 334          gIter = gIter->next) {
 335 
 336         for (gIter2 = ((pcmk__graph_synapse_t * ) gIter->data)->actions;
 337              gIter2 != NULL && match == NULL;
 338              gIter2 = gIter2->next) {
 339 
 340             match = (pcmk__graph_action_t *) gIter2->data;
 341             if (pcmk_is_set(match->flags, pcmk__graph_action_executed)) {
 342                 xpath_ret = xpath_search(match->xml, xpath);
 343                 if (numXpathResults(xpath_ret) < 1) {
 344                     match = NULL;
 345                 }
 346                 freeXpathObject(xpath_ret);
 347             } else {
 348                 // Only actions that were actually started can match
 349                 match = NULL;
 350             }
 351         }
 352     }
 353 
 354     free(xpath);
 355 
 356     if (match != NULL) {
 357         crm_debug("Shutdown action %d (%s) found for node %s", match->id,
 358                   crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY), target);
 359     } else {
 360         crm_debug("No reason to expect node %s to be down", target);
 361     }
 362     return match;
 363 }
 364 
 365 void
 366 process_graph_event(xmlNode *event, const char *event_node)
     /* [previous][next][first][last][top][bottom][index][help] */
 367 {
 368     int rc = -1;                // Actual result
 369     int target_rc = -1;         // Expected result
 370     int status = -1;            // Executor status
 371     int callid = -1;            // Executor call ID
 372     int transition_num = -1;    // Transition number
 373     int action_num = -1;        // Action number within transition
 374     char *update_te_uuid = NULL;
 375     bool ignore_failures = FALSE;
 376     const char *id = NULL;
 377     const char *desc = NULL;
 378     const char *magic = NULL;
 379     const char *uname = NULL;
 380 
 381     CRM_ASSERT(event != NULL);
 382 
 383 /*
 384 <lrm_rsc_op id="rsc_east-05_last_0" operation_key="rsc_east-05_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" transition-key="9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" transition-magic="0:7;9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" call-id="17" rc-code="7" op-status="0" interval="0" last-rc-change="1355361636" exec-time="128" queue-time="0" op-digest="c81f5f40b1c9e859c992e800b1aa6972"/>
 385 */
 386 
 387     magic = crm_element_value(event, XML_ATTR_TRANSITION_KEY);
 388     if (magic == NULL) {
 389         /* non-change */
 390         return;
 391     }
 392 
 393     crm_element_value_int(event, XML_LRM_ATTR_OPSTATUS, &status);
 394     if (status == PCMK_EXEC_PENDING) {
 395         return;
 396     }
 397 
 398     id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
 399     crm_element_value_int(event, XML_LRM_ATTR_RC, &rc);
 400     crm_element_value_int(event, XML_LRM_ATTR_CALLID, &callid);
 401 
 402     rc = pcmk__effective_rc(rc);
 403 
 404     if (decode_transition_key(magic, &update_te_uuid, &transition_num,
 405                               &action_num, &target_rc) == FALSE) {
 406         // decode_transition_key() already logged the bad key
 407         crm_err("Can't process action %s result: Incompatible versions? "
 408                 CRM_XS " call-id=%d", id, callid);
 409         abort_transition(INFINITY, pcmk__graph_restart, "Bad event", event);
 410         return;
 411     }
 412 
 413     if (transition_num == -1) {
 414         // E.g. crm_resource --fail
 415         desc = "initiated outside of the cluster";
 416         abort_transition(INFINITY, pcmk__graph_restart, "Unexpected event",
 417                          event);
 418 
 419     } else if ((action_num < 0) || !pcmk__str_eq(update_te_uuid, te_uuid, pcmk__str_none)) {
 420         desc = "initiated by a different DC";
 421         abort_transition(INFINITY, pcmk__graph_restart, "Foreign event", event);
 422 
 423     } else if ((transition_graph->id != transition_num)
 424                || transition_graph->complete) {
 425 
 426         // Action is not from currently active transition
 427 
 428         guint interval_ms = 0;
 429 
 430         if (parse_op_key(id, NULL, NULL, &interval_ms)
 431             && (interval_ms != 0)) {
 432             /* Recurring actions have the transition number they were first
 433              * scheduled in.
 434              */
 435 
 436             if (status == PCMK_EXEC_CANCELLED) {
 437                 confirm_cancel_action(id, get_node_id(event));
 438                 goto bail;
 439             }
 440 
 441             desc = "arrived after initial scheduling";
 442             abort_transition(INFINITY, pcmk__graph_restart,
 443                              "Change in recurring result", event);
 444 
 445         } else if (transition_graph->id != transition_num) {
 446             desc = "arrived really late";
 447             abort_transition(INFINITY, pcmk__graph_restart, "Old event", event);
 448         } else {
 449             desc = "arrived late";
 450             abort_transition(INFINITY, pcmk__graph_restart, "Inactive graph",
 451                              event);
 452         }
 453 
 454     } else {
 455         // Event is result of an action from currently active transition
 456         pcmk__graph_action_t *action = controld_get_action(action_num);
 457 
 458         if (action == NULL) {
 459             // Should never happen
 460             desc = "unknown";
 461             abort_transition(INFINITY, pcmk__graph_restart, "Unknown event",
 462                              event);
 463 
 464         } else if (pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
 465             /* Nothing further needs to be done if the action has already been
 466              * confirmed. This can happen e.g. when processing both an
 467              * "xxx_last_0" or "xxx_last_failure_0" record as well as the main
 468              * history record, which would otherwise result in incorrectly
 469              * bumping the fail count twice.
 470              */
 471             crm_log_xml_debug(event, "Event already confirmed:");
 472             goto bail;
 473 
 474         } else {
 475             /* An action result needs to be confirmed.
 476              * (This is the only case where desc == NULL.)
 477              */
 478 
 479             if (pcmk__str_eq(crm_meta_value(action->params, XML_OP_ATTR_ON_FAIL), "ignore", pcmk__str_casei)) {
 480                 ignore_failures = TRUE;
 481 
 482             } else if (rc != target_rc) {
 483                 pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
 484             }
 485 
 486             stop_te_timer(action);
 487             te_action_confirmed(action, transition_graph);
 488 
 489             if (pcmk_is_set(action->flags, pcmk__graph_action_failed)) {
 490                 abort_transition(action->synapse->priority + 1,
 491                                  pcmk__graph_restart, "Event failed", event);
 492             }
 493         }
 494     }
 495 
 496     if (id == NULL) {
 497         id = "unknown action";
 498     }
 499     uname = crm_element_value(event, XML_LRM_ATTR_TARGET);
 500     if (uname == NULL) {
 501         uname = "unknown node";
 502     }
 503 
 504     if (status == PCMK_EXEC_INVALID) {
 505         // We couldn't attempt the action
 506         crm_info("Transition %d action %d (%s on %s): %s",
 507                  transition_num, action_num, id, uname,
 508                  pcmk_exec_status_str(status));
 509 
 510     } else if (desc && update_failcount(event, event_node, rc, target_rc,
 511                                         (transition_num == -1), FALSE)) {
 512         crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' "
 513                    CRM_XS " target-rc=%d rc=%d call-id=%d event='%s'",
 514                    transition_num, action_num, id, uname,
 515                    services_ocf_exitcode_str(target_rc),
 516                    services_ocf_exitcode_str(rc),
 517                    target_rc, rc, callid, desc);
 518 
 519     } else if (desc) {
 520         crm_info("Transition %d action %d (%s on %s): %s "
 521                  CRM_XS " rc=%d target-rc=%d call-id=%d",
 522                  transition_num, action_num, id, uname,
 523                  desc, rc, target_rc, callid);
 524 
 525     } else if (rc == target_rc) {
 526         crm_info("Transition %d action %d (%s on %s) confirmed: %s "
 527                  CRM_XS " rc=%d call-id=%d",
 528                  transition_num, action_num, id, uname,
 529                  services_ocf_exitcode_str(rc), rc, callid);
 530 
 531     } else {
 532         update_failcount(event, event_node, rc, target_rc,
 533                          (transition_num == -1), ignore_failures);
 534         crm_notice("Transition %d action %d (%s on %s): expected '%s' but got '%s' "
 535                    CRM_XS " target-rc=%d rc=%d call-id=%d",
 536                    transition_num, action_num, id, uname,
 537                    services_ocf_exitcode_str(target_rc),
 538                    services_ocf_exitcode_str(rc),
 539                    target_rc, rc, callid);
 540     }
 541 
 542   bail:
 543     free(update_te_uuid);
 544 }

/* [previous][next][first][last][top][bottom][index][help] */