root/crmd/te_events.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. fail_incompletable_actions
  2. update_failcount
  3. status_from_rc
  4. match_graph_event
  5. get_action
  6. get_cancel_action
  7. match_down_event
  8. process_graph_event

   1 /*
   2  * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This software is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 
  19 #include <crm_internal.h>
  20 
  21 #include <sys/param.h>
  22 #include <crm/crm.h>
  23 #include <crm/cib.h>
  24 #include <crm/msg_xml.h>
  25 
  26 #include <crm/common/xml.h>
  27 #include <tengine.h>
  28 
  29 #include <crmd_fsa.h>
  30 
  31 char *failed_stop_offset = NULL;   /* Consulted by update_failcount() after a failed stop: the fail count is set to this value only when it equals INFINITY_S, otherwise it is incremented; lazily defaulted to INFINITY_S while NULL. Not static, so presumably configured elsewhere - TODO confirm */
  32 char *failed_start_offset = NULL;  /* Same as failed_stop_offset, but consulted by update_failcount() after a failed start */
  33 
  34 gboolean
  35 fail_incompletable_actions(crm_graph_t * graph, const char *down_node)
     /* [previous][next][first][last][top][bottom][index][help] */
  36 {
  37     const char *target_uuid = NULL;
  38     const char *router = NULL;
  39     const char *router_uuid = NULL;
  40     xmlNode *last_action = NULL;
  41 
  42     GListPtr gIter = NULL;
  43     GListPtr gIter2 = NULL;
  44 
  45     if (graph == NULL || graph->complete) {
  46         return FALSE;
  47     }
  48 
  49     gIter = graph->synapses;
  50     for (; gIter != NULL; gIter = gIter->next) {
  51         synapse_t *synapse = (synapse_t *) gIter->data;
  52 
  53         if (synapse->confirmed || synapse->failed) {
  54             /* We've already been here */
  55             continue;
  56         }
  57 
  58         gIter2 = synapse->actions;
  59         for (; gIter2 != NULL; gIter2 = gIter2->next) {
  60             crm_action_t *action = (crm_action_t *) gIter2->data;
  61 
  62             if (action->type == action_type_pseudo || action->confirmed) {
  63                 continue;
  64             } else if (action->type == action_type_crm) {
  65                 const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
  66 
  67                 if (safe_str_eq(task, CRM_OP_FENCE)) {
  68                     continue;
  69                 }
  70             }
  71 
  72             target_uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
  73             router = crm_element_value(action->xml, XML_LRM_ATTR_ROUTER_NODE);
  74             if (router) {
  75                 crm_node_t *node = crm_get_peer(0, router);
  76                 if (node) {
  77                     router_uuid = node->uuid;
  78                 }
  79             }
  80 
  81             if (safe_str_eq(target_uuid, down_node) || safe_str_eq(router_uuid, down_node)) {
  82                 action->failed = TRUE;
  83                 synapse->failed = TRUE;
  84                 last_action = action->xml;
  85                 stop_te_timer(action->timer);
  86                 update_graph(graph, action);
  87 
  88                 if (synapse->executed) {
  89                     crm_notice("Action %d (%s) was pending on %s (offline)",
  90                                action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node);
  91                 } else {
  92                     crm_info("Action %d (%s) is scheduled for %s (offline)",
  93                              action->id, crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY), down_node);
  94                 }
  95             }
  96         }
  97     }
  98 
  99     if (last_action != NULL) {
 100         crm_info("Node %s shutdown resulted in un-runnable actions", down_node);
 101         abort_transition(INFINITY, tg_restart, "Node failure", last_action);
 102         return TRUE;
 103     }
 104 
 105     return FALSE;
 106 }
 107 
 108 /*!
 109  * \internal
 110  * \brief Update failure-related node attributes if warranted
 111  *
 112  * \param[in] event            XML describing operation that (maybe) failed
 113  * \param[in] event_node_uuid  Node that event occurred on
 114  * \param[in] rc               Actual operation return code
 115  * \param[in] target_rc        Expected operation return code
 116  * \param[in] do_update        If TRUE, do update regardless of operation type
 117  * \param[in] ignore_failures  If TRUE, update last failure but not fail count
 118  *
 119  * \return TRUE if this was not a direct nack, success or lrm status refresh
 120  */
 121 static gboolean
 122 update_failcount(xmlNode * event, const char *event_node_uuid, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 123                  int target_rc, gboolean do_update, gboolean ignore_failures)
 124 {
 125     int interval = 0;
 126 
 127     char *task = NULL;
 128     char *rsc_id = NULL;
 129 
 130     const char *value = NULL;
 131     const char *id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
 132     const char *on_uname = crm_peer_uname(event_node_uuid);
 133     const char *origin = crm_element_value(event, XML_ATTR_ORIGIN);
 134 
 135     /* Nothing needs to be done for success, lrm status refresh,
 136      * or direct nack (internal code for "busy, try again")
 137      */
 138     if ((rc == CRM_DIRECT_NACK_RC) || (rc == target_rc)) {
 139         return FALSE;
 140     } else if (safe_str_eq(origin, "build_active_RAs")) {
 141         crm_debug("No update for %s (rc=%d) on %s: Old failure from lrm status refresh",
 142                   id, rc, on_uname);
 143         return FALSE;
 144     }
 145 
 146     /* Sanity check */
 147     CRM_CHECK(on_uname != NULL, return TRUE);
 148     CRM_CHECK(parse_op_key(id, &rsc_id, &task, &interval),
 149               crm_err("Couldn't parse: %s", ID(event)); goto bail);
 150     CRM_CHECK(task != NULL, goto bail);
 151     CRM_CHECK(rsc_id != NULL, goto bail);
 152 
 153     /* Decide whether update is necessary and what value to use */
 154     if ((interval > 0) || safe_str_eq(task, CRMD_ACTION_PROMOTE)
 155         || safe_str_eq(task, CRMD_ACTION_DEMOTE)) {
 156         do_update = TRUE;
 157 
 158     } else if (safe_str_eq(task, CRMD_ACTION_START)) {
 159         do_update = TRUE;
 160         if (failed_start_offset == NULL) {
 161             failed_start_offset = strdup(INFINITY_S);
 162         }
 163         value = failed_start_offset;
 164 
 165     } else if (safe_str_eq(task, CRMD_ACTION_STOP)) {
 166         do_update = TRUE;
 167         if (failed_stop_offset == NULL) {
 168             failed_stop_offset = strdup(INFINITY_S);
 169         }
 170         value = failed_stop_offset;
 171     }
 172 
 173     /* Fail count will be either incremented or set to infinity */
 174     if (value == NULL || safe_str_neq(value, INFINITY_S)) {
 175         value = XML_NVPAIR_ATTR_VALUE "++";
 176     }
 177 
 178     if (do_update) {
 179         char *now = crm_itoa(time(NULL));
 180         char *attr_name = NULL;
 181         gboolean is_remote_node = FALSE;
 182 
 183         if (g_hash_table_lookup(crm_remote_peer_cache, event_node_uuid)) {
 184             is_remote_node = TRUE;
 185         }
 186 
 187         crm_info("Updating %s for %s on %s after failed %s: rc=%d (update=%s, time=%s)",
 188                  (ignore_failures? "last failure" : "failcount"),
 189                  rsc_id, on_uname, task, rc, value, now);
 190 
 191         /* Update the fail count, if we're not ignoring failures */
 192         if (!ignore_failures) {
 193             attr_name = crm_failcount_name(rsc_id, task, interval);
 194             update_attrd(on_uname, attr_name, value, NULL, is_remote_node);
 195             free(attr_name);
 196         }
 197 
 198         /* Update the last failure time (even if we're ignoring failures,
 199          * so that failure can still be detected and shown, e.g. by crm_mon)
 200          */
 201         attr_name = crm_lastfailure_name(rsc_id, task, interval);
 202         update_attrd(on_uname, attr_name, now, NULL, is_remote_node);
 203         free(attr_name);
 204 
 205         free(now);
 206     }
 207 
 208   bail:
 209     free(rsc_id);
 210     free(task);
 211     return TRUE;
 212 }
 213 
 214 /*!
 215  * \internal
 216  * \brief Return simplified operation status based on operation return code
 217  *
 218  * \param[in] action       CRM action instance of operation
 219  * \param[in] orig_status  Original reported operation status
 220  * \param[in] rc           Actual operation return code
 221  * \param[in] target_rc    Expected operation return code
 222  *
 223  * \return PCMK_LRM_OP_DONE if rc equals target_rc, PCMK_LRM_OP_ERROR otherwise
 224  *
 225  * \note This assumes that PCMK_LRM_OP_PENDING operations have already been
 226  *       filtered (otherwise they will get simplified as well).
 227  */
 228 static int
 229 status_from_rc(crm_action_t * action, int orig_status, int rc, int target_rc)
     /* [previous][next][first][last][top][bottom][index][help] */
 230 {
 231     if (target_rc == rc) {
 232         crm_trace("Target rc: == %d", rc);
 233         if (orig_status != PCMK_LRM_OP_DONE) {
 234             crm_trace("Re-mapping op status to PCMK_LRM_OP_DONE for rc=%d", rc);
 235         }
 236         return PCMK_LRM_OP_DONE;
 237     }
 238 
 239     if (rc != CRM_DIRECT_NACK_RC) {
 240         const char *task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY);
 241         const char *uname = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 242 
 243         crm_warn("Action %d (%s) on %s failed (target: %d vs. rc: %d): %s",
 244                  action->id, task, uname, target_rc, rc,
 245                  services_lrm_status_str(PCMK_LRM_OP_ERROR));
 246     }
 247     return PCMK_LRM_OP_ERROR;
 248 }
 249 
 250 /*!
 251  * \internal
 252  * \brief Confirm action and update transition graph, aborting transition on failures
 253  *
 254  * \param[in,out] action           CRM action instance of this operation
 255  * \param[in]     event            Event instance of this operation
 256  * \param[in]     orig_status      Original reported operation status
 257  * \param[in]     op_rc            Actual operation return code
 258  * \param[in]     target_rc        Expected operation return code
 259  * \param[in]     ignore_failures  Whether to ignore operation failures
 260  *
 261  * \note This assumes that PCMK_LRM_OP_PENDING operations have already been
 262  *       filtered (otherwise they may be treated as failures).
 263  */
 264 static void
 265 match_graph_event(crm_action_t *action, xmlNode *event, int op_status,
     /* [previous][next][first][last][top][bottom][index][help] */
 266                   int op_rc, int target_rc, gboolean ignore_failures)
 267 {
 268     const char *target = NULL;
 269     const char *this_event = NULL;
 270     const char *ignore_s = "";
 271 
 272     /* Remap operation status based on return code */
 273     op_status = status_from_rc(action, op_status, op_rc, target_rc);
 274 
 275     /* Process OP status */
 276     switch (op_status) {
 277         case PCMK_LRM_OP_DONE:
 278             break;
 279         case PCMK_LRM_OP_ERROR:
 280         case PCMK_LRM_OP_TIMEOUT:
 281         case PCMK_LRM_OP_NOTSUPPORTED:
 282             if (ignore_failures) {
 283                 ignore_s = ", ignoring failure";
 284             } else {
 285                 action->failed = TRUE;
 286             }
 287             break;
 288         case PCMK_LRM_OP_CANCELLED:
 289             /* do nothing?? */
 290             crm_err("Don't know what to do for cancelled ops yet");
 291             break;
 292         default:
 293             /*
 294              PCMK_LRM_OP_ERROR_HARD,
 295              PCMK_LRM_OP_ERROR_FATAL,
 296              PCMK_LRM_OP_NOT_INSTALLED
 297              */
 298             action->failed = TRUE;
 299             crm_err("Unsupported action result: %d", op_status);
 300     }
 301 
 302     /* stop this event's timer if it had one */
 303     stop_te_timer(action->timer);
 304     te_action_confirmed(action);
 305 
 306     update_graph(transition_graph, action);
 307     trigger_graph();
 308 
 309     if (action->failed) {
 310         abort_transition(action->synapse->priority + 1, tg_restart, "Event failed", event);
 311     }
 312 
 313     this_event = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
 314     target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 315     crm_info("Action %s (%d) confirmed on %s (rc=%d%s)",
 316              crm_str(this_event), action->id, crm_str(target), op_rc, ignore_s);
 317 }
 318 
 319 crm_action_t *
 320 get_action(int id, gboolean confirmed)
     /* [previous][next][first][last][top][bottom][index][help] */
 321 {
 322     GListPtr gIter = NULL;
 323     GListPtr gIter2 = NULL;
 324 
 325     gIter = transition_graph->synapses;
 326     for (; gIter != NULL; gIter = gIter->next) {
 327         synapse_t *synapse = (synapse_t *) gIter->data;
 328 
 329         gIter2 = synapse->actions;
 330         for (; gIter2 != NULL; gIter2 = gIter2->next) {
 331             crm_action_t *action = (crm_action_t *) gIter2->data;
 332 
 333             if (action->id == id) {
 334                 if (confirmed) {
 335                     stop_te_timer(action->timer);
 336                     te_action_confirmed(action);
 337                 }
 338                 return action;
 339             }
 340         }
 341     }
 342 
 343     return NULL;
 344 }
 345 
 346 crm_action_t *
 347 get_cancel_action(const char *id, const char *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 348 {
 349     GListPtr gIter = NULL;
 350     GListPtr gIter2 = NULL;
 351 
 352     gIter = transition_graph->synapses;
 353     for (; gIter != NULL; gIter = gIter->next) {
 354         synapse_t *synapse = (synapse_t *) gIter->data;
 355 
 356         gIter2 = synapse->actions;
 357         for (; gIter2 != NULL; gIter2 = gIter2->next) {
 358             const char *task = NULL;
 359             const char *target = NULL;
 360             crm_action_t *action = (crm_action_t *) gIter2->data;
 361 
 362             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
 363             if (safe_str_neq(CRMD_ACTION_CANCEL, task)) {
 364                 continue;
 365             }
 366 
 367             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK_KEY);
 368             if (safe_str_neq(task, id)) {
 369                 crm_trace("Wrong key %s for %s on %s", task, id, node);
 370                 continue;
 371             }
 372 
 373             target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 374             if (node && safe_str_neq(target, node)) {
 375                 crm_trace("Wrong node %s for %s on %s", target, id, node);
 376                 continue;
 377             }
 378 
 379             crm_trace("Found %s on %s", id, node);
 380             return action;
 381         }
 382     }
 383 
 384     return NULL;
 385 }
 386 
 387 /* downed nodes are listed like: <downed> <node id="UUID1" /> ... </downed> */
 388 #define XPATH_DOWNED "//" XML_GRAPH_TAG_DOWNED \
 389                      "/" XML_CIB_TAG_NODE "[@" XML_ATTR_UUID "='%s']"
 390 
 391 /*!
 392  * \brief Find a transition event that would have made a specified node down
 393  *
 394  * \param[in] target  UUID of node to match
 395  * \param[in] quiet   If FALSE, log a warning if no match found
 396  *
 397  * \return Matching event if found, NULL otherwise
 398  */
 399 crm_action_t *
 400 match_down_event(const char *target, bool quiet)
     /* [previous][next][first][last][top][bottom][index][help] */
 401 {
 402     crm_action_t *match = NULL;
 403     xmlXPathObjectPtr xpath_ret = NULL;
 404     GListPtr gIter, gIter2;
 405 
 406     char *xpath = crm_strdup_printf(XPATH_DOWNED, target);
 407 
 408     for (gIter = transition_graph->synapses;
 409          gIter != NULL && match == NULL;
 410          gIter = gIter->next) {
 411 
 412         for (gIter2 = ((synapse_t*)gIter->data)->actions;
 413              gIter2 != NULL && match == NULL;
 414              gIter2 = gIter2->next) {
 415 
 416             match = (crm_action_t*)gIter2->data;
 417             xpath_ret = xpath_search(match->xml, xpath);
 418             if (numXpathResults(xpath_ret) < 1) {
 419                 match = NULL;
 420             }
 421             freeXpathObject(xpath_ret);
 422         }
 423     }
 424 
 425     free(xpath);
 426 
 427     if (match != NULL) {
 428         crm_debug("Shutdown action found for node %s: action %d (%s)",
 429                   target, match->id,
 430                   crm_element_value(match->xml, XML_LRM_ATTR_TASK_KEY));
 431 
 432     } else if(quiet == FALSE) {
 433         crm_warn("No reason to expect node %s to be down", target);
 434     }
 435 
 436     return match;
 437 }
 438 
 439 gboolean
 440 process_graph_event(xmlNode * event, const char *event_node)
     /* [previous][next][first][last][top][bottom][index][help] */
 441 {
 442     int rc = -1;
 443     int status = -1;
 444     int callid = -1;
 445 
 446     int action_num = -1;
 447     crm_action_t *action = NULL;
 448 
 449     int target_rc = -1;
 450     int transition_num = -1;
 451     char *update_te_uuid = NULL;
 452 
 453     gboolean stop_early = FALSE;
 454     gboolean ignore_failures = FALSE;
 455     const char *id = NULL;
 456     const char *desc = NULL;
 457     const char *magic = NULL;
 458 
 459     CRM_ASSERT(event != NULL);
 460 
 461 /*
 462 <lrm_rsc_op id="rsc_east-05_last_0" operation_key="rsc_east-05_monitor_0" operation="monitor" crm-debug-origin="do_update_resource" crm_feature_set="3.0.6" transition-key="9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" transition-magic="0:7;9:2:7:be2e97d9-05e2-439d-863e-48f7aecab2aa" call-id="17" rc-code="7" op-status="0" interval="0" last-run="1355361636" last-rc-change="1355361636" exec-time="128" queue-time="0" op-digest="c81f5f40b1c9e859c992e800b1aa6972"/>
 463 */
 464 
 465     id = crm_element_value(event, XML_LRM_ATTR_TASK_KEY);
 466     crm_element_value_int(event, XML_LRM_ATTR_RC, &rc);
 467     crm_element_value_int(event, XML_LRM_ATTR_OPSTATUS, &status);
 468     crm_element_value_int(event, XML_LRM_ATTR_CALLID, &callid);
 469 
 470     magic = crm_element_value(event, XML_ATTR_TRANSITION_KEY);
 471     if (magic == NULL) {
 472         /* non-change */
 473         return FALSE;
 474     }
 475 
 476     if (decode_transition_key(magic, &update_te_uuid, &transition_num,
 477                               &action_num, &target_rc) == FALSE) {
 478         crm_err("Invalid event %s.%d detected: %s", id, callid, magic);
 479         abort_transition(INFINITY, tg_restart, "Bad event", event);
 480         return FALSE;
 481     }
 482 
 483     if (status == PCMK_LRM_OP_PENDING) {
 484         goto bail;
 485     }
 486 
 487     if (transition_num == -1) {
 488         desc = "initiated outside of the cluster";
 489         abort_transition(INFINITY, tg_restart, "Unexpected event", event);
 490 
 491     } else if ((action_num < 0) || (crm_str_eq(update_te_uuid, te_uuid, TRUE) == FALSE)) {
 492         desc = "initiated by a different node";
 493         abort_transition(INFINITY, tg_restart, "Foreign event", event);
 494         stop_early = TRUE;      /* This could be an lrm status refresh */
 495 
 496     } else if (transition_graph->id != transition_num) {
 497         desc = "arrived really late";
 498         abort_transition(INFINITY, tg_restart, "Old event", event);
 499         stop_early = TRUE;      /* This could be an lrm status refresh */
 500 
 501     } else if (transition_graph->complete) {
 502         desc = "arrived late";
 503         abort_transition(INFINITY, tg_restart, "Inactive graph", event);
 504 
 505     } else {
 506         action = get_action(action_num, FALSE);
 507 
 508         if (action == NULL) {
 509             desc = "unknown";
 510             abort_transition(INFINITY, tg_restart, "Unknown event", event);
 511 
 512         } else {
 513             ignore_failures = safe_str_eq(
 514                 crm_meta_value(action->params, XML_OP_ATTR_ON_FAIL), "ignore");
 515             match_graph_event(action, event, status, rc, target_rc, ignore_failures);
 516         }
 517     }
 518 
 519     if (action && (rc == target_rc)) {
 520         crm_trace("Processed update to %s: %s", id, magic);
 521     } else {
 522         if (update_failcount(event, event_node, rc, target_rc,
 523                              (transition_num == -1), ignore_failures)) {
 524             /* Turns out this wasn't an lrm status refresh update afterall */
 525             stop_early = FALSE;
 526             desc = "failed";
 527         }
 528         crm_info("Detected action (%d.%d) %s.%d=%s: %s", transition_num,
 529                  action_num, id, callid, services_ocf_exitcode_str(rc), desc);
 530     }
 531 
 532   bail:
 533     free(update_te_uuid);
 534     return stop_early;
 535 }

/* [previous][next][first][last][top][bottom][index][help] */