root/daemons/controld/controld_fencing.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. update_stonith_max_attempts
  2. set_fence_reaction
  3. controld_configure_fencing
  4. too_many_st_failures
  5. st_fail_count_reset
  6. st_fail_count_increment
  7. cib_fencing_updated
  8. send_stonith_update
  9. abort_for_stonith_failure
  10. add_stonith_cleanup
  11. remove_stonith_cleanup
  12. purge_stonith_cleanup
  13. execute_stonith_cleanup
  14. fail_incompletable_stonith
  15. tengine_stonith_connection_destroy
  16. handle_fence_notification
  17. controld_timer_fencer_connect
  18. controld_disconnect_fencer
  19. do_stonith_history_sync
  20. tengine_stonith_callback
  21. fence_with_delay
  22. controld_execute_fence_action
  23. controld_verify_stonith_watchdog_timeout
  24. te_cleanup_stonith_history_sync
  25. tengine_stonith_history_synced
  26. stonith_history_sync_set_trigger
  27. te_trigger_stonith_history_sync

   1 /*
   2  * Copyright 2004-2024 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 #include <crm/crm.h>
  12 #include <crm/common/xml.h>
  13 #include <crm/stonith-ng.h>
  14 #include <crm/fencing/internal.h>
  15 
  16 #include <pacemaker-controld.h>
  17 
  18 static void
  19 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event);
  20 
  21 /*
  22  * stonith failure counting
  23  *
  24  * We don't want to get stuck in a permanent fencing loop. Keep track of the
  25  * number of fencing failures for each target node, and the most we'll restart a
  26  * transition for.
  27  */
  28 
  29 struct st_fail_rec {
  30     int count;
  31 };
  32 
  33 #define DEFAULT_STONITH_MAX_ATTEMPTS 10
  34 
  35 static bool fence_reaction_panic = false;
  36 static unsigned long int stonith_max_attempts = DEFAULT_STONITH_MAX_ATTEMPTS;
  37 static GHashTable *stonith_failures = NULL;
  38 
  39 /*!
  40  * \internal
  41  * \brief Update max fencing attempts before giving up
  42  *
  43  * \param[in] value  New max fencing attempts
  44  */
  45 static void
  46 update_stonith_max_attempts(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
  47 {
  48     int score = 0;
  49     int rc = pcmk_parse_score(value, &score, DEFAULT_STONITH_MAX_ATTEMPTS);
  50 
  51     // The option validator ensures invalid values shouldn't be possible
  52     CRM_CHECK((rc == pcmk_rc_ok) && (score > 0), return);
  53 
  54     if (stonith_max_attempts != score) {
  55         crm_debug("Maximum fencing attempts per transition is now %d (was %lu)",
  56                   score, stonith_max_attempts);
  57     }
  58     stonith_max_attempts = score;
  59 }
  60 
  61 /*!
  62  * \internal
  63  * \brief Configure reaction to notification of local node being fenced
  64  *
  65  * \param[in] reaction_s  Reaction type
  66  */
  67 static void
  68 set_fence_reaction(const char *reaction_s)
     /* [previous][next][first][last][top][bottom][index][help] */
  69 {
  70     if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
  71         fence_reaction_panic = true;
  72 
  73     } else {
  74         if (!pcmk__str_eq(reaction_s, PCMK_VALUE_STOP, pcmk__str_casei)) {
  75             crm_warn("Invalid value '%s' for %s, using 'stop'",
  76                      reaction_s, PCMK_OPT_FENCE_REACTION);
  77         }
  78         fence_reaction_panic = false;
  79     }
  80 }
  81 
  82 /*!
  83  * \internal
  84  * \brief Configure fencing options based on the CIB
  85  *
  86  * \param[in,out] options  Name/value pairs for configured options
  87  */
  88 void
  89 controld_configure_fencing(GHashTable *options)
     /* [previous][next][first][last][top][bottom][index][help] */
  90 {
  91     const char *value = NULL;
  92 
  93     value = g_hash_table_lookup(options, PCMK_OPT_FENCE_REACTION);
  94     set_fence_reaction(value);
  95 
  96     value = g_hash_table_lookup(options, PCMK_OPT_STONITH_MAX_ATTEMPTS);
  97     update_stonith_max_attempts(value);
  98 }
  99 
 100 static gboolean
 101 too_many_st_failures(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 102 {
 103     GHashTableIter iter;
 104     const char *key = NULL;
 105     struct st_fail_rec *value = NULL;
 106 
 107     if (stonith_failures == NULL) {
 108         return FALSE;
 109     }
 110 
 111     if (target == NULL) {
 112         g_hash_table_iter_init(&iter, stonith_failures);
 113         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 114                (gpointer *) &value)) {
 115 
 116             if (value->count >= stonith_max_attempts) {
 117                 target = (const char*)key;
 118                 goto too_many;
 119             }
 120         }
 121     } else {
 122         value = g_hash_table_lookup(stonith_failures, target);
 123         if ((value != NULL) && (value->count >= stonith_max_attempts)) {
 124             goto too_many;
 125         }
 126     }
 127     return FALSE;
 128 
 129 too_many:
 130     crm_warn("Too many failures (%d) to fence %s, giving up",
 131              value->count, target);
 132     return TRUE;
 133 }
 134 
 135 /*!
 136  * \internal
 137  * \brief Reset a stonith fail count
 138  *
 139  * \param[in] target  Name of node to reset, or NULL for all
 140  */
 141 void
 142 st_fail_count_reset(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 143 {
 144     if (stonith_failures == NULL) {
 145         return;
 146     }
 147 
 148     if (target) {
 149         struct st_fail_rec *rec = NULL;
 150 
 151         rec = g_hash_table_lookup(stonith_failures, target);
 152         if (rec) {
 153             rec->count = 0;
 154         }
 155     } else {
 156         GHashTableIter iter;
 157         const char *key = NULL;
 158         struct st_fail_rec *rec = NULL;
 159 
 160         g_hash_table_iter_init(&iter, stonith_failures);
 161         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 162                                       (gpointer *) &rec)) {
 163             rec->count = 0;
 164         }
 165     }
 166 }
 167 
 168 static void
 169 st_fail_count_increment(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 170 {
 171     struct st_fail_rec *rec = NULL;
 172 
 173     if (stonith_failures == NULL) {
 174         stonith_failures = pcmk__strkey_table(free, free);
 175     }
 176 
 177     rec = g_hash_table_lookup(stonith_failures, target);
 178     if (rec) {
 179         rec->count++;
 180     } else {
 181         rec = malloc(sizeof(struct st_fail_rec));
 182         if(rec == NULL) {
 183             return;
 184         }
 185 
 186         rec->count = 1;
 187         g_hash_table_insert(stonith_failures, pcmk__str_copy(target), rec);
 188     }
 189 }
 190 
 191 /* end stonith fail count functions */
 192 
 193 
 194 static void
 195 cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
     /* [previous][next][first][last][top][bottom][index][help] */
 196                     void *user_data)
 197 {
 198     if (rc < pcmk_ok) {
 199         crm_err("Fencing update %d for %s: failed - %s (%d)",
 200                 call_id, (char *)user_data, pcmk_strerror(rc), rc);
 201         crm_log_xml_warn(msg, "Failed update");
 202         abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_shutdown,
 203                          "CIB update failed", NULL);
 204 
 205     } else {
 206         crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
 207     }
 208 }
 209 
 210 static void
 211 send_stonith_update(pcmk__graph_action_t *action, const char *target,
     /* [previous][next][first][last][top][bottom][index][help] */
 212                     const char *uuid)
 213 {
 214     int rc = pcmk_ok;
 215     crm_node_t *peer = NULL;
 216 
 217     /* We (usually) rely on the membership layer to do node_update_cluster,
 218      * and the peer status callback to do node_update_peer, because the node
 219      * might have already rejoined before we get the stonith result here.
 220      */
 221     int flags = node_update_join | node_update_expected;
 222 
 223     /* zero out the node-status & remove all LRM status info */
 224     xmlNode *node_state = NULL;
 225 
 226     CRM_CHECK(target != NULL, return);
 227     CRM_CHECK(uuid != NULL, return);
 228 
 229     /* Make sure the membership and join caches are accurate.
 230      * Try getting any existing node cache entry also by node uuid in case it
 231      * doesn't have an uname yet.
 232      */
 233     peer = pcmk__get_node(0, target, uuid, pcmk__node_search_any);
 234 
 235     CRM_CHECK(peer != NULL, return);
 236 
 237     if (peer->state == NULL) {
 238         /* Usually, we rely on the membership layer to update the cluster state
 239          * in the CIB. However, if the node has never been seen, do it here, so
 240          * the node is not considered unclean.
 241          */
 242         flags |= node_update_cluster;
 243     }
 244 
 245     if (peer->uuid == NULL) {
 246         crm_info("Recording uuid '%s' for node '%s'", uuid, target);
 247         peer->uuid = pcmk__str_copy(uuid);
 248     }
 249 
 250     crmd_peer_down(peer, TRUE);
 251 
 252     /* Generate a node state update for the CIB */
 253     node_state = create_node_state_update(peer, flags, NULL, __func__);
 254 
 255     /* we have to mark whether or not remote nodes have already been fenced */
 256     if (peer->flags & crm_remote_node) {
 257         char *now_s = pcmk__ttoa(time(NULL));
 258 
 259         crm_xml_add(node_state, PCMK__XA_NODE_FENCED, now_s);
 260         free(now_s);
 261     }
 262 
 263     /* Force our known ID */
 264     crm_xml_add(node_state, PCMK_XA_ID, uuid);
 265 
 266     rc = controld_globals.cib_conn->cmds->modify(controld_globals.cib_conn,
 267                                                  PCMK_XE_STATUS, node_state,
 268                                                  cib_scope_local
 269                                                  |cib_can_create);
 270 
 271     /* Delay processing the trigger until the update completes */
 272     crm_debug("Sending fencing update %d for %s", rc, target);
 273     fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated);
 274 
 275     // Make sure it sticks
 276     /* controld_globals.cib_conn->cmds->bump_epoch(controld_globals.cib_conn,
 277      *                                             cib_scope_local);
 278      */
 279 
 280     controld_delete_node_state(peer->uname, controld_section_all,
 281                                cib_scope_local);
 282     free_xml(node_state);
 283     return;
 284 }
 285 
 286 /*!
 287  * \internal
 288  * \brief Abort transition due to stonith failure
 289  *
 290  * \param[in] abort_action  Whether to restart or stop transition
 291  * \param[in] target  Don't restart if this (NULL for any) has too many failures
 292  * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
 293  */
 294 static void
 295 abort_for_stonith_failure(enum pcmk__graph_next abort_action,
     /* [previous][next][first][last][top][bottom][index][help] */
 296                           const char *target, const xmlNode *reason)
 297 {
 298     /* If stonith repeatedly fails, we eventually give up on starting a new
 299      * transition for that reason.
 300      */
 301     if ((abort_action != pcmk__graph_wait) && too_many_st_failures(target)) {
 302         abort_action = pcmk__graph_wait;
 303     }
 304     abort_transition(PCMK_SCORE_INFINITY, abort_action, "Stonith failed",
 305                      reason);
 306 }
 307 
 308 
 309 /*
 310  * stonith cleanup list
 311  *
 312  * If the DC is shot, proper notifications might not go out.
 313  * The stonith cleanup list allows the cluster to (re-)send
 314  * notifications once a new DC is elected.
 315  */
 316 
 317 static GList *stonith_cleanup_list = NULL;
 318 
 319 /*!
 320  * \internal
 321  * \brief Add a node to the stonith cleanup list
 322  *
 323  * \param[in] target  Name of node to add
 324  */
 325 void
 326 add_stonith_cleanup(const char *target) {
     /* [previous][next][first][last][top][bottom][index][help] */
 327     stonith_cleanup_list = g_list_append(stonith_cleanup_list,
 328                                          pcmk__str_copy(target));
 329 }
 330 
 331 /*!
 332  * \internal
 333  * \brief Remove a node from the stonith cleanup list
 334  *
 335  * \param[in] Name of node to remove
 336  */
 337 void
 338 remove_stonith_cleanup(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 339 {
 340     GList *iter = stonith_cleanup_list;
 341 
 342     while (iter != NULL) {
 343         GList *tmp = iter;
 344         char *iter_name = tmp->data;
 345 
 346         iter = iter->next;
 347         if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
 348             crm_trace("Removing %s from the cleanup list", iter_name);
 349             stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
 350             free(iter_name);
 351         }
 352     }
 353 }
 354 
 355 /*!
 356  * \internal
 357  * \brief Purge all entries from the stonith cleanup list
 358  */
 359 void
 360 purge_stonith_cleanup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 361 {
 362     if (stonith_cleanup_list) {
 363         GList *iter = NULL;
 364 
 365         for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 366             char *target = iter->data;
 367 
 368             crm_info("Purging %s from stonith cleanup list", target);
 369             free(target);
 370         }
 371         g_list_free(stonith_cleanup_list);
 372         stonith_cleanup_list = NULL;
 373     }
 374 }
 375 
 376 /*!
 377  * \internal
 378  * \brief Send stonith updates for all entries in cleanup list, then purge it
 379  */
 380 void
 381 execute_stonith_cleanup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 382 {
 383     GList *iter;
 384 
 385     for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 386         char *target = iter->data;
 387         crm_node_t *target_node =
 388             pcmk__get_node(0, target, NULL, pcmk__node_search_cluster_member);
 389         const char *uuid = pcmk__cluster_node_uuid(target_node);
 390 
 391         crm_notice("Marking %s, target of a previous stonith action, as clean", target);
 392         send_stonith_update(NULL, target, uuid);
 393         free(target);
 394     }
 395     g_list_free(stonith_cleanup_list);
 396     stonith_cleanup_list = NULL;
 397 }
 398 
 399 /* end stonith cleanup list functions */
 400 
 401 
 402 /* stonith API client
 403  *
 404  * Functions that need to interact directly with the fencer via its API
 405  */
 406 
 407 static stonith_t *stonith_api = NULL;
 408 static mainloop_timer_t *controld_fencer_connect_timer = NULL;
 409 static char *te_client_id = NULL;
 410 
 411 static gboolean
 412 fail_incompletable_stonith(pcmk__graph_t *graph)
     /* [previous][next][first][last][top][bottom][index][help] */
 413 {
 414     GList *lpc = NULL;
 415     const char *task = NULL;
 416     xmlNode *last_action = NULL;
 417 
 418     if (graph == NULL) {
 419         return FALSE;
 420     }
 421 
 422     for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
 423         GList *lpc2 = NULL;
 424         pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) lpc->data;
 425 
 426         if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) {
 427             continue;
 428         }
 429 
 430         for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
 431             pcmk__graph_action_t *action = (pcmk__graph_action_t *) lpc2->data;
 432 
 433             if ((action->type != pcmk__cluster_graph_action)
 434                 || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
 435                 continue;
 436             }
 437 
 438             task = crm_element_value(action->xml, PCMK_XA_OPERATION);
 439             if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) {
 440                 pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
 441                 last_action = action->xml;
 442                 pcmk__update_graph(graph, action);
 443                 crm_notice("Failing action %d (%s): fencer terminated",
 444                            action->id, pcmk__xe_id(action->xml));
 445             }
 446         }
 447     }
 448 
 449     if (last_action != NULL) {
 450         crm_warn("Fencer failure resulted in unrunnable actions");
 451         abort_for_stonith_failure(pcmk__graph_restart, NULL, last_action);
 452         return TRUE;
 453     }
 454 
 455     return FALSE;
 456 }
 457 
 458 static void
 459 tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
 460 {
 461     te_cleanup_stonith_history_sync(st, FALSE);
 462 
 463     if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) {
 464         crm_err("Lost fencer connection (will attempt to reconnect)");
 465         if (!mainloop_timer_running(controld_fencer_connect_timer)) {
 466             mainloop_timer_start(controld_fencer_connect_timer);
 467         }
 468     } else {
 469         crm_info("Disconnected from fencer");
 470     }
 471 
 472     if (stonith_api) {
 473         /* the client API won't properly reconnect notifications
 474          * if they are still in the table - so remove them
 475          */
 476         if (stonith_api->state != stonith_disconnected) {
 477             stonith_api->cmds->disconnect(st);
 478         }
 479         stonith_api->cmds->remove_notification(stonith_api, NULL);
 480     }
 481 
 482     if (AM_I_DC) {
 483         fail_incompletable_stonith(controld_globals.transition_graph);
 484         trigger_graph();
 485     }
 486 }
 487 
 488 /*!
 489  * \internal
 490  * \brief Handle an event notification from the fencing API
 491  *
 492  * \param[in] st     Fencing API connection (ignored)
 493  * \param[in] event  Fencing API event notification
 494  */
 495 static void
 496 handle_fence_notification(stonith_t *st, stonith_event_t *event)
     /* [previous][next][first][last][top][bottom][index][help] */
 497 {
 498     bool succeeded = true;
 499     const char *executioner = "the cluster";
 500     const char *client = "a client";
 501     const char *reason = NULL;
 502     int exec_status;
 503 
 504     if (te_client_id == NULL) {
 505         te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
 506                                          (unsigned long) getpid());
 507     }
 508 
 509     if (event == NULL) {
 510         crm_err("Notify data not found");
 511         return;
 512     }
 513 
 514     if (event->executioner != NULL) {
 515         executioner = event->executioner;
 516     }
 517     if (event->client_origin != NULL) {
 518         client = event->client_origin;
 519     }
 520 
 521     exec_status = stonith__event_execution_status(event);
 522     if ((stonith__event_exit_status(event) != CRM_EX_OK)
 523         || (exec_status != PCMK_EXEC_DONE)) {
 524         succeeded = false;
 525         if (exec_status == PCMK_EXEC_DONE) {
 526             exec_status = PCMK_EXEC_ERROR;
 527         }
 528     }
 529     reason = stonith__event_exit_reason(event);
 530 
 531     crmd_alert_fencing_op(event);
 532 
 533     if (pcmk__str_eq(PCMK_ACTION_ON, event->action, pcmk__str_none)) {
 534         // Unfencing doesn't need special handling, just a log message
 535         if (succeeded) {
 536             crm_notice("%s was unfenced by %s at the request of %s@%s",
 537                        event->target, executioner, client, event->origin);
 538         } else {
 539             crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d",
 540                     event->target, executioner,
 541                     pcmk_exec_status_str(exec_status),
 542                     ((reason == NULL)? "" : ": "),
 543                     ((reason == NULL)? "" : reason),
 544                     stonith__event_exit_status(event));
 545         }
 546         return;
 547     }
 548 
 549     if (succeeded
 550         && pcmk__str_eq(event->target, controld_globals.our_nodename,
 551                         pcmk__str_casei)) {
 552         /* We were notified of our own fencing. Most likely, either fencing was
 553          * misconfigured, or fabric fencing that doesn't cut cluster
 554          * communication is in use.
 555          *
 556          * Either way, shutting down the local host is a good idea, to require
 557          * administrator intervention. Also, other nodes would otherwise likely
 558          * set our status to lost because of the fencing callback and discard
 559          * our subsequent election votes as "not part of our cluster".
 560          */
 561         crm_crit("We were allegedly just fenced by %s for %s!",
 562                  executioner, event->origin); // Dumps blackbox if enabled
 563         if (fence_reaction_panic) {
 564             pcmk__panic(__func__);
 565         } else {
 566             crm_exit(CRM_EX_FATAL);
 567         }
 568         return; // Should never get here
 569     }
 570 
 571     /* Update the count of fencing failures for this target, in case we become
 572      * DC later. The current DC has already updated its fail count in
 573      * tengine_stonith_callback().
 574      */
 575     if (!AM_I_DC) {
 576         if (succeeded) {
 577             st_fail_count_reset(event->target);
 578         } else {
 579             st_fail_count_increment(event->target);
 580         }
 581     }
 582 
 583     crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: "
 584                "%s%s%s%s " CRM_XS " event=%s",
 585                event->target, (succeeded? "" : " not"),
 586                event->action, executioner, client, event->origin,
 587                (succeeded? "OK" : pcmk_exec_status_str(exec_status)),
 588                ((reason == NULL)? "" : " ("),
 589                ((reason == NULL)? "" : reason),
 590                ((reason == NULL)? "" : ")"),
 591                event->id);
 592 
 593     if (succeeded) {
 594         const uint32_t flags = pcmk__node_search_any
 595                                |pcmk__node_search_cluster_cib;
 596 
 597         crm_node_t *peer = pcmk__search_node_caches(0, event->target, flags);
 598         const char *uuid = NULL;
 599 
 600         if (peer == NULL) {
 601             return;
 602         }
 603 
 604         uuid = pcmk__cluster_node_uuid(peer);
 605 
 606         if (AM_I_DC) {
 607             /* The DC always sends updates */
 608             send_stonith_update(NULL, event->target, uuid);
 609 
 610             /* @TODO Ideally, at this point, we'd check whether the fenced node
 611              * hosted any guest nodes, and call remote_node_down() for them.
 612              * Unfortunately, the controller doesn't have a simple, reliable way
 613              * to map hosts to guests. It might be possible to track this in the
 614              * peer cache via refresh_remote_nodes(). For now, we rely on the
 615              * scheduler creating fence pseudo-events for the guests.
 616              */
 617 
 618             if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) {
 619                 /* Abort the current transition if it wasn't the cluster that
 620                  * initiated fencing.
 621                  */
 622                 crm_info("External fencing operation from %s fenced %s",
 623                          client, event->target);
 624                 abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
 625                                  "External Fencing Operation", NULL);
 626             }
 627 
 628         } else if (pcmk__str_eq(controld_globals.dc_name, event->target,
 629                                 pcmk__str_null_matches|pcmk__str_casei)
 630                    && !pcmk_is_set(peer->flags, crm_remote_node)) {
 631             // Assume the target was our DC if we don't currently have one
 632 
 633             if (controld_globals.dc_name != NULL) {
 634                 crm_notice("Fencing target %s was our DC", event->target);
 635             } else {
 636                 crm_notice("Fencing target %s may have been our DC",
 637                            event->target);
 638             }
 639 
 640             /* Given the CIB resyncing that occurs around elections,
 641              * have one node update the CIB now and, if the new DC is different,
 642              * have them do so too after the election
 643              */
 644             if (pcmk__str_eq(event->executioner, controld_globals.our_nodename,
 645                              pcmk__str_casei)) {
 646                 send_stonith_update(NULL, event->target, uuid);
 647             }
 648             add_stonith_cleanup(event->target);
 649         }
 650 
 651         /* If the target is a remote node, and we host its connection,
 652          * immediately fail all monitors so it can be recovered quickly.
 653          * The connection won't necessarily drop when a remote node is fenced,
 654          * so the failure might not otherwise be detected until the next poke.
 655          */
 656         if (pcmk_is_set(peer->flags, crm_remote_node)) {
 657             remote_ra_fail(event->target);
 658         }
 659 
 660         crmd_peer_down(peer, TRUE);
 661      }
 662 }
 663 
 664 /*!
 665  * \brief Connect to fencer
 666  *
 667  * \param[in] user_data  If NULL, retry failures now, otherwise retry in mainloop timer
 668  *
 669  * \return G_SOURCE_REMOVE on success, G_SOURCE_CONTINUE to retry
 670  * \note If user_data is NULL, this will wait 2s between attempts, for up to
 671  *       30 attempts, meaning the controller could be blocked as long as 58s.
 672  */
 673 gboolean
 674 controld_timer_fencer_connect(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 675 {
 676     int rc = pcmk_ok;
 677 
 678     if (stonith_api == NULL) {
 679         stonith_api = stonith_api_new();
 680         if (stonith_api == NULL) {
 681             crm_err("Could not connect to fencer: API memory allocation failed");
 682             return G_SOURCE_REMOVE;
 683         }
 684     }
 685 
 686     if (stonith_api->state != stonith_disconnected) {
 687         crm_trace("Already connected to fencer, no need to retry");
 688         return G_SOURCE_REMOVE;
 689     }
 690 
 691     if (user_data == NULL) {
 692         // Blocking (retry failures now until successful)
 693         rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
 694         if (rc != pcmk_ok) {
 695             crm_err("Could not connect to fencer in 30 attempts: %s "
 696                     CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 697         }
 698     } else {
 699         // Non-blocking (retry failures later in main loop)
 700         rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
 701 
 702         if (controld_fencer_connect_timer == NULL) {
 703             controld_fencer_connect_timer =
 704                 mainloop_timer_add("controld_fencer_connect", 1000,
 705                                    TRUE, controld_timer_fencer_connect,
 706                                    GINT_TO_POINTER(TRUE));
 707         }
 708 
 709         if (rc != pcmk_ok) {
 710             if (pcmk_is_set(controld_globals.fsa_input_register,
 711                             R_ST_REQUIRED)) {
 712                 crm_notice("Fencer connection failed (will retry): %s "
 713                            CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 714 
 715                 if (!mainloop_timer_running(controld_fencer_connect_timer)) {
 716                     mainloop_timer_start(controld_fencer_connect_timer);
 717                 }
 718 
 719                 return G_SOURCE_CONTINUE;
 720             } else {
 721                 crm_info("Fencer connection failed (ignoring because no longer required): %s "
 722                          CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 723             }
 724             return G_SOURCE_REMOVE;
 725         }
 726     }
 727 
 728     if (rc == pcmk_ok) {
 729         stonith_api_operations_t *cmds = stonith_api->cmds;
 730 
 731         cmds->register_notification(stonith_api,
 732                                     PCMK__VALUE_ST_NOTIFY_DISCONNECT,
 733                                     tengine_stonith_connection_destroy);
 734         cmds->register_notification(stonith_api, PCMK__VALUE_ST_NOTIFY_FENCE,
 735                                     handle_fence_notification);
 736         cmds->register_notification(stonith_api,
 737                                     PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED,
 738                                     tengine_stonith_history_synced);
 739         te_trigger_stonith_history_sync(TRUE);
 740         crm_notice("Fencer successfully connected");
 741     }
 742 
 743     return G_SOURCE_REMOVE;
 744 }
 745 
 746 void
 747 controld_disconnect_fencer(bool destroy)
     /* [previous][next][first][last][top][bottom][index][help] */
 748 {
 749     if (stonith_api) {
 750         // Prevent fencer connection from coming up again
 751         controld_clear_fsa_input_flags(R_ST_REQUIRED);
 752 
 753         if (stonith_api->state != stonith_disconnected) {
 754             stonith_api->cmds->disconnect(stonith_api);
 755         }
 756         stonith_api->cmds->remove_notification(stonith_api, NULL);
 757     }
 758     if (destroy) {
 759         if (stonith_api) {
 760             stonith_api->cmds->free(stonith_api);
 761             stonith_api = NULL;
 762         }
 763         if (controld_fencer_connect_timer) {
 764             mainloop_timer_del(controld_fencer_connect_timer);
 765             controld_fencer_connect_timer = NULL;
 766         }
 767         if (te_client_id) {
 768             free(te_client_id);
 769             te_client_id = NULL;
 770         }
 771     }
 772 }
 773 
 774 static gboolean
 775 do_stonith_history_sync(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 776 {
 777     if (stonith_api && (stonith_api->state != stonith_disconnected)) {
 778         stonith_history_t *history = NULL;
 779 
 780         te_cleanup_stonith_history_sync(stonith_api, FALSE);
 781         stonith_api->cmds->history(stonith_api,
 782                                    st_opt_sync_call | st_opt_broadcast,
 783                                    NULL, &history, 5);
 784         stonith_history_free(history);
 785         return TRUE;
 786     } else {
 787         crm_info("Skip triggering stonith history-sync as stonith is disconnected");
 788         return FALSE;
 789     }
 790 }
 791 
 792 static void
 793 tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 794 {
 795     char *uuid = NULL;
 796     int stonith_id = -1;
 797     int transition_id = -1;
 798     pcmk__graph_action_t *action = NULL;
 799     const char *target = NULL;
 800 
 801     if ((data == NULL) || (data->userdata == NULL)) {
 802         crm_err("Ignoring fence operation %d result: "
 803                 "No transition key given (bug?)",
 804                 ((data == NULL)? -1 : data->call_id));
 805         return;
 806     }
 807 
 808     if (!AM_I_DC) {
 809         const char *reason = stonith__exit_reason(data);
 810 
 811         if (reason == NULL) {
 812            reason = pcmk_exec_status_str(stonith__execution_status(data));
 813         }
 814         crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s",
 815                    data->call_id, stonith__exit_status(data), reason,
 816                    (const char *) data->userdata);
 817         return;
 818     }
 819 
 820     CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id,
 821                                     &stonith_id, NULL),
 822               goto bail);
 823 
 824     if (controld_globals.transition_graph->complete || (stonith_id < 0)
 825         || !pcmk__str_eq(uuid, controld_globals.te_uuid, pcmk__str_none)
 826         || (controld_globals.transition_graph->id != transition_id)) {
 827         crm_info("Ignoring fence operation %d result: "
 828                  "Not from current transition " CRM_XS
 829                  " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)",
 830                  data->call_id,
 831                  pcmk__btoa(controld_globals.transition_graph->complete),
 832                  stonith_id, uuid, controld_globals.te_uuid, transition_id,
 833                  controld_globals.transition_graph->id);
 834         goto bail;
 835     }
 836 
 837     action = controld_get_action(stonith_id);
 838     if (action == NULL) {
 839         crm_err("Ignoring fence operation %d result: "
 840                 "Action %d not found in transition graph (bug?) "
 841                 CRM_XS " uuid=%s transition=%d",
 842                 data->call_id, stonith_id, uuid, transition_id);
 843         goto bail;
 844     }
 845 
 846     target = crm_element_value(action->xml, PCMK__META_ON_NODE);
 847     if (target == NULL) {
 848         crm_err("Ignoring fence operation %d result: No target given (bug?)",
 849                 data->call_id);
 850         goto bail;
 851     }
 852 
 853     stop_te_timer(action);
 854     if (stonith__exit_status(data) == CRM_EX_OK) {
 855         const char *uuid = crm_element_value(action->xml,
 856                                              PCMK__META_ON_NODE_UUID);
 857         const char *op = crm_meta_value(action->params,
 858                                         PCMK__META_STONITH_ACTION);
 859 
 860         crm_info("Fence operation %d for %s succeeded", data->call_id, target);
 861         if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
 862             te_action_confirmed(action, NULL);
 863             if (pcmk__str_eq(PCMK_ACTION_ON, op, pcmk__str_casei)) {
 864                 const char *value = NULL;
 865                 char *now = pcmk__ttoa(time(NULL));
 866                 gboolean is_remote_node = FALSE;
 867 
 868                 /* This check is not 100% reliable, since this node is not
 869                  * guaranteed to have the remote node cached. However, it
 870                  * doesn't have to be reliable, since the attribute manager can
 871                  * learn a node's "remoteness" by other means sooner or later.
 872                  * This allows it to learn more quickly if this node does have
 873                  * the information.
 874                  */
 875                 if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) {
 876                     is_remote_node = TRUE;
 877                 }
 878 
 879                 update_attrd(target, CRM_ATTR_UNFENCED, now, NULL,
 880                              is_remote_node);
 881                 free(now);
 882 
 883                 value = crm_meta_value(action->params, PCMK__META_DIGESTS_ALL);
 884                 update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL,
 885                              is_remote_node);
 886 
 887                 value = crm_meta_value(action->params,
 888                                        PCMK__META_DIGESTS_SECURE);
 889                 update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL,
 890                              is_remote_node);
 891 
 892             } else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) {
 893                 send_stonith_update(action, target, uuid);
 894                 pcmk__set_graph_action_flags(action,
 895                                              pcmk__graph_action_sent_update);
 896             }
 897         }
 898         st_fail_count_reset(target);
 899 
 900     } else {
 901         enum pcmk__graph_next abort_action = pcmk__graph_restart;
 902         int status = stonith__execution_status(data);
 903         const char *reason = stonith__exit_reason(data);
 904 
 905         if (reason == NULL) {
 906             if (status == PCMK_EXEC_DONE) {
 907                 reason = "Agent returned error";
 908             } else {
 909                 reason = pcmk_exec_status_str(status);
 910             }
 911         }
 912         pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
 913 
 914         /* If no fence devices were available, there's no use in immediately
 915          * checking again, so don't start a new transition in that case.
 916          */
 917         if (status == PCMK_EXEC_NO_FENCE_DEVICE) {
 918             crm_warn("Fence operation %d for %s failed: %s "
 919                      "(aborting transition and giving up for now)",
 920                      data->call_id, target, reason);
 921             abort_action = pcmk__graph_wait;
 922         } else {
 923             crm_notice("Fence operation %d for %s failed: %s "
 924                        "(aborting transition)", data->call_id, target, reason);
 925         }
 926 
 927         /* Increment the fail count now, so abort_for_stonith_failure() can
 928          * check it. Non-DC nodes will increment it in
 929          * handle_fence_notification().
 930          */
 931         st_fail_count_increment(target);
 932         abort_for_stonith_failure(abort_action, target, NULL);
 933     }
 934 
 935     pcmk__update_graph(controld_globals.transition_graph, action);
 936     trigger_graph();
 937 
 938   bail:
 939     free(data->userdata);
 940     free(uuid);
 941     return;
 942 }
 943 
 944 static int
 945 fence_with_delay(const char *target, const char *type, int delay)
     /* [previous][next][first][last][top][bottom][index][help] */
 946 {
 947     uint32_t options = st_opt_none; // Group of enum stonith_call_options
 948     int timeout_sec = (int) (controld_globals.transition_graph->stonith_timeout
 949                              / 1000);
 950 
 951     if (crmd_join_phase_count(crm_join_confirmed) == 1) {
 952         stonith__set_call_options(options, target, st_opt_allow_self_fencing);
 953     }
 954     return stonith_api->cmds->fence_with_delay(stonith_api, options, target,
 955                                                type, timeout_sec, 0, delay);
 956 }
 957 
 958 /*!
 959  * \internal
 960  * \brief Execute a fencing action from a transition graph
 961  *
 962  * \param[in] graph   Transition graph being executed (ignored)
 963  * \param[in] action  Fencing action to execute
 964  *
 965  * \return Standard Pacemaker return code
 966  */
 967 int
 968 controld_execute_fence_action(pcmk__graph_t *graph,
     /* [previous][next][first][last][top][bottom][index][help] */
 969                               pcmk__graph_action_t *action)
 970 {
 971     int rc = 0;
 972     const char *id = pcmk__xe_id(action->xml);
 973     const char *uuid = crm_element_value(action->xml, PCMK__META_ON_NODE_UUID);
 974     const char *target = crm_element_value(action->xml, PCMK__META_ON_NODE);
 975     const char *type = crm_meta_value(action->params,
 976                                       PCMK__META_STONITH_ACTION);
 977     char *transition_key = NULL;
 978     const char *priority_delay = NULL;
 979     int delay_i = 0;
 980     gboolean invalid_action = FALSE;
 981     int stonith_timeout = (int) (controld_globals.transition_graph->stonith_timeout
 982                                  / 1000);
 983 
 984     CRM_CHECK(id != NULL, invalid_action = TRUE);
 985     CRM_CHECK(uuid != NULL, invalid_action = TRUE);
 986     CRM_CHECK(type != NULL, invalid_action = TRUE);
 987     CRM_CHECK(target != NULL, invalid_action = TRUE);
 988 
 989     if (invalid_action) {
 990         crm_log_xml_warn(action->xml, "BadAction");
 991         return EPROTO;
 992     }
 993 
 994     priority_delay = crm_meta_value(action->params,
 995                                     PCMK_OPT_PRIORITY_FENCING_DELAY);
 996 
 997     crm_notice("Requesting fencing (%s) targeting node %s "
 998                CRM_XS " action=%s timeout=%i%s%s",
 999                type, target, id, stonith_timeout,
1000                priority_delay ? " priority_delay=" : "",
1001                priority_delay ? priority_delay : "");
1002 
1003     /* Passing NULL means block until we can connect... */
1004     controld_timer_fencer_connect(NULL);
1005 
1006     pcmk__scan_min_int(priority_delay, &delay_i, 0);
1007     rc = fence_with_delay(target, type, delay_i);
1008     transition_key = pcmk__transition_key(controld_globals.transition_graph->id,
1009                                           action->id, 0,
1010                                           controld_globals.te_uuid),
1011     stonith_api->cmds->register_callback(stonith_api, rc,
1012                                          (stonith_timeout
1013                                           + (delay_i > 0 ? delay_i : 0)),
1014                                          st_opt_timeout_updates, transition_key,
1015                                          "tengine_stonith_callback",
1016                                          tengine_stonith_callback);
1017     return pcmk_rc_ok;
1018 }
1019 
1020 bool
1021 controld_verify_stonith_watchdog_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
1022 {
1023     long long st_timeout = (value != NULL)? crm_get_msec(value) : 0;
1024     const char *our_nodename = controld_globals.our_nodename;
1025 
1026     if (st_timeout == 0
1027         || (stonith_api && (stonith_api->state != stonith_disconnected) &&
1028             stonith__watchdog_fencing_enabled_for_node_api(stonith_api,
1029                                                            our_nodename))) {
1030         return pcmk__valid_stonith_watchdog_timeout(value);
1031     }
1032     return true;
1033 }
1034 
1035 /* end stonith API client functions */
1036 
1037 
1038 /*
1039  * stonith history synchronization
1040  *
1041  * Each node's fencer keeps track of a cluster-wide fencing history. When a node
1042  * joins or leaves, we need to synchronize the history across all nodes.
1043  */
1044 
// Mainloop trigger that runs do_stonith_history_sync(); created on first use
static crm_trigger_t *stonith_history_sync_trigger = NULL;

// Timer (5000ms) that sets the trigger above; created on first use
static mainloop_timer_t *stonith_history_sync_timer_short = NULL;

// Timer (30000ms) that sets the trigger above; created on first use
static mainloop_timer_t *stonith_history_sync_timer_long = NULL;
1048 
1049 void
1050 te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
     /* [previous][next][first][last][top][bottom][index][help] */
1051 {
1052     if (free_timers) {
1053         mainloop_timer_del(stonith_history_sync_timer_short);
1054         stonith_history_sync_timer_short = NULL;
1055         mainloop_timer_del(stonith_history_sync_timer_long);
1056         stonith_history_sync_timer_long = NULL;
1057     } else {
1058         mainloop_timer_stop(stonith_history_sync_timer_short);
1059         mainloop_timer_stop(stonith_history_sync_timer_long);
1060     }
1061 
1062     if (st) {
1063         st->cmds->remove_notification(st, PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED);
1064     }
1065 }
1066 
1067 static void
1068 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event)
     /* [previous][next][first][last][top][bottom][index][help] */
1069 {
1070     te_cleanup_stonith_history_sync(st, FALSE);
1071     crm_debug("Fence-history synced - cancel all timers");
1072 }
1073 
1074 static gboolean
1075 stonith_history_sync_set_trigger(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
1076 {
1077     mainloop_set_trigger(stonith_history_sync_trigger);
1078     return FALSE;
1079 }
1080 
1081 void
1082 te_trigger_stonith_history_sync(bool long_timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
1083 {
1084     /* trigger a sync in 5s to give more nodes the
1085      * chance to show up so that we don't create
1086      * unnecessary stonith-history-sync traffic
1087      *
1088      * the long timeout of 30s is there as a fallback
1089      * so that after a successful connection to fenced
1090      * we will wait for 30s for the DC to trigger a
1091      * history-sync
1092      * if this doesn't happen we trigger a sync locally
1093      * (e.g. fenced segfaults and is restarted by pacemakerd)
1094      */
1095 
1096     /* as we are finally checking the stonith-connection
1097      * in do_stonith_history_sync we should be fine
1098      * leaving stonith_history_sync_time & stonith_history_sync_trigger
1099      * around
1100      */
1101     if (stonith_history_sync_trigger == NULL) {
1102         stonith_history_sync_trigger =
1103             mainloop_add_trigger(G_PRIORITY_LOW,
1104                                  do_stonith_history_sync, NULL);
1105     }
1106 
1107     if (long_timeout) {
1108         if(stonith_history_sync_timer_long == NULL) {
1109             stonith_history_sync_timer_long =
1110                 mainloop_timer_add("history_sync_long", 30000,
1111                                    FALSE, stonith_history_sync_set_trigger,
1112                                    NULL);
1113         }
1114         crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
1115         mainloop_timer_start(stonith_history_sync_timer_long);
1116     } else {
1117         if(stonith_history_sync_timer_short == NULL) {
1118             stonith_history_sync_timer_short =
1119                 mainloop_timer_add("history_sync_short", 5000,
1120                                    FALSE, stonith_history_sync_set_trigger,
1121                                    NULL);
1122         }
1123         crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
1124         mainloop_timer_start(stonith_history_sync_timer_short);
1125     }
1126 
1127 }
1128 
1129 /* end stonith history synchronization functions */

/* [previous][next][first][last][top][bottom][index][help] */