root/daemons/controld/controld_fencing.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. update_stonith_max_attempts
  2. set_fence_reaction
  3. controld_configure_fencing
  4. too_many_st_failures
  5. st_fail_count_reset
  6. st_fail_count_increment
  7. cib_fencing_updated
  8. update_node_state_after_fencing
  9. abort_for_stonith_failure
  10. add_stonith_cleanup
  11. remove_stonith_cleanup
  12. purge_stonith_cleanup
  13. execute_stonith_cleanup
  14. fail_incompletable_stonith
  15. tengine_stonith_connection_destroy
  16. handle_fence_notification
  17. controld_timer_fencer_connect
  18. controld_disconnect_fencer
  19. do_stonith_history_sync
  20. tengine_stonith_callback
  21. fence_with_delay
  22. controld_execute_fence_action
  23. controld_verify_stonith_watchdog_timeout
  24. te_cleanup_stonith_history_sync
  25. tengine_stonith_history_synced
  26. stonith_history_sync_set_trigger
  27. te_trigger_stonith_history_sync

   1 /*
   2  * Copyright 2004-2025 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 #include <crm/crm.h>
  12 #include <crm/common/xml.h>
  13 #include <crm/stonith-ng.h>
  14 #include <crm/fencing/internal.h>
  15 
  16 #include <pacemaker-controld.h>
  17 
  18 static void
  19 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event);
  20 
  21 /*
  22  * stonith failure counting
  23  *
  24  * We don't want to get stuck in a permanent fencing loop. Keep track of the
  25  * number of fencing failures for each target node, and the most we'll restart a
  26  * transition for.
  27  */
  28 
   29 struct st_fail_rec {
          // Fencing failures recorded for one target; st_fail_count_increment()
          // bumps it and st_fail_count_reset() sets it back to 0
   30     int count;
   31 };
   32 
      // Initial value of stonith_max_attempts, and the default value passed to
      // pcmk_parse_score() by update_stonith_max_attempts()
   33 #define DEFAULT_STONITH_MAX_ATTEMPTS 10
   34 
      // Set by set_fence_reaction(); when true, handle_fence_notification() calls
      // pcmk__panic() instead of crm_exit() on notification of our own fencing
   35 static bool fence_reaction_panic = false;
      // Threshold that too_many_st_failures() compares each target's count against
   36 static unsigned long int stonith_max_attempts = DEFAULT_STONITH_MAX_ATTEMPTS;
      // Maps a copy of the target name to its struct st_fail_rec; created on
      // first use by st_fail_count_increment(), so it may be NULL
   37 static GHashTable *stonith_failures = NULL;
  38 
  39 /*!
  40  * \internal
  41  * \brief Update max fencing attempts before giving up
  42  *
  43  * \param[in] value  New max fencing attempts
  44  */
  45 static void
  46 update_stonith_max_attempts(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
  47 {
  48     int score = 0;
  49     int rc = pcmk_parse_score(value, &score, DEFAULT_STONITH_MAX_ATTEMPTS);
  50 
  51     // The option validator ensures invalid values shouldn't be possible
  52     CRM_CHECK((rc == pcmk_rc_ok) && (score > 0), return);
  53 
  54     if (stonith_max_attempts != score) {
  55         crm_debug("Maximum fencing attempts per transition is now %d (was %lu)",
  56                   score, stonith_max_attempts);
  57     }
  58     stonith_max_attempts = score;
  59 }
  60 
  61 /*!
  62  * \internal
  63  * \brief Configure reaction to notification of local node being fenced
  64  *
  65  * \param[in] reaction_s  Reaction type
  66  */
  67 static void
  68 set_fence_reaction(const char *reaction_s)
     /* [previous][next][first][last][top][bottom][index][help] */
  69 {
  70     if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
  71         fence_reaction_panic = true;
  72 
  73     } else {
  74         if (!pcmk__str_eq(reaction_s, PCMK_VALUE_STOP, pcmk__str_casei)) {
  75             crm_warn("Invalid value '%s' for %s, using 'stop'",
  76                      reaction_s, PCMK_OPT_FENCE_REACTION);
  77         }
  78         fence_reaction_panic = false;
  79     }
  80 }
  81 
  82 /*!
  83  * \internal
  84  * \brief Configure fencing options based on the CIB
  85  *
  86  * \param[in,out] options  Name/value pairs for configured options
  87  */
  88 void
  89 controld_configure_fencing(GHashTable *options)
     /* [previous][next][first][last][top][bottom][index][help] */
  90 {
  91     const char *value = NULL;
  92 
  93     value = g_hash_table_lookup(options, PCMK_OPT_FENCE_REACTION);
  94     set_fence_reaction(value);
  95 
  96     value = g_hash_table_lookup(options, PCMK_OPT_STONITH_MAX_ATTEMPTS);
  97     update_stonith_max_attempts(value);
  98 }
  99 
 100 static gboolean
 101 too_many_st_failures(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 102 {
 103     GHashTableIter iter;
 104     const char *key = NULL;
 105     struct st_fail_rec *value = NULL;
 106 
 107     if (stonith_failures == NULL) {
 108         return FALSE;
 109     }
 110 
 111     if (target == NULL) {
 112         g_hash_table_iter_init(&iter, stonith_failures);
 113         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 114                (gpointer *) &value)) {
 115 
 116             if (value->count >= stonith_max_attempts) {
 117                 target = (const char*)key;
 118                 goto too_many;
 119             }
 120         }
 121     } else {
 122         value = g_hash_table_lookup(stonith_failures, target);
 123         if ((value != NULL) && (value->count >= stonith_max_attempts)) {
 124             goto too_many;
 125         }
 126     }
 127     return FALSE;
 128 
 129 too_many:
 130     crm_warn("Too many failures (%d) to fence %s, giving up",
 131              value->count, target);
 132     return TRUE;
 133 }
 134 
 135 /*!
 136  * \internal
 137  * \brief Reset a stonith fail count
 138  *
 139  * \param[in] target  Name of node to reset, or NULL for all
 140  */
 141 void
 142 st_fail_count_reset(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 143 {
 144     if (stonith_failures == NULL) {
 145         return;
 146     }
 147 
 148     if (target) {
 149         struct st_fail_rec *rec = NULL;
 150 
 151         rec = g_hash_table_lookup(stonith_failures, target);
 152         if (rec) {
 153             rec->count = 0;
 154         }
 155     } else {
 156         GHashTableIter iter;
 157         const char *key = NULL;
 158         struct st_fail_rec *rec = NULL;
 159 
 160         g_hash_table_iter_init(&iter, stonith_failures);
 161         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 162                                       (gpointer *) &rec)) {
 163             rec->count = 0;
 164         }
 165     }
 166 }
 167 
 168 static void
 169 st_fail_count_increment(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 170 {
 171     struct st_fail_rec *rec = NULL;
 172 
 173     if (stonith_failures == NULL) {
 174         stonith_failures = pcmk__strkey_table(free, free);
 175     }
 176 
 177     rec = g_hash_table_lookup(stonith_failures, target);
 178     if (rec) {
 179         rec->count++;
 180     } else {
 181         rec = malloc(sizeof(struct st_fail_rec));
 182         if(rec == NULL) {
 183             return;
 184         }
 185 
 186         rec->count = 1;
 187         g_hash_table_insert(stonith_failures, pcmk__str_copy(target), rec);
 188     }
 189 }
 190 
 191 /* end stonith fail count functions */
 192 
 193 
 194 static void
 195 cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
     /* [previous][next][first][last][top][bottom][index][help] */
 196                     void *user_data)
 197 {
 198     if (rc < pcmk_ok) {
 199         crm_err("Fencing update %d for %s: failed - %s (%d)",
 200                 call_id, (char *)user_data, pcmk_strerror(rc), rc);
 201         crm_log_xml_warn(msg, "Failed update");
 202         abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_shutdown,
 203                          "CIB update failed", NULL);
 204 
 205     } else {
 206         crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
 207     }
 208 }
 209 
  210 /*!
  211  * \internal
  212  * \brief Update a fencing target's node state
  213  *
  214  * \param[in] target         Node that was successfully fenced
  215  * \param[in] target_xml_id  CIB XML ID of target
       *
       * \note Does nothing (beyond the CRM_CHECK failure handling) if either
       *       argument is NULL or the node cannot be obtained from the cache.
  216  */
  217 static void
  218 update_node_state_after_fencing(const char *target, const char *target_xml_id)
     /* [previous][next][first][last][top][bottom][index][help] */
  219 {
  220     int rc = pcmk_ok;
  221     pcmk__node_status_t *peer = NULL;
  222     xmlNode *node_state = NULL;
  223 
  224     /* We (usually) rely on the membership layer to do
  225      * controld_node_update_cluster, and the peer status callback to do
  226      * controld_node_update_peer, because the node might have already rejoined
  227      * before we get the stonith result here.
  228      */
  229     uint32_t flags = controld_node_update_join|controld_node_update_expected;
  230 
  231     CRM_CHECK((target != NULL) && (target_xml_id != NULL), return);
  232 
  233     // Ensure target is cached
  234     peer = pcmk__get_node(0, target, target_xml_id, pcmk__node_search_any);
  235     CRM_CHECK(peer != NULL, return);
  236 
  237     if (peer->state == NULL) {
  238         /* Usually, we rely on the membership layer to update the cluster state
  239          * in the CIB. However, if the node has never been seen, do it here, so
  240          * the node is not considered unclean.
  241          */
  242         flags |= controld_node_update_cluster;
  243     }
  244 
  245     if (peer->xml_id == NULL) {
  246         crm_info("Recording XML ID '%s' for node '%s'", target_xml_id, target);
  247         peer->xml_id = pcmk__str_copy(target_xml_id);
  248     }
  249 
          // Called before the node state XML is built from the peer below
  250     crmd_peer_down(peer, TRUE);
  251 
  252     node_state = create_node_state_update(peer, flags, NULL, __func__);
          // The ID is always set from the caller-supplied XML ID
  253     crm_xml_add(node_state, PCMK_XA_ID, target_xml_id);
  254 
          // For nodes flagged as remote, also record the current time as the
          // fenced timestamp
  255     if (pcmk_is_set(peer->flags, pcmk__node_status_remote)) {
  256         char *now_s = pcmk__ttoa(time(NULL));
  257 
  258         crm_xml_add(node_state, PCMK__XA_NODE_FENCED, now_s);
  259         free(now_s);
  260     }
  261 
  262     rc = controld_globals.cib_conn->cmds->modify(controld_globals.cib_conn,
  263                                                  PCMK_XE_STATUS, node_state,
  264                                                  cib_can_create);
  265     pcmk__xml_free(node_state);
  266 
          /* The value returned by modify() is logged as the call number and used
           * as the call ID for the callback, whether or not it indicates an
           * error. The callback's user data is a fresh copy of the target name.
           * NOTE(review): presumably the callback machinery frees that copy;
           * confirm.
           */
  267     crm_debug("Updating node state for %s after fencing (call %d)", target, rc);
  268     fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated);
  269 
  270     controld_delete_node_state(peer->name, controld_section_all, cib_none);
  271 }
 272 
 273 /*!
 274  * \internal
 275  * \brief Abort transition due to stonith failure
 276  *
 277  * \param[in] abort_action  Whether to restart or stop transition
 278  * \param[in] target  Don't restart if this (NULL for any) has too many failures
 279  * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
 280  */
 281 static void
 282 abort_for_stonith_failure(enum pcmk__graph_next abort_action,
     /* [previous][next][first][last][top][bottom][index][help] */
 283                           const char *target, const xmlNode *reason)
 284 {
 285     /* If stonith repeatedly fails, we eventually give up on starting a new
 286      * transition for that reason.
 287      */
 288     if ((abort_action != pcmk__graph_wait) && too_many_st_failures(target)) {
 289         abort_action = pcmk__graph_wait;
 290     }
 291     abort_transition(PCMK_SCORE_INFINITY, abort_action, "Stonith failed",
 292                      reason);
 293 }
 294 
 295 
 296 /*
 297  * stonith cleanup list
 298  *
 299  * If the DC is shot, proper notifications might not go out.
 300  * The stonith cleanup list allows the cluster to (re-)send
 301  * notifications once a new DC is elected.
 302  */
 303 
  304 static GList *stonith_cleanup_list = NULL; // Each element's data is a copied node name that this file frees on removal
 305 
 306 /*!
 307  * \internal
 308  * \brief Add a node to the stonith cleanup list
 309  *
 310  * \param[in] target  Name of node to add
 311  */
 312 void
 313 add_stonith_cleanup(const char *target) {
     /* [previous][next][first][last][top][bottom][index][help] */
 314     stonith_cleanup_list = g_list_append(stonith_cleanup_list,
 315                                          pcmk__str_copy(target));
 316 }
 317 
 318 /*!
 319  * \internal
 320  * \brief Remove a node from the stonith cleanup list
 321  *
 322  * \param[in] Name of node to remove
 323  */
 324 void
 325 remove_stonith_cleanup(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 326 {
 327     GList *iter = stonith_cleanup_list;
 328 
 329     while (iter != NULL) {
 330         GList *tmp = iter;
 331         char *iter_name = tmp->data;
 332 
 333         iter = iter->next;
 334         if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
 335             crm_trace("Removing %s from the cleanup list", iter_name);
 336             stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
 337             free(iter_name);
 338         }
 339     }
 340 }
 341 
 342 /*!
 343  * \internal
 344  * \brief Purge all entries from the stonith cleanup list
 345  */
 346 void
 347 purge_stonith_cleanup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 348 {
 349     if (stonith_cleanup_list) {
 350         GList *iter = NULL;
 351 
 352         for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 353             char *target = iter->data;
 354 
 355             crm_info("Purging %s from stonith cleanup list", target);
 356             free(target);
 357         }
 358         g_list_free(stonith_cleanup_list);
 359         stonith_cleanup_list = NULL;
 360     }
 361 }
 362 
 363 /*!
 364  * \internal
 365  * \brief Send stonith updates for all entries in cleanup list, then purge it
 366  */
 367 void
 368 execute_stonith_cleanup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 369 {
 370     GList *iter;
 371 
 372     for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 373         char *target = iter->data;
 374         pcmk__node_status_t *target_node =
 375             pcmk__get_node(0, target, NULL, pcmk__node_search_cluster_member);
 376         const char *uuid = pcmk__cluster_get_xml_id(target_node);
 377 
 378         crm_notice("Marking %s, target of a previous stonith action, as clean", target);
 379         update_node_state_after_fencing(target, uuid);
 380         free(target);
 381     }
 382     g_list_free(stonith_cleanup_list);
 383     stonith_cleanup_list = NULL;
 384 }
 385 
 386 /* end stonith cleanup list functions */
 387 
 388 
 389 /* stonith API client
 390  *
 391  * Functions that need to interact directly with the fencer via its API
 392  */
 393 
  394 static stonith_t *stonith_api = NULL; // Created on demand by controld_timer_fencer_connect(), freed by controld_disconnect_fencer(true)
  395 static mainloop_timer_t *controld_fencer_connect_timer = NULL; // Reconnect timer; created the first time the non-blocking connect path runs
  396 static char *te_client_id = NULL; // "<crm_system_name>.<pid>", built on first fence notification
 397 
 398 static gboolean
 399 fail_incompletable_stonith(pcmk__graph_t *graph)
     /* [previous][next][first][last][top][bottom][index][help] */
 400 {
 401     GList *lpc = NULL;
 402     const char *task = NULL;
 403     xmlNode *last_action = NULL;
 404 
 405     if (graph == NULL) {
 406         return FALSE;
 407     }
 408 
 409     for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
 410         GList *lpc2 = NULL;
 411         pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) lpc->data;
 412 
 413         if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) {
 414             continue;
 415         }
 416 
 417         for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
 418             pcmk__graph_action_t *action = (pcmk__graph_action_t *) lpc2->data;
 419 
 420             if ((action->type != pcmk__cluster_graph_action)
 421                 || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
 422                 continue;
 423             }
 424 
 425             task = crm_element_value(action->xml, PCMK_XA_OPERATION);
 426             if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) {
 427                 pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
 428                 last_action = action->xml;
 429                 pcmk__update_graph(graph, action);
 430                 crm_notice("Failing action %d (%s): fencer terminated",
 431                            action->id, pcmk__xe_id(action->xml));
 432             }
 433         }
 434     }
 435 
 436     if (last_action != NULL) {
 437         crm_warn("Fencer failure resulted in unrunnable actions");
 438         abort_for_stonith_failure(pcmk__graph_restart, NULL, last_action);
 439         return TRUE;
 440     }
 441 
 442     return FALSE;
 443 }
 444 
  445 static void
  446 tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
  447 {
          // Registered for PCMK__VALUE_ST_NOTIFY_DISCONNECT notifications by
          // controld_timer_fencer_connect(); the event argument is not used here
  448     te_cleanup_stonith_history_sync(st, FALSE);
  449 
          // Only start the reconnect timer while the fencer is still flagged as
          // required, and only if the timer is not already running
  450     if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) {
  451         crm_err("Lost fencer connection (will attempt to reconnect)");
  452         if (!mainloop_timer_running(controld_fencer_connect_timer)) {
  453             mainloop_timer_start(controld_fencer_connect_timer);
  454         }
  455     } else {
  456         crm_info("Disconnected from fencer");
  457     }
  458 
  459     if (stonith_api) {
  460         /* the client API won't properly reconnect notifications
  461          * if they are still in the table - so remove them
  462          */
              // NOTE(review): the state check uses stonith_api but disconnect()
              // is passed st; presumably both are the same connection -- confirm
  463         if (stonith_api->state != stonith_disconnected) {
  464             stonith_api->cmds->disconnect(st);
  465         }
  466         stonith_api->cmds->remove_notification(stonith_api, NULL);
  467     }
  468 
          // On the DC, fail any fencing actions still pending in the current
          // transition graph, then trigger graph processing
  469     if (AM_I_DC) {
  470         fail_incompletable_stonith(controld_globals.transition_graph);
  471         trigger_graph();
  472     }
  473 }
 474 
  475 /*!
  476  * \internal
  477  * \brief Handle an event notification from the fencing API
  478  *
  479  * \param[in] st     Fencing API connection (ignored)
  480  * \param[in] event  Fencing API event notification
       *
       * \note If the event reports successful fencing of the local node, this
       *       function panics or exits the process instead of returning
       *       normally.
  481  */
  482 static void
  483 handle_fence_notification(stonith_t *st, stonith_event_t *event)
     /* [previous][next][first][last][top][bottom][index][help] */
  484 {
  485     bool succeeded = true;
  486     const char *executioner = "the cluster";
  487     const char *client = "a client";
  488     const char *reason = NULL;
  489     int exec_status;
  490 
          // Built once as "<crm_system_name>.<pid>"; compared further down with
          // the event's client origin to recognize cluster-initiated fencing
  491     if (te_client_id == NULL) {
  492         te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
  493                                          (unsigned long) getpid());
  494     }
  495 
  496     if (event == NULL) {
  497         crm_err("Notify data not found");
  498         return;
  499     }
  500 
          // Keep the generic placeholders when the event leaves these unset
  501     if (event->executioner != NULL) {
  502         executioner = event->executioner;
  503     }
  504     if (event->client_origin != NULL) {
  505         client = event->client_origin;
  506     }
  507 
          /* The operation counts as successful only if its exit status is
           * CRM_EX_OK and its execution status is PCMK_EXEC_DONE. A "done"
           * execution with a bad exit status is relabeled PCMK_EXEC_ERROR, which
           * affects only the status string used in the log messages below.
           */
  508     exec_status = stonith__event_execution_status(event);
  509     if ((stonith__event_exit_status(event) != CRM_EX_OK)
  510         || (exec_status != PCMK_EXEC_DONE)) {
  511         succeeded = false;
  512         if (exec_status == PCMK_EXEC_DONE) {
  513             exec_status = PCMK_EXEC_ERROR;
  514         }
  515     }
  516     reason = stonith__event_exit_reason(event);
  517 
  518     crmd_alert_fencing_op(event);
  519 
  520     if (pcmk__str_eq(PCMK_ACTION_ON, event->action, pcmk__str_none)) {
  521         // Unfencing doesn't need special handling, just a log message
  522         if (succeeded) {
  523             crm_notice("%s was unfenced by %s at the request of %s@%s",
  524                        event->target, executioner, client, event->origin);
  525         } else {
  526             crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d",
  527                     event->target, executioner,
  528                     pcmk_exec_status_str(exec_status),
  529                     ((reason == NULL)? "" : ": "),
  530                     ((reason == NULL)? "" : reason),
  531                     stonith__event_exit_status(event));
  532         }
  533         return;
  534     }
  535 
  536     if (succeeded && controld_is_local_node(event->target)) {
  537         /* We were notified of our own fencing. Most likely, either fencing was
  538          * misconfigured, or fabric fencing that doesn't cut cluster
  539          * communication is in use.
  540          *
  541          * Either way, shutting down the local host is a good idea, to require
  542          * administrator intervention. Also, other nodes would otherwise likely
  543          * set our status to lost because of the fencing callback and discard
  544          * our subsequent election votes as "not part of our cluster".
  545          */
  546         crm_crit("We were allegedly just fenced by %s for %s!",
  547                  executioner, event->origin); // Dumps blackbox if enabled
  548         if (fence_reaction_panic) {
  549             pcmk__panic("Notified of own fencing");
  550         } else {
  551             crm_exit(CRM_EX_FATAL);
  552         }
  553         return; // Should never get here
  554     }
  555 
  556     /* Update the count of fencing failures for this target, in case we become
  557      * DC later. The current DC has already updated its fail count in
  558      * tengine_stonith_callback().
  559      */
  560     if (!AM_I_DC) {
  561         if (succeeded) {
  562             st_fail_count_reset(event->target);
  563         } else {
  564             st_fail_count_increment(event->target);
  565         }
  566     }
  567 
  568     crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: "
  569                "%s%s%s%s " QB_XS " event=%s",
  570                event->target, (succeeded? "" : " not"),
  571                event->action, executioner, client, event->origin,
  572                (succeeded? "OK" : pcmk_exec_status_str(exec_status)),
  573                ((reason == NULL)? "" : " ("),
  574                ((reason == NULL)? "" : reason),
  575                ((reason == NULL)? "" : ")"),
  576                event->id);
  577 
          // Failed fencing needs nothing beyond the fail count and log message
  578     if (succeeded) {
  579         const uint32_t flags = pcmk__node_search_any
  580                                |pcmk__node_search_cluster_cib;
  581 
  582         pcmk__node_status_t *peer = pcmk__search_node_caches(0, event->target,
  583                                                              NULL, flags);
  584         const char *uuid = NULL;
  585 
              // Without a cached node entry for the target, there is no state to
              // update
  586         if (peer == NULL) {
  587             return;
  588         }
  589 
  590         uuid = pcmk__cluster_get_xml_id(peer);
  591 
  592         if (AM_I_DC) {
  593             /* The DC always sends updates */
  594             update_node_state_after_fencing(event->target, uuid);
  595 
  596             /* @TODO Ideally, at this point, we'd check whether the fenced node
  597              * hosted any guest nodes, and call remote_node_down() for them.
  598              * Unfortunately, the controller doesn't have a simple, reliable way
  599              * to map hosts to guests. It might be possible to track this in the
  600              * peer cache via refresh_remote_nodes(). For now, we rely on the
  601              * scheduler creating fence pseudo-events for the guests.
  602              */
  603 
  604             if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) {
  605                 /* Abort the current transition if it wasn't the cluster that
  606                  * initiated fencing.
  607                  */
  608                 crm_info("External fencing operation from %s fenced %s",
  609                          client, event->target);
  610                 abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
  611                                  "External Fencing Operation", NULL);
  612             }
  613 
  614         } else if (pcmk__str_eq(controld_globals.dc_name, event->target,
  615                                 pcmk__str_null_matches|pcmk__str_casei)
  616                    && !pcmk_is_set(peer->flags, pcmk__node_status_remote)) {
  617             // Assume the target was our DC if we don't currently have one
  618 
  619             if (controld_globals.dc_name != NULL) {
  620                 crm_notice("Fencing target %s was our DC", event->target);
  621             } else {
  622                 crm_notice("Fencing target %s may have been our DC",
  623                            event->target);
  624             }
  625 
  626             /* Given the CIB resyncing that occurs around elections,
  627              * have one node update the CIB now and, if the new DC is different,
  628              * have them do so too after the election
  629              */
  630             if (controld_is_local_node(event->executioner)) {
  631                 update_node_state_after_fencing(event->target, uuid);
  632             }
                  // Queue the target so execute_stonith_cleanup() can update its
                  // node state again later
  633             add_stonith_cleanup(event->target);
  634         }
  635 
  636         /* If the target is a remote node, and we host its connection,
  637          * immediately fail all monitors so it can be recovered quickly.
  638          * The connection won't necessarily drop when a remote node is fenced,
  639          * so the failure might not otherwise be detected until the next poke.
  640          */
  641         if (pcmk_is_set(peer->flags, pcmk__node_status_remote)) {
  642             remote_ra_fail(event->target);
  643         }
  644 
              // Done on every successful path that reaches here, including ones
              // where update_node_state_after_fencing() already did it
  645         crmd_peer_down(peer, TRUE);
  646      }
  647 }
 648 
  649 /*!
  650  * \brief Connect to fencer
  651  *
  652  * \param[in] user_data  If NULL, retry failures now, otherwise retry in mainloop timer
  653  *
  654  * \return G_SOURCE_REMOVE on success, G_SOURCE_CONTINUE to retry
  655  * \note If user_data is NULL, this will wait 2s between attempts, for up to
  656  *       30 attempts, meaning the controller could be blocked as long as 58s.
       * \note G_SOURCE_REMOVE is also returned when the API object cannot be
       *       allocated, when the blocking attempts are exhausted, and when a
       *       non-blocking attempt fails while the fencer is no longer required.
  657  */
  658 gboolean
  659 controld_timer_fencer_connect(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  660 {
  661     int rc = pcmk_ok;
  662 
          // The API object is created on first use and kept for later calls
  663     if (stonith_api == NULL) {
  664         stonith_api = stonith__api_new();
  665         if (stonith_api == NULL) {
  666             crm_err("Could not connect to fencer: API memory allocation failed");
  667             return G_SOURCE_REMOVE;
  668         }
  669     }
  670 
  671     if (stonith_api->state != stonith_disconnected) {
  672         crm_trace("Already connected to fencer, no need to retry");
  673         return G_SOURCE_REMOVE;
  674     }
  675 
  676     if (user_data == NULL) {
  677         // Blocking (retry failures now until successful)
              /* This branch checks rc against pcmk_rc_ok and describes it with
               * pcmk_rc_str(), while the other branch uses pcmk_ok and
               * pcmk_strerror(). The shared success check at the end compares
               * against pcmk_ok.
               * NOTE(review): that presumably relies on both success constants
               * having the same value -- confirm.
               */
  678         rc = stonith__api_connect_retry(stonith_api, crm_system_name, 30);
  679         if (rc != pcmk_rc_ok) {
  680             crm_err("Could not connect to fencer in 30 attempts: %s "
  681                     QB_XS " rc=%d", pcmk_rc_str(rc), rc);
  682         }
  683     } else {
  684         // Non-blocking (retry failures later in main loop)
  685         rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
  686 
              // The retry timer is created once; its callback is this function
              // with non-NULL user data, so timer-driven retries also take the
              // non-blocking path
  687         if (controld_fencer_connect_timer == NULL) {
  688             controld_fencer_connect_timer =
  689                 mainloop_timer_add("controld_fencer_connect", 1000,
  690                                    TRUE, controld_timer_fencer_connect,
  691                                    GINT_TO_POINTER(TRUE));
  692         }
  693 
  694         if (rc != pcmk_ok) {
  695             if (pcmk_is_set(controld_globals.fsa_input_register,
  696                             R_ST_REQUIRED)) {
  697                 crm_notice("Fencer connection failed (will retry): %s "
  698                            QB_XS " rc=%d", pcmk_strerror(rc), rc);
  699 
  700                 if (!mainloop_timer_running(controld_fencer_connect_timer)) {
  701                     mainloop_timer_start(controld_fencer_connect_timer);
  702                 }
  703 
  704                 return G_SOURCE_CONTINUE;
  705             } else {
  706                 crm_info("Fencer connection failed (ignoring because no longer required): %s "
  707                          QB_XS " rc=%d", pcmk_strerror(rc), rc);
  708             }
  709             return G_SOURCE_REMOVE;
  710         }
  711     }
  712 
          // On success, register the notification handlers and request a fencing
          // history sync
  713     if (rc == pcmk_ok) {
  714         stonith_api_operations_t *cmds = stonith_api->cmds;
  715 
  716         cmds->register_notification(stonith_api,
  717                                     PCMK__VALUE_ST_NOTIFY_DISCONNECT,
  718                                     tengine_stonith_connection_destroy);
  719         cmds->register_notification(stonith_api, PCMK__VALUE_ST_NOTIFY_FENCE,
  720                                     handle_fence_notification);
  721         cmds->register_notification(stonith_api,
  722                                     PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED,
  723                                     tengine_stonith_history_synced);
  724         te_trigger_stonith_history_sync(TRUE);
  725         crm_notice("Fencer successfully connected");
  726     }
  727 
  728     return G_SOURCE_REMOVE;
  729 }
 730 
 731 void
 732 controld_disconnect_fencer(bool destroy)
     /* [previous][next][first][last][top][bottom][index][help] */
 733 {
 734     if (stonith_api) {
 735         // Prevent fencer connection from coming up again
 736         controld_clear_fsa_input_flags(R_ST_REQUIRED);
 737 
 738         if (stonith_api->state != stonith_disconnected) {
 739             stonith_api->cmds->disconnect(stonith_api);
 740         }
 741         stonith_api->cmds->remove_notification(stonith_api, NULL);
 742     }
 743     if (destroy) {
 744         if (stonith_api) {
 745             stonith_api->cmds->free(stonith_api);
 746             stonith_api = NULL;
 747         }
 748         if (controld_fencer_connect_timer) {
 749             mainloop_timer_del(controld_fencer_connect_timer);
 750             controld_fencer_connect_timer = NULL;
 751         }
 752         if (te_client_id) {
 753             free(te_client_id);
 754             te_client_id = NULL;
 755         }
 756     }
 757 }
 758 
 759 static gboolean
 760 do_stonith_history_sync(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 761 {
 762     if (stonith_api && (stonith_api->state != stonith_disconnected)) {
 763         stonith_history_t *history = NULL;
 764 
 765         te_cleanup_stonith_history_sync(stonith_api, FALSE);
 766         stonith_api->cmds->history(stonith_api,
 767                                    st_opt_sync_call | st_opt_broadcast,
 768                                    NULL, &history, 5);
 769         stonith__history_free(history);
 770         return TRUE;
 771     } else {
 772         crm_info("Skip triggering stonith history-sync as stonith is disconnected");
 773         return FALSE;
 774     }
 775 }
 776 
 777 static void
 778 tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 779 {
 780     char *uuid = NULL;
 781     int stonith_id = -1;
 782     int transition_id = -1;
 783     pcmk__graph_action_t *action = NULL;
 784     const char *target = NULL;
 785 
 786     if ((data == NULL) || (data->userdata == NULL)) {
 787         crm_err("Ignoring fence operation %d result: "
 788                 "No transition key given (bug?)",
 789                 ((data == NULL)? -1 : data->call_id));
 790         return;
 791     }
 792 
 793     if (!AM_I_DC) {
 794         const char *reason = stonith__exit_reason(data);
 795 
 796         if (reason == NULL) {
 797            reason = pcmk_exec_status_str(stonith__execution_status(data));
 798         }
 799         crm_notice("Result of fence operation %d: %d (%s) " QB_XS " key=%s",
 800                    data->call_id, stonith__exit_status(data), reason,
 801                    (const char *) data->userdata);
 802         return;
 803     }
 804 
 805     CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id,
 806                                     &stonith_id, NULL),
 807               goto bail);
 808 
 809     if (controld_globals.transition_graph->complete || (stonith_id < 0)
 810         || !pcmk__str_eq(uuid, controld_globals.te_uuid, pcmk__str_none)
 811         || (controld_globals.transition_graph->id != transition_id)) {
 812         crm_info("Ignoring fence operation %d result: "
 813                  "Not from current transition " QB_XS
 814                  " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)",
 815                  data->call_id,
 816                  pcmk__btoa(controld_globals.transition_graph->complete),
 817                  stonith_id, uuid, controld_globals.te_uuid, transition_id,
 818                  controld_globals.transition_graph->id);
 819         goto bail;
 820     }
 821 
 822     action = controld_get_action(stonith_id);
 823     if (action == NULL) {
 824         crm_err("Ignoring fence operation %d result: "
 825                 "Action %d not found in transition graph (bug?) "
 826                 QB_XS " uuid=%s transition=%d",
 827                 data->call_id, stonith_id, uuid, transition_id);
 828         goto bail;
 829     }
 830 
 831     target = crm_element_value(action->xml, PCMK__META_ON_NODE);
 832     if (target == NULL) {
 833         crm_err("Ignoring fence operation %d result: No target given (bug?)",
 834                 data->call_id);
 835         goto bail;
 836     }
 837 
 838     stop_te_timer(action);
 839     if (stonith__exit_status(data) == CRM_EX_OK) {
 840         const char *uuid = crm_element_value(action->xml,
 841                                              PCMK__META_ON_NODE_UUID);
 842         const char *op = crm_meta_value(action->params,
 843                                         PCMK__META_STONITH_ACTION);
 844 
 845         crm_info("Fence operation %d for %s succeeded", data->call_id, target);
 846         if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
 847             te_action_confirmed(action, NULL);
 848             if (pcmk__str_eq(PCMK_ACTION_ON, op, pcmk__str_casei)) {
 849                 const char *value = NULL;
 850                 char *now = pcmk__ttoa(time(NULL));
 851                 gboolean is_remote_node = FALSE;
 852 
 853                 /* This check is not 100% reliable, since this node is not
 854                  * guaranteed to have the remote node cached. However, it
 855                  * doesn't have to be reliable, since the attribute manager can
 856                  * learn a node's "remoteness" by other means sooner or later.
 857                  * This allows it to learn more quickly if this node does have
 858                  * the information.
 859                  */
 860                 if (g_hash_table_lookup(pcmk__remote_peer_cache,
 861                                         uuid) != NULL) {
 862                     is_remote_node = TRUE;
 863                 }
 864 
 865                 update_attrd(target, CRM_ATTR_UNFENCED, now, NULL,
 866                              is_remote_node);
 867                 free(now);
 868 
 869                 value = crm_meta_value(action->params, PCMK__META_DIGESTS_ALL);
 870                 update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL,
 871                              is_remote_node);
 872 
 873                 value = crm_meta_value(action->params,
 874                                        PCMK__META_DIGESTS_SECURE);
 875                 update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL,
 876                              is_remote_node);
 877 
 878             } else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) {
 879                 update_node_state_after_fencing(target, uuid);
 880                 pcmk__set_graph_action_flags(action,
 881                                              pcmk__graph_action_sent_update);
 882             }
 883         }
 884         st_fail_count_reset(target);
 885 
 886     } else {
 887         enum pcmk__graph_next abort_action = pcmk__graph_restart;
 888         int status = stonith__execution_status(data);
 889         const char *reason = stonith__exit_reason(data);
 890 
 891         if (reason == NULL) {
 892             if (status == PCMK_EXEC_DONE) {
 893                 reason = "Agent returned error";
 894             } else {
 895                 reason = pcmk_exec_status_str(status);
 896             }
 897         }
 898         pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
 899 
 900         /* If no fence devices were available, there's no use in immediately
 901          * checking again, so don't start a new transition in that case.
 902          */
 903         if (status == PCMK_EXEC_NO_FENCE_DEVICE) {
 904             crm_warn("Fence operation %d for %s failed: %s "
 905                      "(aborting transition and giving up for now)",
 906                      data->call_id, target, reason);
 907             abort_action = pcmk__graph_wait;
 908         } else {
 909             crm_notice("Fence operation %d for %s failed: %s "
 910                        "(aborting transition)", data->call_id, target, reason);
 911         }
 912 
 913         /* Increment the fail count now, so abort_for_stonith_failure() can
 914          * check it. Non-DC nodes will increment it in
 915          * handle_fence_notification().
 916          */
 917         st_fail_count_increment(target);
 918         abort_for_stonith_failure(abort_action, target, NULL);
 919     }
 920 
 921     pcmk__update_graph(controld_globals.transition_graph, action);
 922     trigger_graph();
 923 
 924   bail:
 925     free(data->userdata);
 926     free(uuid);
 927     return;
 928 }
 929 
 930 static int
 931 fence_with_delay(const char *target, const char *type, int delay)
     /* [previous][next][first][last][top][bottom][index][help] */
 932 {
 933     uint32_t options = st_opt_none; // Group of enum stonith_call_options
 934     int timeout_sec = pcmk__timeout_ms2s(controld_globals.transition_graph->stonith_timeout);
 935 
 936     if (crmd_join_phase_count(controld_join_confirmed) == 1) {
 937         stonith__set_call_options(options, target, st_opt_allow_self_fencing);
 938     }
 939     return stonith_api->cmds->fence_with_delay(stonith_api, options, target,
 940                                                type, timeout_sec, 0, delay);
 941 }
 942 
 943 /*!
 944  * \internal
 945  * \brief Execute a fencing action from a transition graph
 946  *
 947  * \param[in] graph   Transition graph being executed (ignored)
 948  * \param[in] action  Fencing action to execute
 949  *
 950  * \return Standard Pacemaker return code
 951  */
 952 int
 953 controld_execute_fence_action(pcmk__graph_t *graph,
     /* [previous][next][first][last][top][bottom][index][help] */
 954                               pcmk__graph_action_t *action)
 955 {
 956     int rc = 0;
 957     const char *id = pcmk__xe_id(action->xml);
 958     const char *uuid = crm_element_value(action->xml, PCMK__META_ON_NODE_UUID);
 959     const char *target = crm_element_value(action->xml, PCMK__META_ON_NODE);
 960     const char *type = crm_meta_value(action->params,
 961                                       PCMK__META_STONITH_ACTION);
 962     char *transition_key = NULL;
 963     const char *priority_delay = NULL;
 964     int delay_i = 0;
 965     gboolean invalid_action = FALSE;
 966     int stonith_timeout = pcmk__timeout_ms2s(controld_globals.transition_graph->stonith_timeout);
 967 
 968     CRM_CHECK(id != NULL, invalid_action = TRUE);
 969     CRM_CHECK(uuid != NULL, invalid_action = TRUE);
 970     CRM_CHECK(type != NULL, invalid_action = TRUE);
 971     CRM_CHECK(target != NULL, invalid_action = TRUE);
 972 
 973     if (invalid_action) {
 974         crm_log_xml_warn(action->xml, "BadAction");
 975         return EPROTO;
 976     }
 977 
 978     priority_delay = crm_meta_value(action->params,
 979                                     PCMK_OPT_PRIORITY_FENCING_DELAY);
 980 
 981     crm_notice("Requesting fencing (%s) targeting node %s "
 982                QB_XS " action=%s timeout=%i%s%s",
 983                type, target, id, stonith_timeout,
 984                priority_delay ? " priority_delay=" : "",
 985                priority_delay ? priority_delay : "");
 986 
 987     /* Passing NULL means block until we can connect... */
 988     controld_timer_fencer_connect(NULL);
 989 
 990     pcmk__scan_min_int(priority_delay, &delay_i, 0);
 991     rc = fence_with_delay(target, type, delay_i);
 992     transition_key = pcmk__transition_key(controld_globals.transition_graph->id,
 993                                           action->id, 0,
 994                                           controld_globals.te_uuid),
 995     stonith_api->cmds->register_callback(stonith_api, rc,
 996                                          (stonith_timeout
 997                                           + (delay_i > 0 ? delay_i : 0)),
 998                                          st_opt_timeout_updates, transition_key,
 999                                          "tengine_stonith_callback",
1000                                          tengine_stonith_callback);
1001     return pcmk_rc_ok;
1002 }
1003 
1004 bool
1005 controld_verify_stonith_watchdog_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
1006 {
1007     long long st_timeout = (value != NULL)? crm_get_msec(value) : 0;
1008     const char *our_nodename = controld_globals.cluster->priv->node_name;
1009 
1010     if (st_timeout == 0
1011         || (stonith_api && (stonith_api->state != stonith_disconnected) &&
1012             stonith__watchdog_fencing_enabled_for_node_api(stonith_api,
1013                                                            our_nodename))) {
1014         return pcmk__valid_stonith_watchdog_timeout(value);
1015     }
1016     return true;
1017 }
1018 
1019 /* end stonith API client functions */
1020 
1021 
1022 /*
1023  * stonith history synchronization
1024  *
1025  * Each node's fencer keeps track of a cluster-wide fencing history. When a node
1026  * joins or leaves, we need to synchronize the history across all nodes.
1027  */
1028 
1029 static crm_trigger_t *stonith_history_sync_trigger = NULL; // Runs do_stonith_history_sync(); created on first te_trigger_stonith_history_sync() call
1030 static mainloop_timer_t *stonith_history_sync_timer_short = NULL; // "history_sync_short" timer (5000ms); sets the trigger above
1031 static mainloop_timer_t *stonith_history_sync_timer_long = NULL; // "history_sync_long" timer (30000ms); sets the trigger above
1032 
1033 void
1034 te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
     /* [previous][next][first][last][top][bottom][index][help] */
1035 {
1036     if (free_timers) {
1037         mainloop_timer_del(stonith_history_sync_timer_short);
1038         stonith_history_sync_timer_short = NULL;
1039         mainloop_timer_del(stonith_history_sync_timer_long);
1040         stonith_history_sync_timer_long = NULL;
1041     } else {
1042         mainloop_timer_stop(stonith_history_sync_timer_short);
1043         mainloop_timer_stop(stonith_history_sync_timer_long);
1044     }
1045 
1046     if (st) {
1047         st->cmds->remove_notification(st, PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED);
1048     }
1049 }
1050 
1051 static void
1052 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event)
     /* [previous][next][first][last][top][bottom][index][help] */
1053 {
1054     te_cleanup_stonith_history_sync(st, FALSE);
1055     crm_debug("Fence-history synced - cancel all timers");
1056 }
1057 
1058 static gboolean
1059 stonith_history_sync_set_trigger(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
1060 {
1061     mainloop_set_trigger(stonith_history_sync_trigger);
1062     return FALSE;
1063 }
1064 
1065 void
1066 te_trigger_stonith_history_sync(bool long_timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
1067 {
1068     /* trigger a sync in 5s to give more nodes the
1069      * chance to show up so that we don't create
1070      * unnecessary stonith-history-sync traffic
1071      *
1072      * the long timeout of 30s is there as a fallback
1073      * so that after a successful connection to fenced
1074      * we will wait for 30s for the DC to trigger a
1075      * history-sync
1076      * if this doesn't happen we trigger a sync locally
1077      * (e.g. fenced segfaults and is restarted by pacemakerd)
1078      */
1079 
1080     /* as we are finally checking the stonith-connection
1081      * in do_stonith_history_sync we should be fine
1082      * leaving stonith_history_sync_time & stonith_history_sync_trigger
1083      * around
1084      */
1085     if (stonith_history_sync_trigger == NULL) {
1086         stonith_history_sync_trigger =
1087             mainloop_add_trigger(G_PRIORITY_LOW,
1088                                  do_stonith_history_sync, NULL);
1089     }
1090 
1091     if (long_timeout) {
1092         if(stonith_history_sync_timer_long == NULL) {
1093             stonith_history_sync_timer_long =
1094                 mainloop_timer_add("history_sync_long", 30000,
1095                                    FALSE, stonith_history_sync_set_trigger,
1096                                    NULL);
1097         }
1098         crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
1099         mainloop_timer_start(stonith_history_sync_timer_long);
1100     } else {
1101         if(stonith_history_sync_timer_short == NULL) {
1102             stonith_history_sync_timer_short =
1103                 mainloop_timer_add("history_sync_short", 5000,
1104                                    FALSE, stonith_history_sync_set_trigger,
1105                                    NULL);
1106         }
1107         crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
1108         mainloop_timer_start(stonith_history_sync_timer_short);
1109     }
1110 
1111 }
1112 
1113 /* end stonith history synchronization functions */

/* [previous][next][first][last][top][bottom][index][help] */