root/daemons/controld/controld_fencing.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. update_stonith_max_attempts
  2. set_fence_reaction
  3. controld_configure_fencing
  4. too_many_st_failures
  5. st_fail_count_reset
  6. st_fail_count_increment
  7. cib_fencing_updated
  8. send_stonith_update
  9. abort_for_stonith_failure
  10. add_stonith_cleanup
  11. remove_stonith_cleanup
  12. purge_stonith_cleanup
  13. execute_stonith_cleanup
  14. fail_incompletable_stonith
  15. tengine_stonith_connection_destroy
  16. handle_fence_notification
  17. controld_timer_fencer_connect
  18. controld_disconnect_fencer
  19. do_stonith_history_sync
  20. tengine_stonith_callback
  21. fence_with_delay
  22. controld_execute_fence_action
  23. controld_verify_stonith_watchdog_timeout
  24. te_cleanup_stonith_history_sync
  25. tengine_stonith_history_synced
  26. stonith_history_sync_set_trigger
  27. te_trigger_stonith_history_sync

   1 /*
   2  * Copyright 2004-2024 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 #include <crm/crm.h>
  12 #include <crm/common/xml.h>
  13 #include <crm/stonith-ng.h>
  14 #include <crm/fencing/internal.h>
  15 
  16 #include <pacemaker-controld.h>
  17 
  18 static void
  19 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event);
  20 
  21 /*
  22  * stonith failure counting
  23  *
  24  * We don't want to get stuck in a permanent fencing loop. Keep track of the
  25  * number of fencing failures for each target node, and the most we'll restart a
  26  * transition for.
  27  */
  28 
  29 struct st_fail_rec {   // Per-target record stored as a value in stonith_failures
  30     int count;         // Fencing failures counted for the target; set to 0 by st_fail_count_reset()
  31 };
  32 
  33 #define DEFAULT_STONITH_MAX_ATTEMPTS 10   // Default passed to pcmk_parse_score() and initial value of stonith_max_attempts
  34 
  35 static bool fence_reaction_panic = false;   // true: pcmk__panic() when notified of our own fencing; false: crm_exit()
  36 static unsigned long int stonith_max_attempts = DEFAULT_STONITH_MAX_ATTEMPTS;   // Threshold used by too_many_st_failures()
  37 static GHashTable *stonith_failures = NULL;   // Target name -> struct st_fail_rec; created on first failure
  38 
  39 /*!
  40  * \internal
  41  * \brief Update max fencing attempts before giving up
  42  *
  43  * \param[in] value  New max fencing attempts
  44  */
  45 static void
  46 update_stonith_max_attempts(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
  47 {
  48     int score = 0;
  49     int rc = pcmk_parse_score(value, &score, DEFAULT_STONITH_MAX_ATTEMPTS);
  50 
  51     // The option validator ensures invalid values shouldn't be possible
  52     CRM_CHECK((rc == pcmk_rc_ok) && (score > 0), return);
  53 
  54     if (stonith_max_attempts != score) {
  55         crm_debug("Maximum fencing attempts per transition is now %d (was %lu)",
  56                   score, stonith_max_attempts);
  57     }
  58     stonith_max_attempts = score;
  59 }
  60 
  61 /*!
  62  * \internal
  63  * \brief Configure reaction to notification of local node being fenced
  64  *
  65  * \param[in] reaction_s  Reaction type
  66  */
  67 static void
  68 set_fence_reaction(const char *reaction_s)
     /* [previous][next][first][last][top][bottom][index][help] */
  69 {
  70     if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
  71         fence_reaction_panic = true;
  72 
  73     } else {
  74         if (!pcmk__str_eq(reaction_s, PCMK_VALUE_STOP, pcmk__str_casei)) {
  75             crm_warn("Invalid value '%s' for %s, using 'stop'",
  76                      reaction_s, PCMK_OPT_FENCE_REACTION);
  77         }
  78         fence_reaction_panic = false;
  79     }
  80 }
  81 
  82 /*!
  83  * \internal
  84  * \brief Configure fencing options based on the CIB
  85  *
  86  * \param[in,out] options  Name/value pairs for configured options
  87  */
  88 void
  89 controld_configure_fencing(GHashTable *options)
     /* [previous][next][first][last][top][bottom][index][help] */
  90 {
  91     const char *value = NULL;
  92 
  93     value = g_hash_table_lookup(options, PCMK_OPT_FENCE_REACTION);
  94     set_fence_reaction(value);
  95 
  96     value = g_hash_table_lookup(options, PCMK_OPT_STONITH_MAX_ATTEMPTS);
  97     update_stonith_max_attempts(value);
  98 }
  99 
 100 static gboolean
 101 too_many_st_failures(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 102 {
 103     GHashTableIter iter;
 104     const char *key = NULL;
 105     struct st_fail_rec *value = NULL;
 106 
 107     if (stonith_failures == NULL) {
 108         return FALSE;
 109     }
 110 
 111     if (target == NULL) {
 112         g_hash_table_iter_init(&iter, stonith_failures);
 113         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 114                (gpointer *) &value)) {
 115 
 116             if (value->count >= stonith_max_attempts) {
 117                 target = (const char*)key;
 118                 goto too_many;
 119             }
 120         }
 121     } else {
 122         value = g_hash_table_lookup(stonith_failures, target);
 123         if ((value != NULL) && (value->count >= stonith_max_attempts)) {
 124             goto too_many;
 125         }
 126     }
 127     return FALSE;
 128 
 129 too_many:
 130     crm_warn("Too many failures (%d) to fence %s, giving up",
 131              value->count, target);
 132     return TRUE;
 133 }
 134 
 135 /*!
 136  * \internal
 137  * \brief Reset a stonith fail count
 138  *
 139  * \param[in] target  Name of node to reset, or NULL for all
 140  */
 141 void
 142 st_fail_count_reset(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 143 {
 144     if (stonith_failures == NULL) {
 145         return;
 146     }
 147 
 148     if (target) {
 149         struct st_fail_rec *rec = NULL;
 150 
 151         rec = g_hash_table_lookup(stonith_failures, target);
 152         if (rec) {
 153             rec->count = 0;
 154         }
 155     } else {
 156         GHashTableIter iter;
 157         const char *key = NULL;
 158         struct st_fail_rec *rec = NULL;
 159 
 160         g_hash_table_iter_init(&iter, stonith_failures);
 161         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 162                                       (gpointer *) &rec)) {
 163             rec->count = 0;
 164         }
 165     }
 166 }
 167 
 168 static void
 169 st_fail_count_increment(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 170 {
 171     struct st_fail_rec *rec = NULL;
 172 
 173     if (stonith_failures == NULL) {
 174         stonith_failures = pcmk__strkey_table(free, free);
 175     }
 176 
 177     rec = g_hash_table_lookup(stonith_failures, target);
 178     if (rec) {
 179         rec->count++;
 180     } else {
 181         rec = malloc(sizeof(struct st_fail_rec));
 182         if(rec == NULL) {
 183             return;
 184         }
 185 
 186         rec->count = 1;
 187         g_hash_table_insert(stonith_failures, pcmk__str_copy(target), rec);
 188     }
 189 }
 190 
 191 /* end stonith fail count functions */
 192 
 193 
 194 static void
 195 cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
     /* [previous][next][first][last][top][bottom][index][help] */
 196                     void *user_data)
 197 {
 198     if (rc < pcmk_ok) {
 199         crm_err("Fencing update %d for %s: failed - %s (%d)",
 200                 call_id, (char *)user_data, pcmk_strerror(rc), rc);
 201         crm_log_xml_warn(msg, "Failed update");
 202         abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_shutdown,
 203                          "CIB update failed", NULL);
 204 
 205     } else {
 206         crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
 207     }
 208 }
 209 
 210 static void
 211 send_stonith_update(pcmk__graph_action_t *action, const char *target,
     /* [previous][next][first][last][top][bottom][index][help] */
 212                     const char *uuid)
 213 {
 214     int rc = pcmk_ok;
 215     pcmk__node_status_t *peer = NULL;
 216 
 217     /* We (usually) rely on the membership layer to do node_update_cluster,
 218      * and the peer status callback to do node_update_peer, because the node
 219      * might have already rejoined before we get the stonith result here.
 220      */
 221     int flags = node_update_join | node_update_expected;
 222 
 223     /* zero out the node-status & remove all LRM status info */
 224     xmlNode *node_state = NULL;
 225 
 226     CRM_CHECK(target != NULL, return);
 227     CRM_CHECK(uuid != NULL, return);
 228 
 229     /* Make sure the membership and join caches are accurate.
 230      * Try getting any existing node cache entry also by node uuid in case it
 231      * doesn't have an uname yet.
 232      */
 233     peer = pcmk__get_node(0, target, uuid, pcmk__node_search_any);
 234 
 235     CRM_CHECK(peer != NULL, return);
 236 
 237     if (peer->state == NULL) {
 238         /* Usually, we rely on the membership layer to update the cluster state
 239          * in the CIB. However, if the node has never been seen, do it here, so
 240          * the node is not considered unclean.
 241          */
 242         flags |= node_update_cluster;
 243     }
 244 
 245     if (peer->xml_id == NULL) {
 246         crm_info("Recording XML ID '%s' for node '%s'", uuid, target);
 247         peer->xml_id = pcmk__str_copy(uuid);
 248     }
 249 
 250     crmd_peer_down(peer, TRUE);
 251 
 252     /* Generate a node state update for the CIB */
 253     node_state = create_node_state_update(peer, flags, NULL, __func__);
 254 
 255     /* we have to mark whether or not remote nodes have already been fenced */
 256     if (pcmk_is_set(peer->flags, pcmk__node_status_remote)) {
 257         char *now_s = pcmk__ttoa(time(NULL));
 258 
 259         crm_xml_add(node_state, PCMK__XA_NODE_FENCED, now_s);
 260         free(now_s);
 261     }
 262 
 263     /* Force our known ID */
 264     crm_xml_add(node_state, PCMK_XA_ID, uuid);
 265 
 266     rc = controld_globals.cib_conn->cmds->modify(controld_globals.cib_conn,
 267                                                  PCMK_XE_STATUS, node_state,
 268                                                  cib_can_create);
 269 
 270     /* Delay processing the trigger until the update completes */
 271     crm_debug("Sending fencing update %d for %s", rc, target);
 272     fsa_register_cib_callback(rc, pcmk__str_copy(target), cib_fencing_updated);
 273 
 274     // Make sure it sticks
 275     /* controld_globals.cib_conn->cmds->bump_epoch(controld_globals.cib_conn,
 276      *                                             cib_none);
 277      */
 278 
 279     controld_delete_node_state(peer->name, controld_section_all, cib_none);
 280     pcmk__xml_free(node_state);
 281     return;
 282 }
 283 
 284 /*!
 285  * \internal
 286  * \brief Abort transition due to stonith failure
 287  *
 288  * \param[in] abort_action  Whether to restart or stop transition
 289  * \param[in] target  Don't restart if this (NULL for any) has too many failures
 290  * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
 291  */
 292 static void
 293 abort_for_stonith_failure(enum pcmk__graph_next abort_action,
     /* [previous][next][first][last][top][bottom][index][help] */
 294                           const char *target, const xmlNode *reason)
 295 {
 296     /* If stonith repeatedly fails, we eventually give up on starting a new
 297      * transition for that reason.
 298      */
 299     if ((abort_action != pcmk__graph_wait) && too_many_st_failures(target)) {
 300         abort_action = pcmk__graph_wait;
 301     }
 302     abort_transition(PCMK_SCORE_INFINITY, abort_action, "Stonith failed",
 303                      reason);
 304 }
 305 
 306 
 307 /*
 308  * stonith cleanup list
 309  *
 310  * If the DC is shot, proper notifications might not go out.
 311  * The stonith cleanup list allows the cluster to (re-)send
 312  * notifications once a new DC is elected.
 313  */
 314 
 315 static GList *stonith_cleanup_list = NULL;   // List of heap-allocated node names (copied by add_stonith_cleanup(), released with free())
 316 
 317 /*!
 318  * \internal
 319  * \brief Add a node to the stonith cleanup list
 320  *
 321  * \param[in] target  Name of node to add
 322  */
 323 void
 324 add_stonith_cleanup(const char *target) {
     /* [previous][next][first][last][top][bottom][index][help] */
 325     stonith_cleanup_list = g_list_append(stonith_cleanup_list,
 326                                          pcmk__str_copy(target));
 327 }
 328 
 329 /*!
 330  * \internal
 331  * \brief Remove a node from the stonith cleanup list
 332  *
 333  * \param[in] Name of node to remove
 334  */
 335 void
 336 remove_stonith_cleanup(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 337 {
 338     GList *iter = stonith_cleanup_list;
 339 
 340     while (iter != NULL) {
 341         GList *tmp = iter;
 342         char *iter_name = tmp->data;
 343 
 344         iter = iter->next;
 345         if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
 346             crm_trace("Removing %s from the cleanup list", iter_name);
 347             stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
 348             free(iter_name);
 349         }
 350     }
 351 }
 352 
 353 /*!
 354  * \internal
 355  * \brief Purge all entries from the stonith cleanup list
 356  */
 357 void
 358 purge_stonith_cleanup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 359 {
 360     if (stonith_cleanup_list) {
 361         GList *iter = NULL;
 362 
 363         for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 364             char *target = iter->data;
 365 
 366             crm_info("Purging %s from stonith cleanup list", target);
 367             free(target);
 368         }
 369         g_list_free(stonith_cleanup_list);
 370         stonith_cleanup_list = NULL;
 371     }
 372 }
 373 
 374 /*!
 375  * \internal
 376  * \brief Send stonith updates for all entries in cleanup list, then purge it
 377  */
 378 void
 379 execute_stonith_cleanup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 380 {
 381     GList *iter;
 382 
 383     for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 384         char *target = iter->data;
 385         pcmk__node_status_t *target_node =
 386             pcmk__get_node(0, target, NULL, pcmk__node_search_cluster_member);
 387         const char *uuid = pcmk__cluster_node_uuid(target_node);
 388 
 389         crm_notice("Marking %s, target of a previous stonith action, as clean", target);
 390         send_stonith_update(NULL, target, uuid);
 391         free(target);
 392     }
 393     g_list_free(stonith_cleanup_list);
 394     stonith_cleanup_list = NULL;
 395 }
 396 
 397 /* end stonith cleanup list functions */
 398 
 399 
 400 /* stonith API client
 401  *
 402  * Functions that need to interact directly with the fencer via its API
 403  */
 404 
 405 static stonith_t *stonith_api = NULL;   // Fencer API object; created in controld_timer_fencer_connect(), freed by controld_disconnect_fencer(true)
 406 static mainloop_timer_t *controld_fencer_connect_timer = NULL;   // Timer that re-runs controld_timer_fencer_connect() to retry the connection
 407 static char *te_client_id = NULL;   // "<crm_system_name>.<pid>", compared with the client of fence events
 408 
 409 static gboolean
 410 fail_incompletable_stonith(pcmk__graph_t *graph)
     /* [previous][next][first][last][top][bottom][index][help] */
 411 {
 412     GList *lpc = NULL;
 413     const char *task = NULL;
 414     xmlNode *last_action = NULL;
 415 
 416     if (graph == NULL) {
 417         return FALSE;
 418     }
 419 
 420     for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
 421         GList *lpc2 = NULL;
 422         pcmk__graph_synapse_t *synapse = (pcmk__graph_synapse_t *) lpc->data;
 423 
 424         if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) {
 425             continue;
 426         }
 427 
 428         for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
 429             pcmk__graph_action_t *action = (pcmk__graph_action_t *) lpc2->data;
 430 
 431             if ((action->type != pcmk__cluster_graph_action)
 432                 || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
 433                 continue;
 434             }
 435 
 436             task = crm_element_value(action->xml, PCMK_XA_OPERATION);
 437             if (pcmk__str_eq(task, PCMK_ACTION_STONITH, pcmk__str_casei)) {
 438                 pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
 439                 last_action = action->xml;
 440                 pcmk__update_graph(graph, action);
 441                 crm_notice("Failing action %d (%s): fencer terminated",
 442                            action->id, pcmk__xe_id(action->xml));
 443             }
 444         }
 445     }
 446 
 447     if (last_action != NULL) {
 448         crm_warn("Fencer failure resulted in unrunnable actions");
 449         abort_for_stonith_failure(pcmk__graph_restart, NULL, last_action);
 450         return TRUE;
 451     }
 452 
 453     return FALSE;
 454 }
 455 
 456 static void
 457 tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
 458 {
 459     te_cleanup_stonith_history_sync(st, FALSE);
 460 
 461     if (pcmk_is_set(controld_globals.fsa_input_register, R_ST_REQUIRED)) {
 462         crm_err("Lost fencer connection (will attempt to reconnect)");
 463         if (!mainloop_timer_running(controld_fencer_connect_timer)) {
 464             mainloop_timer_start(controld_fencer_connect_timer);
 465         }
 466     } else {
 467         crm_info("Disconnected from fencer");
 468     }
 469 
 470     if (stonith_api) {
 471         /* the client API won't properly reconnect notifications
 472          * if they are still in the table - so remove them
 473          */
 474         if (stonith_api->state != stonith_disconnected) {
 475             stonith_api->cmds->disconnect(st);
 476         }
 477         stonith_api->cmds->remove_notification(stonith_api, NULL);
 478     }
 479 
 480     if (AM_I_DC) {
 481         fail_incompletable_stonith(controld_globals.transition_graph);
 482         trigger_graph();
 483     }
 484 }
 485 
 486 /*!
 487  * \internal
 488  * \brief Handle an event notification from the fencing API
 489  * (sends the fencing alert, logs the result, updates the fail count when not DC, and processes a successful fencing)
 490  * \param[in] st     Fencing API connection (ignored)
 491  * \param[in] event  Fencing API event notification (if NULL, an error is logged and nothing else is done)
 492  */
 493 static void
 494 handle_fence_notification(stonith_t *st, stonith_event_t *event)
     /* [previous][next][first][last][top][bottom][index][help] */
 495 {
 496     bool succeeded = true;
 497     const char *executioner = "the cluster";   // Used in logs when the event names no executioner
 498     const char *client = "a client";           // Used in logs when the event names no client origin
 499     const char *reason = NULL;
 500     int exec_status;
 501 
 502     if (te_client_id == NULL) {   // Built on first use; compared with the event's client further down
 503         te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
 504                                          (unsigned long) getpid());
 505     }
 506 
 507     if (event == NULL) {
 508         crm_err("Notify data not found");
 509         return;
 510     }
 511 
 512     if (event->executioner != NULL) {
 513         executioner = event->executioner;
 514     }
 515     if (event->client_origin != NULL) {
 516         client = event->client_origin;
 517     }
 518 
 519     exec_status = stonith__event_execution_status(event);
 520     if ((stonith__event_exit_status(event) != CRM_EX_OK)   // Success requires both an OK exit status ...
 521         || (exec_status != PCMK_EXEC_DONE)) {              // ... and a "done" execution status
 522         succeeded = false;
 523         if (exec_status == PCMK_EXEC_DONE) {   // Done but with a bad exit status: log it as an error status
 524             exec_status = PCMK_EXEC_ERROR;
 525         }
 526     }
 527     reason = stonith__event_exit_reason(event);
 528 
 529     crmd_alert_fencing_op(event);
 530 
 531     if (pcmk__str_eq(PCMK_ACTION_ON, event->action, pcmk__str_none)) {
 532         // Unfencing doesn't need special handling, just a log message
 533         if (succeeded) {
 534             crm_notice("%s was unfenced by %s at the request of %s@%s",
 535                        event->target, executioner, client, event->origin);
 536         } else {
 537             crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d",
 538                     event->target, executioner,
 539                     pcmk_exec_status_str(exec_status),
 540                     ((reason == NULL)? "" : ": "),
 541                     ((reason == NULL)? "" : reason),
 542                     stonith__event_exit_status(event));
 543         }
 544         return;
 545     }
 546 
 547     if (succeeded && controld_is_local_node(event->target)) {
 548         /* We were notified of our own fencing. Most likely, either fencing was
 549          * misconfigured, or fabric fencing that doesn't cut cluster
 550          * communication is in use.
 551          *
 552          * Either way, shutting down the local host is a good idea, to require
 553          * administrator intervention. Also, other nodes would otherwise likely
 554          * set our status to lost because of the fencing callback and discard
 555          * our subsequent election votes as "not part of our cluster".
 556          */
 557         crm_crit("We were allegedly just fenced by %s for %s!",
 558                  executioner, event->origin); // Dumps blackbox if enabled
 559         if (fence_reaction_panic) {   // Set by set_fence_reaction()
 560             pcmk__panic("Notified of own fencing");
 561         } else {
 562             crm_exit(CRM_EX_FATAL);
 563         }
 564         return; // Should never get here
 565     }
 566 
 567     /* Update the count of fencing failures for this target, in case we become
 568      * DC later. The current DC has already updated its fail count in
 569      * tengine_stonith_callback().
 570      */
 571     if (!AM_I_DC) {
 572         if (succeeded) {
 573             st_fail_count_reset(event->target);
 574         } else {
 575             st_fail_count_increment(event->target);
 576         }
 577     }
 578 
 579     crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: "
 580                "%s%s%s%s " QB_XS " event=%s",
 581                event->target, (succeeded? "" : " not"),
 582                event->action, executioner, client, event->origin,
 583                (succeeded? "OK" : pcmk_exec_status_str(exec_status)),
 584                ((reason == NULL)? "" : " ("),
 585                ((reason == NULL)? "" : reason),
 586                ((reason == NULL)? "" : ")"),
 587                event->id);
 588 
 589     if (succeeded) {
 590         const uint32_t flags = pcmk__node_search_any
 591                                |pcmk__node_search_cluster_cib;
 592 
 593         pcmk__node_status_t *peer = pcmk__search_node_caches(0, event->target,
 594                                                              flags);
 595         const char *uuid = NULL;
 596 
 597         if (peer == NULL) {   // Target not found in the searched caches: nothing more to update
 598             return;
 599         }
 600 
 601         uuid = pcmk__cluster_node_uuid(peer);
 602 
 603         if (AM_I_DC) {
 604             /* The DC always sends updates */
 605             send_stonith_update(NULL, event->target, uuid);
 606 
 607             /* @TODO Ideally, at this point, we'd check whether the fenced node
 608              * hosted any guest nodes, and call remote_node_down() for them.
 609              * Unfortunately, the controller doesn't have a simple, reliable way
 610              * to map hosts to guests. It might be possible to track this in the
 611              * peer cache via refresh_remote_nodes(). For now, we rely on the
 612              * scheduler creating fence pseudo-events for the guests.
 613              */
 614 
 615             if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) {
 616                 /* Abort the current transition if it wasn't the cluster that
 617                  * initiated fencing.
 618                  */
 619                 crm_info("External fencing operation from %s fenced %s",
 620                          client, event->target);
 621                 abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
 622                                  "External Fencing Operation", NULL);
 623             }
 624 
 625         } else if (pcmk__str_eq(controld_globals.dc_name, event->target,
 626                                 pcmk__str_null_matches|pcmk__str_casei)
 627                    && !pcmk_is_set(peer->flags, pcmk__node_status_remote)) {
 628             // Assume the target was our DC if we don't currently have one
 629 
 630             if (controld_globals.dc_name != NULL) {
 631                 crm_notice("Fencing target %s was our DC", event->target);
 632             } else {
 633                 crm_notice("Fencing target %s may have been our DC",
 634                            event->target);
 635             }
 636 
 637             /* Given the CIB resyncing that occurs around elections,
 638              * have one node update the CIB now and, if the new DC is different,
 639              * have them do so too after the election
 640              */
 641             if (controld_is_local_node(event->executioner)) {
 642                 send_stonith_update(NULL, event->target, uuid);
 643             }
 644             add_stonith_cleanup(event->target);   // Replayed later by execute_stonith_cleanup()
 645         }
 646 
 647         /* If the target is a remote node, and we host its connection,
 648          * immediately fail all monitors so it can be recovered quickly.
 649          * The connection won't necessarily drop when a remote node is fenced,
 650          * so the failure might not otherwise be detected until the next poke.
 651          */
 652         if (pcmk_is_set(peer->flags, pcmk__node_status_remote)) {
 653             remote_ra_fail(event->target);
 654         }
 655 
 656         crmd_peer_down(peer, TRUE);
 657      }
 658 }
 659 
 660 /*!
 661  * \brief Connect to fencer
 662  *
 663  * \param[in] user_data  If NULL, retry failures now, otherwise retry in mainloop timer
 664  *
 665  * \return G_SOURCE_REMOVE on success, G_SOURCE_CONTINUE to retry
 666  * \note If user_data is NULL, this will wait 2s between attempts, for up to
 667  *       30 attempts, meaning the controller could be blocked as long as 58s.
 668  */
 669 gboolean
 670 controld_timer_fencer_connect(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 671 {
 672     int rc = pcmk_ok;
 673 
 674     if (stonith_api == NULL) {
 675         stonith_api = stonith_api_new();
 676         if (stonith_api == NULL) {
 677             crm_err("Could not connect to fencer: API memory allocation failed");
 678             return G_SOURCE_REMOVE;
 679         }
 680     }
 681 
 682     if (stonith_api->state != stonith_disconnected) {
 683         crm_trace("Already connected to fencer, no need to retry");
 684         return G_SOURCE_REMOVE;
 685     }
 686 
 687     if (user_data == NULL) {
 688         // Blocking (retry failures now until successful)
 689         rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
 690         if (rc != pcmk_ok) {
 691             crm_err("Could not connect to fencer in 30 attempts: %s "
 692                     QB_XS " rc=%d", pcmk_strerror(rc), rc);
 693         }
 694     } else {
 695         // Non-blocking (retry failures later in main loop)
 696         rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
 697 
 698         if (controld_fencer_connect_timer == NULL) {
 699             controld_fencer_connect_timer =
 700                 mainloop_timer_add("controld_fencer_connect", 1000,
 701                                    TRUE, controld_timer_fencer_connect,
 702                                    GINT_TO_POINTER(TRUE));
 703         }
 704 
 705         if (rc != pcmk_ok) {
 706             if (pcmk_is_set(controld_globals.fsa_input_register,
 707                             R_ST_REQUIRED)) {
 708                 crm_notice("Fencer connection failed (will retry): %s "
 709                            QB_XS " rc=%d", pcmk_strerror(rc), rc);
 710 
 711                 if (!mainloop_timer_running(controld_fencer_connect_timer)) {
 712                     mainloop_timer_start(controld_fencer_connect_timer);
 713                 }
 714 
 715                 return G_SOURCE_CONTINUE;
 716             } else {
 717                 crm_info("Fencer connection failed (ignoring because no longer required): %s "
 718                          QB_XS " rc=%d", pcmk_strerror(rc), rc);
 719             }
 720             return G_SOURCE_REMOVE;
 721         }
 722     }
 723 
 724     if (rc == pcmk_ok) {
 725         stonith_api_operations_t *cmds = stonith_api->cmds;
 726 
 727         cmds->register_notification(stonith_api,
 728                                     PCMK__VALUE_ST_NOTIFY_DISCONNECT,
 729                                     tengine_stonith_connection_destroy);
 730         cmds->register_notification(stonith_api, PCMK__VALUE_ST_NOTIFY_FENCE,
 731                                     handle_fence_notification);
 732         cmds->register_notification(stonith_api,
 733                                     PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED,
 734                                     tengine_stonith_history_synced);
 735         te_trigger_stonith_history_sync(TRUE);
 736         crm_notice("Fencer successfully connected");
 737     }
 738 
 739     return G_SOURCE_REMOVE;
 740 }
 741 
 742 void
 743 controld_disconnect_fencer(bool destroy)
     /* [previous][next][first][last][top][bottom][index][help] */
 744 {
 745     if (stonith_api) {
 746         // Prevent fencer connection from coming up again
 747         controld_clear_fsa_input_flags(R_ST_REQUIRED);
 748 
 749         if (stonith_api->state != stonith_disconnected) {
 750             stonith_api->cmds->disconnect(stonith_api);
 751         }
 752         stonith_api->cmds->remove_notification(stonith_api, NULL);
 753     }
 754     if (destroy) {
 755         if (stonith_api) {
 756             stonith_api->cmds->free(stonith_api);
 757             stonith_api = NULL;
 758         }
 759         if (controld_fencer_connect_timer) {
 760             mainloop_timer_del(controld_fencer_connect_timer);
 761             controld_fencer_connect_timer = NULL;
 762         }
 763         if (te_client_id) {
 764             free(te_client_id);
 765             te_client_id = NULL;
 766         }
 767     }
 768 }
 769 
 770 static gboolean
 771 do_stonith_history_sync(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 772 {
 773     if (stonith_api && (stonith_api->state != stonith_disconnected)) {
 774         stonith_history_t *history = NULL;
 775 
 776         te_cleanup_stonith_history_sync(stonith_api, FALSE);
 777         stonith_api->cmds->history(stonith_api,
 778                                    st_opt_sync_call | st_opt_broadcast,
 779                                    NULL, &history, 5);
 780         stonith_history_free(history);
 781         return TRUE;
 782     } else {
 783         crm_info("Skip triggering stonith history-sync as stonith is disconnected");
 784         return FALSE;
 785     }
 786 }
 787 
 788 static void
 789 tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 790 {
 791     char *uuid = NULL;
 792     int stonith_id = -1;
 793     int transition_id = -1;
 794     pcmk__graph_action_t *action = NULL;
 795     const char *target = NULL;
 796 
 797     if ((data == NULL) || (data->userdata == NULL)) {
 798         crm_err("Ignoring fence operation %d result: "
 799                 "No transition key given (bug?)",
 800                 ((data == NULL)? -1 : data->call_id));
 801         return;
 802     }
 803 
 804     if (!AM_I_DC) {
 805         const char *reason = stonith__exit_reason(data);
 806 
 807         if (reason == NULL) {
 808            reason = pcmk_exec_status_str(stonith__execution_status(data));
 809         }
 810         crm_notice("Result of fence operation %d: %d (%s) " QB_XS " key=%s",
 811                    data->call_id, stonith__exit_status(data), reason,
 812                    (const char *) data->userdata);
 813         return;
 814     }
 815 
 816     CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id,
 817                                     &stonith_id, NULL),
 818               goto bail);
 819 
 820     if (controld_globals.transition_graph->complete || (stonith_id < 0)
 821         || !pcmk__str_eq(uuid, controld_globals.te_uuid, pcmk__str_none)
 822         || (controld_globals.transition_graph->id != transition_id)) {
 823         crm_info("Ignoring fence operation %d result: "
 824                  "Not from current transition " QB_XS
 825                  " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)",
 826                  data->call_id,
 827                  pcmk__btoa(controld_globals.transition_graph->complete),
 828                  stonith_id, uuid, controld_globals.te_uuid, transition_id,
 829                  controld_globals.transition_graph->id);
 830         goto bail;
 831     }
 832 
 833     action = controld_get_action(stonith_id);
 834     if (action == NULL) {
 835         crm_err("Ignoring fence operation %d result: "
 836                 "Action %d not found in transition graph (bug?) "
 837                 QB_XS " uuid=%s transition=%d",
 838                 data->call_id, stonith_id, uuid, transition_id);
 839         goto bail;
 840     }
 841 
 842     target = crm_element_value(action->xml, PCMK__META_ON_NODE);
 843     if (target == NULL) {
 844         crm_err("Ignoring fence operation %d result: No target given (bug?)",
 845                 data->call_id);
 846         goto bail;
 847     }
 848 
 849     stop_te_timer(action);
 850     if (stonith__exit_status(data) == CRM_EX_OK) {
 851         const char *uuid = crm_element_value(action->xml,
 852                                              PCMK__META_ON_NODE_UUID);
 853         const char *op = crm_meta_value(action->params,
 854                                         PCMK__META_STONITH_ACTION);
 855 
 856         crm_info("Fence operation %d for %s succeeded", data->call_id, target);
 857         if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
 858             te_action_confirmed(action, NULL);
 859             if (pcmk__str_eq(PCMK_ACTION_ON, op, pcmk__str_casei)) {
 860                 const char *value = NULL;
 861                 char *now = pcmk__ttoa(time(NULL));
 862                 gboolean is_remote_node = FALSE;
 863 
 864                 /* This check is not 100% reliable, since this node is not
 865                  * guaranteed to have the remote node cached. However, it
 866                  * doesn't have to be reliable, since the attribute manager can
 867                  * learn a node's "remoteness" by other means sooner or later.
 868                  * This allows it to learn more quickly if this node does have
 869                  * the information.
 870                  */
 871                 if (g_hash_table_lookup(pcmk__remote_peer_cache,
 872                                         uuid) != NULL) {
 873                     is_remote_node = TRUE;
 874                 }
 875 
 876                 update_attrd(target, CRM_ATTR_UNFENCED, now, NULL,
 877                              is_remote_node);
 878                 free(now);
 879 
 880                 value = crm_meta_value(action->params, PCMK__META_DIGESTS_ALL);
 881                 update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL,
 882                              is_remote_node);
 883 
 884                 value = crm_meta_value(action->params,
 885                                        PCMK__META_DIGESTS_SECURE);
 886                 update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL,
 887                              is_remote_node);
 888 
 889             } else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) {
 890                 send_stonith_update(action, target, uuid);
 891                 pcmk__set_graph_action_flags(action,
 892                                              pcmk__graph_action_sent_update);
 893             }
 894         }
 895         st_fail_count_reset(target);
 896 
 897     } else {
 898         enum pcmk__graph_next abort_action = pcmk__graph_restart;
 899         int status = stonith__execution_status(data);
 900         const char *reason = stonith__exit_reason(data);
 901 
 902         if (reason == NULL) {
 903             if (status == PCMK_EXEC_DONE) {
 904                 reason = "Agent returned error";
 905             } else {
 906                 reason = pcmk_exec_status_str(status);
 907             }
 908         }
 909         pcmk__set_graph_action_flags(action, pcmk__graph_action_failed);
 910 
 911         /* If no fence devices were available, there's no use in immediately
 912          * checking again, so don't start a new transition in that case.
 913          */
 914         if (status == PCMK_EXEC_NO_FENCE_DEVICE) {
 915             crm_warn("Fence operation %d for %s failed: %s "
 916                      "(aborting transition and giving up for now)",
 917                      data->call_id, target, reason);
 918             abort_action = pcmk__graph_wait;
 919         } else {
 920             crm_notice("Fence operation %d for %s failed: %s "
 921                        "(aborting transition)", data->call_id, target, reason);
 922         }
 923 
 924         /* Increment the fail count now, so abort_for_stonith_failure() can
 925          * check it. Non-DC nodes will increment it in
 926          * handle_fence_notification().
 927          */
 928         st_fail_count_increment(target);
 929         abort_for_stonith_failure(abort_action, target, NULL);
 930     }
 931 
 932     pcmk__update_graph(controld_globals.transition_graph, action);
 933     trigger_graph();
 934 
 935   bail:
 936     free(data->userdata);
 937     free(uuid);
 938     return;
 939 }
 940 
 941 static int
 942 fence_with_delay(const char *target, const char *type, int delay)
     /* [previous][next][first][last][top][bottom][index][help] */
 943 {
 944     uint32_t options = st_opt_none; // Group of enum stonith_call_options
 945     int timeout_sec = pcmk__timeout_ms2s(controld_globals.transition_graph->stonith_timeout);
 946 
 947     if (crmd_join_phase_count(controld_join_confirmed) == 1) {
 948         stonith__set_call_options(options, target, st_opt_allow_self_fencing);
 949     }
 950     return stonith_api->cmds->fence_with_delay(stonith_api, options, target,
 951                                                type, timeout_sec, 0, delay);
 952 }
 953 
 954 /*!
 955  * \internal
 956  * \brief Execute a fencing action from a transition graph
 957  *
 958  * \param[in] graph   Transition graph being executed (ignored)
 959  * \param[in] action  Fencing action to execute
 960  *
 961  * \return Standard Pacemaker return code
 962  */
 963 int
 964 controld_execute_fence_action(pcmk__graph_t *graph,
     /* [previous][next][first][last][top][bottom][index][help] */
 965                               pcmk__graph_action_t *action)
 966 {
 967     int rc = 0;
 968     const char *id = pcmk__xe_id(action->xml);
 969     const char *uuid = crm_element_value(action->xml, PCMK__META_ON_NODE_UUID);
 970     const char *target = crm_element_value(action->xml, PCMK__META_ON_NODE);
 971     const char *type = crm_meta_value(action->params,
 972                                       PCMK__META_STONITH_ACTION);
 973     char *transition_key = NULL;
 974     const char *priority_delay = NULL;
 975     int delay_i = 0;
 976     gboolean invalid_action = FALSE;
 977     int stonith_timeout = pcmk__timeout_ms2s(controld_globals.transition_graph->stonith_timeout);
 978 
 979     CRM_CHECK(id != NULL, invalid_action = TRUE);
 980     CRM_CHECK(uuid != NULL, invalid_action = TRUE);
 981     CRM_CHECK(type != NULL, invalid_action = TRUE);
 982     CRM_CHECK(target != NULL, invalid_action = TRUE);
 983 
 984     if (invalid_action) {
 985         crm_log_xml_warn(action->xml, "BadAction");
 986         return EPROTO;
 987     }
 988 
 989     priority_delay = crm_meta_value(action->params,
 990                                     PCMK_OPT_PRIORITY_FENCING_DELAY);
 991 
 992     crm_notice("Requesting fencing (%s) targeting node %s "
 993                QB_XS " action=%s timeout=%i%s%s",
 994                type, target, id, stonith_timeout,
 995                priority_delay ? " priority_delay=" : "",
 996                priority_delay ? priority_delay : "");
 997 
 998     /* Passing NULL means block until we can connect... */
 999     controld_timer_fencer_connect(NULL);
1000 
1001     pcmk__scan_min_int(priority_delay, &delay_i, 0);
1002     rc = fence_with_delay(target, type, delay_i);
1003     transition_key = pcmk__transition_key(controld_globals.transition_graph->id,
1004                                           action->id, 0,
1005                                           controld_globals.te_uuid),
1006     stonith_api->cmds->register_callback(stonith_api, rc,
1007                                          (stonith_timeout
1008                                           + (delay_i > 0 ? delay_i : 0)),
1009                                          st_opt_timeout_updates, transition_key,
1010                                          "tengine_stonith_callback",
1011                                          tengine_stonith_callback);
1012     return pcmk_rc_ok;
1013 }
1014 
1015 bool
1016 controld_verify_stonith_watchdog_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
1017 {
1018     long long st_timeout = (value != NULL)? crm_get_msec(value) : 0;
1019     const char *our_nodename = controld_globals.cluster->priv->node_name;
1020 
1021     if (st_timeout == 0
1022         || (stonith_api && (stonith_api->state != stonith_disconnected) &&
1023             stonith__watchdog_fencing_enabled_for_node_api(stonith_api,
1024                                                            our_nodename))) {
1025         return pcmk__valid_stonith_watchdog_timeout(value);
1026     }
1027     return true;
1028 }
1029 
1030 /* end stonith API client functions */
1031 
1032 
1033 /*
1034  * stonith history synchronization
1035  *
1036  * Each node's fencer keeps track of a cluster-wide fencing history. When a node
1037  * joins or leaves, we need to synchronize the history across all nodes.
1038  */
1039 
1040 static crm_trigger_t *stonith_history_sync_trigger = NULL;
1041 static mainloop_timer_t *stonith_history_sync_timer_short = NULL;
1042 static mainloop_timer_t *stonith_history_sync_timer_long = NULL;
1043 
1044 void
1045 te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
     /* [previous][next][first][last][top][bottom][index][help] */
1046 {
1047     if (free_timers) {
1048         mainloop_timer_del(stonith_history_sync_timer_short);
1049         stonith_history_sync_timer_short = NULL;
1050         mainloop_timer_del(stonith_history_sync_timer_long);
1051         stonith_history_sync_timer_long = NULL;
1052     } else {
1053         mainloop_timer_stop(stonith_history_sync_timer_short);
1054         mainloop_timer_stop(stonith_history_sync_timer_long);
1055     }
1056 
1057     if (st) {
1058         st->cmds->remove_notification(st, PCMK__VALUE_ST_NOTIFY_HISTORY_SYNCED);
1059     }
1060 }
1061 
1062 static void
1063 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event)
     /* [previous][next][first][last][top][bottom][index][help] */
1064 {
1065     te_cleanup_stonith_history_sync(st, FALSE);
1066     crm_debug("Fence-history synced - cancel all timers");
1067 }
1068 
1069 static gboolean
1070 stonith_history_sync_set_trigger(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
1071 {
1072     mainloop_set_trigger(stonith_history_sync_trigger);
1073     return FALSE;
1074 }
1075 
1076 void
1077 te_trigger_stonith_history_sync(bool long_timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
1078 {
1079     /* trigger a sync in 5s to give more nodes the
1080      * chance to show up so that we don't create
1081      * unnecessary stonith-history-sync traffic
1082      *
1083      * the long timeout of 30s is there as a fallback
1084      * so that after a successful connection to fenced
1085      * we will wait for 30s for the DC to trigger a
1086      * history-sync
1087      * if this doesn't happen we trigger a sync locally
1088      * (e.g. fenced segfaults and is restarted by pacemakerd)
1089      */
1090 
1091     /* as we are finally checking the stonith-connection
1092      * in do_stonith_history_sync we should be fine
1093      * leaving stonith_history_sync_time & stonith_history_sync_trigger
1094      * around
1095      */
1096     if (stonith_history_sync_trigger == NULL) {
1097         stonith_history_sync_trigger =
1098             mainloop_add_trigger(G_PRIORITY_LOW,
1099                                  do_stonith_history_sync, NULL);
1100     }
1101 
1102     if (long_timeout) {
1103         if(stonith_history_sync_timer_long == NULL) {
1104             stonith_history_sync_timer_long =
1105                 mainloop_timer_add("history_sync_long", 30000,
1106                                    FALSE, stonith_history_sync_set_trigger,
1107                                    NULL);
1108         }
1109         crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
1110         mainloop_timer_start(stonith_history_sync_timer_long);
1111     } else {
1112         if(stonith_history_sync_timer_short == NULL) {
1113             stonith_history_sync_timer_short =
1114                 mainloop_timer_add("history_sync_short", 5000,
1115                                    FALSE, stonith_history_sync_set_trigger,
1116                                    NULL);
1117         }
1118         crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
1119         mainloop_timer_start(stonith_history_sync_timer_short);
1120     }
1121 
1122 }
1123 
1124 /* end stonith history synchronization functions */

/* [previous][next][first][last][top][bottom][index][help] */