root/daemons/controld/controld_fencing.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. update_stonith_max_attempts
  2. set_fence_reaction
  3. too_many_st_failures
  4. st_fail_count_reset
  5. st_fail_count_increment
  6. cib_fencing_updated
  7. send_stonith_update
  8. abort_for_stonith_failure
  9. add_stonith_cleanup
  10. remove_stonith_cleanup
  11. purge_stonith_cleanup
  12. execute_stonith_cleanup
  13. fail_incompletable_stonith
  14. tengine_stonith_connection_destroy
  15. handle_fence_notification
  16. te_connect_stonith
  17. controld_trigger_fencer_connect
  18. controld_disconnect_fencer
  19. do_stonith_history_sync
  20. tengine_stonith_callback
  21. fence_with_delay
  22. te_fence_node
  23. controld_verify_stonith_watchdog_timeout
  24. te_cleanup_stonith_history_sync
  25. tengine_stonith_history_synced
  26. stonith_history_sync_set_trigger
  27. te_trigger_stonith_history_sync

   1 /*
   2  * Copyright 2004-2022 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 #include <crm/crm.h>
  12 #include <crm/msg_xml.h>
  13 #include <crm/common/xml.h>
  14 #include <crm/stonith-ng.h>
  15 #include <crm/fencing/internal.h>
  16 
  17 #include <pacemaker-controld.h>
  18 
  19 static void
  20 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event);
  21 
  22 /*
  23  * stonith failure counting
  24  *
  25  * We don't want to get stuck in a permanent fencing loop. Keep track of the
  26  * number of fencing failures for each target node, and the most we'll restart a
  27  * transition for.
  28  */
  29 
  30 struct st_fail_rec {
  31     int count;
  32 };
  33 
  34 static bool fence_reaction_panic = FALSE;
  35 static unsigned long int stonith_max_attempts = 10;
  36 static GHashTable *stonith_failures = NULL;
  37 
  38 void
  39 update_stonith_max_attempts(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
  40 {
  41     stonith_max_attempts = char2score(value);
  42     if (stonith_max_attempts < 1UL) {
  43         stonith_max_attempts = 10UL;
  44     }
  45 }
  46 
  47 void
  48 set_fence_reaction(const char *reaction_s)
     /* [previous][next][first][last][top][bottom][index][help] */
  49 {
  50     if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
  51         fence_reaction_panic = TRUE;
  52 
  53     } else {
  54         if (!pcmk__str_eq(reaction_s, "stop", pcmk__str_casei)) {
  55             crm_warn("Invalid value '%s' for %s, using 'stop'",
  56                      reaction_s, XML_CONFIG_ATTR_FENCE_REACTION);
  57         }
  58         fence_reaction_panic = FALSE;
  59     }
  60 }
  61 
  62 static gboolean
  63 too_many_st_failures(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
  64 {
  65     GHashTableIter iter;
  66     const char *key = NULL;
  67     struct st_fail_rec *value = NULL;
  68 
  69     if (stonith_failures == NULL) {
  70         return FALSE;
  71     }
  72 
  73     if (target == NULL) {
  74         g_hash_table_iter_init(&iter, stonith_failures);
  75         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
  76                (gpointer *) &value)) {
  77 
  78             if (value->count >= stonith_max_attempts) {
  79                 target = (const char*)key;
  80                 goto too_many;
  81             }
  82         }
  83     } else {
  84         value = g_hash_table_lookup(stonith_failures, target);
  85         if ((value != NULL) && (value->count >= stonith_max_attempts)) {
  86             goto too_many;
  87         }
  88     }
  89     return FALSE;
  90 
  91 too_many:
  92     crm_warn("Too many failures (%d) to fence %s, giving up",
  93              value->count, target);
  94     return TRUE;
  95 }
  96 
  97 /*!
  98  * \internal
  99  * \brief Reset a stonith fail count
 100  *
 101  * \param[in] target  Name of node to reset, or NULL for all
 102  */
 103 void
 104 st_fail_count_reset(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 105 {
 106     if (stonith_failures == NULL) {
 107         return;
 108     }
 109 
 110     if (target) {
 111         struct st_fail_rec *rec = NULL;
 112 
 113         rec = g_hash_table_lookup(stonith_failures, target);
 114         if (rec) {
 115             rec->count = 0;
 116         }
 117     } else {
 118         GHashTableIter iter;
 119         const char *key = NULL;
 120         struct st_fail_rec *rec = NULL;
 121 
 122         g_hash_table_iter_init(&iter, stonith_failures);
 123         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 124                                       (gpointer *) &rec)) {
 125             rec->count = 0;
 126         }
 127     }
 128 }
 129 
 130 static void
 131 st_fail_count_increment(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 132 {
 133     struct st_fail_rec *rec = NULL;
 134 
 135     if (stonith_failures == NULL) {
 136         stonith_failures = pcmk__strkey_table(free, free);
 137     }
 138 
 139     rec = g_hash_table_lookup(stonith_failures, target);
 140     if (rec) {
 141         rec->count++;
 142     } else {
 143         rec = malloc(sizeof(struct st_fail_rec));
 144         if(rec == NULL) {
 145             return;
 146         }
 147 
 148         rec->count = 1;
 149         g_hash_table_insert(stonith_failures, strdup(target), rec);
 150     }
 151 }
 152 
 153 /* end stonith fail count functions */
 154 
 155 
 156 static void
 157 cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
     /* [previous][next][first][last][top][bottom][index][help] */
 158                     void *user_data)
 159 {
 160     if (rc < pcmk_ok) {
 161         crm_err("Fencing update %d for %s: failed - %s (%d)",
 162                 call_id, (char *)user_data, pcmk_strerror(rc), rc);
 163         crm_log_xml_warn(msg, "Failed update");
 164         abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL);
 165 
 166     } else {
 167         crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
 168     }
 169 }
 170 
 171 static void
 172 send_stonith_update(crm_action_t *action, const char *target, const char *uuid)
     /* [previous][next][first][last][top][bottom][index][help] */
 173 {
 174     int rc = pcmk_ok;
 175     crm_node_t *peer = NULL;
 176 
 177     /* We (usually) rely on the membership layer to do node_update_cluster,
 178      * and the peer status callback to do node_update_peer, because the node
 179      * might have already rejoined before we get the stonith result here.
 180      */
 181     int flags = node_update_join | node_update_expected;
 182 
 183     /* zero out the node-status & remove all LRM status info */
 184     xmlNode *node_state = NULL;
 185 
 186     CRM_CHECK(target != NULL, return);
 187     CRM_CHECK(uuid != NULL, return);
 188 
 189     /* Make sure the membership and join caches are accurate */
 190     peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY);
 191 
 192     CRM_CHECK(peer != NULL, return);
 193 
 194     if (peer->state == NULL) {
 195         /* Usually, we rely on the membership layer to update the cluster state
 196          * in the CIB. However, if the node has never been seen, do it here, so
 197          * the node is not considered unclean.
 198          */
 199         flags |= node_update_cluster;
 200     }
 201 
 202     if (peer->uuid == NULL) {
 203         crm_info("Recording uuid '%s' for node '%s'", uuid, target);
 204         peer->uuid = strdup(uuid);
 205     }
 206 
 207     crmd_peer_down(peer, TRUE);
 208 
 209     /* Generate a node state update for the CIB */
 210     node_state = create_node_state_update(peer, flags, NULL, __func__);
 211 
 212     /* we have to mark whether or not remote nodes have already been fenced */
 213     if (peer->flags & crm_remote_node) {
 214         char *now_s = pcmk__ttoa(time(NULL));
 215 
 216         crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s);
 217         free(now_s);
 218     }
 219 
 220     /* Force our known ID */
 221     crm_xml_add(node_state, XML_ATTR_UUID, uuid);
 222 
 223     rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
 224                                     cib_quorum_override | cib_scope_local | cib_can_create);
 225 
 226     /* Delay processing the trigger until the update completes */
 227     crm_debug("Sending fencing update %d for %s", rc, target);
 228     fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated);
 229 
 230     /* Make sure it sticks */
 231     /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */
 232 
 233     controld_delete_node_state(peer->uname, controld_section_all,
 234                                cib_scope_local);
 235     free_xml(node_state);
 236     return;
 237 }
 238 
 239 /*!
 240  * \internal
 241  * \brief Abort transition due to stonith failure
 242  *
 243  * \param[in] abort_action  Whether to restart or stop transition
 244  * \param[in] target  Don't restart if this (NULL for any) has too many failures
 245  * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
 246  */
 247 static void
 248 abort_for_stonith_failure(enum transition_action abort_action,
     /* [previous][next][first][last][top][bottom][index][help] */
 249                           const char *target, xmlNode *reason)
 250 {
 251     /* If stonith repeatedly fails, we eventually give up on starting a new
 252      * transition for that reason.
 253      */
 254     if ((abort_action != tg_stop) && too_many_st_failures(target)) {
 255         abort_action = tg_stop;
 256     }
 257     abort_transition(INFINITY, abort_action, "Stonith failed", reason);
 258 }
 259 
 260 
 261 /*
 262  * stonith cleanup list
 263  *
 264  * If the DC is shot, proper notifications might not go out.
 265  * The stonith cleanup list allows the cluster to (re-)send
 266  * notifications once a new DC is elected.
 267  */
 268 
 269 static GList *stonith_cleanup_list = NULL;
 270 
 271 /*!
 272  * \internal
 273  * \brief Add a node to the stonith cleanup list
 274  *
 275  * \param[in] target  Name of node to add
 276  */
 277 void
 278 add_stonith_cleanup(const char *target) {
     /* [previous][next][first][last][top][bottom][index][help] */
 279     stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target));
 280 }
 281 
 282 /*!
 283  * \internal
 284  * \brief Remove a node from the stonith cleanup list
 285  *
 286  * \param[in] Name of node to remove
 287  */
 288 void
 289 remove_stonith_cleanup(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 290 {
 291     GList *iter = stonith_cleanup_list;
 292 
 293     while (iter != NULL) {
 294         GList *tmp = iter;
 295         char *iter_name = tmp->data;
 296 
 297         iter = iter->next;
 298         if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
 299             crm_trace("Removing %s from the cleanup list", iter_name);
 300             stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
 301             free(iter_name);
 302         }
 303     }
 304 }
 305 
 306 /*!
 307  * \internal
 308  * \brief Purge all entries from the stonith cleanup list
 309  */
 310 void
 311 purge_stonith_cleanup()
     /* [previous][next][first][last][top][bottom][index][help] */
 312 {
 313     if (stonith_cleanup_list) {
 314         GList *iter = NULL;
 315 
 316         for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 317             char *target = iter->data;
 318 
 319             crm_info("Purging %s from stonith cleanup list", target);
 320             free(target);
 321         }
 322         g_list_free(stonith_cleanup_list);
 323         stonith_cleanup_list = NULL;
 324     }
 325 }
 326 
 327 /*!
 328  * \internal
 329  * \brief Send stonith updates for all entries in cleanup list, then purge it
 330  */
 331 void
 332 execute_stonith_cleanup()
     /* [previous][next][first][last][top][bottom][index][help] */
 333 {
 334     GList *iter;
 335 
 336     for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 337         char *target = iter->data;
 338         crm_node_t *target_node = crm_get_peer(0, target);
 339         const char *uuid = crm_peer_uuid(target_node);
 340 
 341         crm_notice("Marking %s, target of a previous stonith action, as clean", target);
 342         send_stonith_update(NULL, target, uuid);
 343         free(target);
 344     }
 345     g_list_free(stonith_cleanup_list);
 346     stonith_cleanup_list = NULL;
 347 }
 348 
 349 /* end stonith cleanup list functions */
 350 
 351 
 352 /* stonith API client
 353  *
 354  * Functions that need to interact directly with the fencer via its API
 355  */
 356 
 357 static stonith_t *stonith_api = NULL;
 358 static crm_trigger_t *stonith_reconnect = NULL;
 359 static char *te_client_id = NULL;
 360 
 361 static gboolean
 362 fail_incompletable_stonith(crm_graph_t *graph)
     /* [previous][next][first][last][top][bottom][index][help] */
 363 {
 364     GList *lpc = NULL;
 365     const char *task = NULL;
 366     xmlNode *last_action = NULL;
 367 
 368     if (graph == NULL) {
 369         return FALSE;
 370     }
 371 
 372     for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
 373         GList *lpc2 = NULL;
 374         synapse_t *synapse = (synapse_t *) lpc->data;
 375 
 376         if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) {
 377             continue;
 378         }
 379 
 380         for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
 381             crm_action_t *action = (crm_action_t *) lpc2->data;
 382 
 383             if (action->type != action_type_crm || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
 384                 continue;
 385             }
 386 
 387             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
 388             if (task && pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
 389                 crm__set_graph_action_flags(action, pcmk__graph_action_failed);
 390                 last_action = action->xml;
 391                 pcmk__update_graph(graph, action);
 392                 crm_notice("Failing action %d (%s): fencer terminated",
 393                            action->id, ID(action->xml));
 394             }
 395         }
 396     }
 397 
 398     if (last_action != NULL) {
 399         crm_warn("Fencer failure resulted in unrunnable actions");
 400         abort_for_stonith_failure(tg_restart, NULL, last_action);
 401         return TRUE;
 402     }
 403 
 404     return FALSE;
 405 }
 406 
 407 static void
 408 tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
 409 {
 410     te_cleanup_stonith_history_sync(st, FALSE);
 411 
 412     if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) {
 413         crm_crit("Fencing daemon connection failed");
 414         mainloop_set_trigger(stonith_reconnect);
 415 
 416     } else {
 417         crm_info("Fencing daemon disconnected");
 418     }
 419 
 420     if (stonith_api) {
 421         /* the client API won't properly reconnect notifications
 422          * if they are still in the table - so remove them
 423          */
 424         if (stonith_api->state != stonith_disconnected) {
 425             stonith_api->cmds->disconnect(st);
 426         }
 427         stonith_api->cmds->remove_notification(stonith_api, NULL);
 428     }
 429 
 430     if (AM_I_DC) {
 431         fail_incompletable_stonith(transition_graph);
 432         trigger_graph();
 433     }
 434 }
 435 
 436 /*!
 437  * \internal
 438  * \brief Handle an event notification from the fencing API
 439  *
 440  * \param[in] st     Fencing API connection
 441  * \param[in] event  Fencing API event notification
 442  */
 443 static void
 444 handle_fence_notification(stonith_t *st, stonith_event_t *event)
     /* [previous][next][first][last][top][bottom][index][help] */
 445 {
 446     bool succeeded = true;
 447     const char *executioner = "the cluster";
 448     const char *client = "a client";
 449     const char *reason = NULL;
 450     int exec_status;
 451 
 452     if (te_client_id == NULL) {
 453         te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
 454                                          (unsigned long) getpid());
 455     }
 456 
 457     if (event == NULL) {
 458         crm_err("Notify data not found");
 459         return;
 460     }
 461 
 462     if (event->executioner != NULL) {
 463         executioner = event->executioner;
 464     }
 465     if (event->client_origin != NULL) {
 466         client = event->client_origin;
 467     }
 468 
 469     exec_status = stonith__event_execution_status(event);
 470     if ((stonith__event_exit_status(event) != CRM_EX_OK)
 471         || (exec_status != PCMK_EXEC_DONE)) {
 472         succeeded = false;
 473         if (exec_status == PCMK_EXEC_DONE) {
 474             exec_status = PCMK_EXEC_ERROR;
 475         }
 476     }
 477     reason = stonith__event_exit_reason(event);
 478 
 479     crmd_alert_fencing_op(event);
 480 
 481     if (pcmk__str_eq("on", event->action, pcmk__str_none)) {
 482         // Unfencing doesn't need special handling, just a log message
 483         if (succeeded) {
 484             crm_notice("%s was unfenced by %s at the request of %s@%s",
 485                        event->target, executioner, client, event->origin);
 486                     /* TODO: Hook up event->device */
 487         } else {
 488             crm_err("Unfencing of %s by %s failed (%s%s%s) with exit status %d",
 489                     event->target, executioner,
 490                     pcmk_exec_status_str(exec_status),
 491                     ((reason == NULL)? "" : ": "),
 492                     ((reason == NULL)? "" : reason),
 493                     stonith__event_exit_status(event));
 494         }
 495         return;
 496     }
 497 
 498     if (succeeded
 499         && pcmk__str_eq(event->target, fsa_our_uname, pcmk__str_casei)) {
 500         /* We were notified of our own fencing. Most likely, either fencing was
 501          * misconfigured, or fabric fencing that doesn't cut cluster
 502          * communication is in use.
 503          *
 504          * Either way, shutting down the local host is a good idea, to require
 505          * administrator intervention. Also, other nodes would otherwise likely
 506          * set our status to lost because of the fencing callback and discard
 507          * our subsequent election votes as "not part of our cluster".
 508          */
 509         crm_crit("We were allegedly just fenced by %s for %s!",
 510                  executioner, event->origin); // Dumps blackbox if enabled
 511         if (fence_reaction_panic) {
 512             pcmk__panic(__func__);
 513         } else {
 514             crm_exit(CRM_EX_FATAL);
 515         }
 516         return; // Should never get here
 517     }
 518 
 519     /* Update the count of fencing failures for this target, in case we become
 520      * DC later. The current DC has already updated its fail count in
 521      * tengine_stonith_callback().
 522      */
 523     if (!AM_I_DC
 524         && pcmk__str_eq(event->operation, T_STONITH_NOTIFY_FENCE,
 525                         pcmk__str_none)) {
 526 
 527         if (succeeded) {
 528             st_fail_count_reset(event->target);
 529         } else {
 530             st_fail_count_increment(event->target);
 531         }
 532     }
 533 
 534     crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s@%s: "
 535                "%s%s%s%s " CRM_XS " event=%s",
 536                event->target, (succeeded? "" : " not"),
 537                event->action, executioner, client, event->origin,
 538                (succeeded? "OK" : pcmk_exec_status_str(exec_status)),
 539                ((reason == NULL)? "" : " ("),
 540                ((reason == NULL)? "" : reason),
 541                ((reason == NULL)? "" : ")"),
 542                event->id);
 543 
 544     if (succeeded) {
 545         crm_node_t *peer = pcmk__search_known_node_cache(0, event->target,
 546                                                          CRM_GET_PEER_ANY);
 547         const char *uuid = NULL;
 548 
 549         if (peer == NULL) {
 550             return;
 551         }
 552 
 553         uuid = crm_peer_uuid(peer);
 554 
 555         if (AM_I_DC) {
 556             /* The DC always sends updates */
 557             send_stonith_update(NULL, event->target, uuid);
 558 
 559             /* @TODO Ideally, at this point, we'd check whether the fenced node
 560              * hosted any guest nodes, and call remote_node_down() for them.
 561              * Unfortunately, the controller doesn't have a simple, reliable way
 562              * to map hosts to guests. It might be possible to track this in the
 563              * peer cache via crm_remote_peer_cache_refresh(). For now, we rely
 564              * on the scheduler creating fence pseudo-events for the guests.
 565              */
 566 
 567             if (!pcmk__str_eq(client, te_client_id, pcmk__str_casei)) {
 568                 /* Abort the current transition if it wasn't the cluster that
 569                  * initiated fencing.
 570                  */
 571                 crm_info("External fencing operation from %s fenced %s",
 572                          client, event->target);
 573                 abort_transition(INFINITY, tg_restart,
 574                                  "External Fencing Operation", NULL);
 575             }
 576 
 577             /* Assume it was our leader if we don't currently have one */
 578         } else if (pcmk__str_eq(fsa_our_dc, event->target,
 579                                 pcmk__str_null_matches|pcmk__str_casei)
 580                    && !pcmk_is_set(peer->flags, crm_remote_node)) {
 581 
 582             crm_notice("Fencing target %s %s our leader",
 583                        event->target, (fsa_our_dc? "was" : "may have been"));
 584 
 585             /* Given the CIB resyncing that occurs around elections,
 586              * have one node update the CIB now and, if the new DC is different,
 587              * have them do so too after the election
 588              */
 589             if (pcmk__str_eq(event->executioner, fsa_our_uname,
 590                              pcmk__str_casei)) {
 591                 send_stonith_update(NULL, event->target, uuid);
 592             }
 593             add_stonith_cleanup(event->target);
 594         }
 595 
 596         /* If the target is a remote node, and we host its connection,
 597          * immediately fail all monitors so it can be recovered quickly.
 598          * The connection won't necessarily drop when a remote node is fenced,
 599          * so the failure might not otherwise be detected until the next poke.
 600          */
 601         if (pcmk_is_set(peer->flags, crm_remote_node)) {
 602             remote_ra_fail(event->target);
 603         }
 604 
 605         crmd_peer_down(peer, TRUE);
 606      }
 607 }
 608 
 609 /*!
 610  * \brief Connect to fencer
 611  *
 612  * \param[in] user_data  If NULL, retry failures now, otherwise retry in main loop
 613  *
 614  * \return TRUE
 615  * \note If user_data is NULL, this will wait 2s between attempts, for up to
 616  *       30 attempts, meaning the controller could be blocked as long as 58s.
 617  */
 618 static gboolean
 619 te_connect_stonith(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 620 {
 621     int rc = pcmk_ok;
 622 
 623     if (stonith_api == NULL) {
 624         stonith_api = stonith_api_new();
 625         if (stonith_api == NULL) {
 626             crm_err("Could not connect to fencer: API memory allocation failed");
 627             return TRUE;
 628         }
 629     }
 630 
 631     if (stonith_api->state != stonith_disconnected) {
 632         crm_trace("Already connected to fencer, no need to retry");
 633         return TRUE;
 634     }
 635 
 636     if (user_data == NULL) {
 637         // Blocking (retry failures now until successful)
 638         rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
 639         if (rc != pcmk_ok) {
 640             crm_err("Could not connect to fencer in 30 attempts: %s "
 641                     CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 642         }
 643     } else {
 644         // Non-blocking (retry failures later in main loop)
 645         rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
 646         if (rc != pcmk_ok) {
 647             if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) {
 648                 crm_notice("Fencer connection failed (will retry): %s "
 649                            CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 650                 mainloop_set_trigger(stonith_reconnect);
 651             } else {
 652                 crm_info("Fencer connection failed (ignoring because no longer required): %s "
 653                          CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 654             }
 655             return TRUE;
 656         }
 657     }
 658 
 659     if (rc == pcmk_ok) {
 660         stonith_api->cmds->register_notification(stonith_api,
 661                                                  T_STONITH_NOTIFY_DISCONNECT,
 662                                                  tengine_stonith_connection_destroy);
 663         stonith_api->cmds->register_notification(stonith_api,
 664                                                  T_STONITH_NOTIFY_FENCE,
 665                                                  handle_fence_notification);
 666         stonith_api->cmds->register_notification(stonith_api,
 667                                                  T_STONITH_NOTIFY_HISTORY_SYNCED,
 668                                                  tengine_stonith_history_synced);
 669         te_trigger_stonith_history_sync(TRUE);
 670         crm_notice("Fencer successfully connected");
 671     }
 672 
 673     return TRUE;
 674 }
 675 
 676 /*!
 677     \internal
 678     \brief Schedule fencer connection attempt in main loop
 679 */
 680 void
 681 controld_trigger_fencer_connect()
     /* [previous][next][first][last][top][bottom][index][help] */
 682 {
 683     if (stonith_reconnect == NULL) {
 684         stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW,
 685                                                  te_connect_stonith,
 686                                                  GINT_TO_POINTER(TRUE));
 687     }
 688     controld_set_fsa_input_flags(R_ST_REQUIRED);
 689     mainloop_set_trigger(stonith_reconnect);
 690 }
 691 
 692 void
 693 controld_disconnect_fencer(bool destroy)
     /* [previous][next][first][last][top][bottom][index][help] */
 694 {
 695     if (stonith_api) {
 696         // Prevent fencer connection from coming up again
 697         controld_clear_fsa_input_flags(R_ST_REQUIRED);
 698 
 699         if (stonith_api->state != stonith_disconnected) {
 700             stonith_api->cmds->disconnect(stonith_api);
 701         }
 702         stonith_api->cmds->remove_notification(stonith_api, NULL);
 703     }
 704     if (destroy) {
 705         if (stonith_api) {
 706             stonith_api->cmds->free(stonith_api);
 707             stonith_api = NULL;
 708         }
 709         if (stonith_reconnect) {
 710             mainloop_destroy_trigger(stonith_reconnect);
 711             stonith_reconnect = NULL;
 712         }
 713         if (te_client_id) {
 714             free(te_client_id);
 715             te_client_id = NULL;
 716         }
 717     }
 718 }
 719 
 720 static gboolean
 721 do_stonith_history_sync(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 722 {
 723     if (stonith_api && (stonith_api->state != stonith_disconnected)) {
 724         stonith_history_t *history = NULL;
 725 
 726         te_cleanup_stonith_history_sync(stonith_api, FALSE);
 727         stonith_api->cmds->history(stonith_api,
 728                                    st_opt_sync_call | st_opt_broadcast,
 729                                    NULL, &history, 5);
 730         stonith_history_free(history);
 731         return TRUE;
 732     } else {
 733         crm_info("Skip triggering stonith history-sync as stonith is disconnected");
 734         return FALSE;
 735     }
 736 }
 737 
 738 static void
 739 tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 740 {
 741     char *uuid = NULL;
 742     int stonith_id = -1;
 743     int transition_id = -1;
 744     crm_action_t *action = NULL;
 745     const char *target = NULL;
 746 
 747     if ((data == NULL) || (data->userdata == NULL)) {
 748         crm_err("Ignoring fence operation %d result: "
 749                 "No transition key given (bug?)",
 750                 ((data == NULL)? -1 : data->call_id));
 751         return;
 752     }
 753 
 754     if (!AM_I_DC) {
 755         const char *reason = stonith__exit_reason(data);
 756 
 757         if (reason == NULL) {
 758            reason = pcmk_exec_status_str(stonith__execution_status(data));
 759         }
 760         crm_notice("Result of fence operation %d: %d (%s) " CRM_XS " key=%s",
 761                    data->call_id, stonith__exit_status(data), reason,
 762                    (const char *) data->userdata);
 763         return;
 764     }
 765 
 766     CRM_CHECK(decode_transition_key(data->userdata, &uuid, &transition_id,
 767                                     &stonith_id, NULL),
 768               goto bail);
 769 
 770     if (transition_graph->complete || (stonith_id < 0)
 771         || !pcmk__str_eq(uuid, te_uuid, pcmk__str_none)
 772         || (transition_graph->id != transition_id)) {
 773         crm_info("Ignoring fence operation %d result: "
 774                  "Not from current transition " CRM_XS
 775                  " complete=%s action=%d uuid=%s (vs %s) transition=%d (vs %d)",
 776                  data->call_id, pcmk__btoa(transition_graph->complete),
 777                  stonith_id, uuid, te_uuid, transition_id, transition_graph->id);
 778         goto bail;
 779     }
 780 
 781     action = controld_get_action(stonith_id);
 782     if (action == NULL) {
 783         crm_err("Ignoring fence operation %d result: "
 784                 "Action %d not found in transition graph (bug?) "
 785                 CRM_XS " uuid=%s transition=%d",
 786                 data->call_id, stonith_id, uuid, transition_id);
 787         goto bail;
 788     }
 789 
 790     target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 791     if (target == NULL) {
 792         crm_err("Ignoring fence operation %d result: No target given (bug?)",
 793                 data->call_id);
 794         goto bail;
 795     }
 796 
 797     stop_te_timer(action->timer);
 798     if (stonith__exit_status(data) == CRM_EX_OK) {
 799         const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 800         const char *op = crm_meta_value(action->params, "stonith_action");
 801 
 802         crm_notice("Fence operation %d for %s passed", data->call_id, target);
 803         if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
 804             te_action_confirmed(action, NULL);
 805             if (pcmk__str_eq("on", op, pcmk__str_casei)) {
 806                 const char *value = NULL;
 807                 char *now = pcmk__ttoa(time(NULL));
 808                 gboolean is_remote_node = FALSE;
 809 
 810                 /* This check is not 100% reliable, since this node is not
 811                  * guaranteed to have the remote node cached. However, it
 812                  * doesn't have to be reliable, since the attribute manager can
 813                  * learn a node's "remoteness" by other means sooner or later.
 814                  * This allows it to learn more quickly if this node does have
 815                  * the information.
 816                  */
 817                 if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) {
 818                     is_remote_node = TRUE;
 819                 }
 820 
 821                 update_attrd(target, CRM_ATTR_UNFENCED, now, NULL,
 822                              is_remote_node);
 823                 free(now);
 824 
 825                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
 826                 update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL,
 827                              is_remote_node);
 828 
 829                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
 830                 update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL,
 831                              is_remote_node);
 832 
 833             } else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) {
 834                 send_stonith_update(action, target, uuid);
 835                 crm__set_graph_action_flags(action, pcmk__graph_action_sent_update);
 836             }
 837         }
 838         st_fail_count_reset(target);
 839 
 840     } else {
 841         enum transition_action abort_action = tg_restart;
 842         int status = stonith__execution_status(data);
 843         const char *reason = stonith__exit_reason(data);
 844 
 845         if (reason == NULL) {
 846             if (status == PCMK_EXEC_DONE) {
 847                 reason = "Agent returned error";
 848             } else {
 849                 reason = pcmk_exec_status_str(status);
 850             }
 851         }
 852         crm__set_graph_action_flags(action, pcmk__graph_action_failed);
 853 
 854         /* If no fence devices were available, there's no use in immediately
 855          * checking again, so don't start a new transition in that case.
 856          */
 857         if (status == PCMK_EXEC_NO_FENCE_DEVICE) {
 858             crm_warn("Fence operation %d for %s failed: %s "
 859                      "(aborting transition and giving up for now)",
 860                      data->call_id, target, reason);
 861             abort_action = tg_stop;
 862         } else {
 863             crm_notice("Fence operation %d for %s failed: %s "
 864                        "(aborting transition)", data->call_id, target, reason);
 865         }
 866 
 867         /* Increment the fail count now, so abort_for_stonith_failure() can
 868          * check it. Non-DC nodes will increment it in
 869          * handle_fence_notification().
 870          */
 871         st_fail_count_increment(target);
 872         abort_for_stonith_failure(abort_action, target, NULL);
 873     }
 874 
 875     pcmk__update_graph(transition_graph, action);
 876     trigger_graph();
 877 
 878   bail:
 879     free(data->userdata);
 880     free(uuid);
 881     return;
 882 }
 883 
 884 static int
 885 fence_with_delay(const char *target, const char *type, const char *delay)
     /* [previous][next][first][last][top][bottom][index][help] */
 886 {
 887     uint32_t options = st_opt_none; // Group of enum stonith_call_options
 888     int timeout_sec = (int) (transition_graph->stonith_timeout / 1000);
 889     int delay_i;
 890 
 891     if (crmd_join_phase_count(crm_join_confirmed) == 1) {
 892         stonith__set_call_options(options, target, st_opt_allow_suicide);
 893     }
 894     pcmk__scan_min_int(delay, &delay_i, 0);
 895     return stonith_api->cmds->fence_with_delay(stonith_api, options, target,
 896                                                type, timeout_sec, 0, delay_i);
 897 }
 898 
 899 gboolean
 900 te_fence_node(crm_graph_t *graph, crm_action_t *action)
     /* [previous][next][first][last][top][bottom][index][help] */
 901 {
 902     int rc = 0;
 903     const char *id = NULL;
 904     const char *uuid = NULL;
 905     const char *target = NULL;
 906     const char *type = NULL;
 907     char *transition_key = NULL;
 908     const char *priority_delay = NULL;
 909     gboolean invalid_action = FALSE;
 910 
 911     id = ID(action->xml);
 912     target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 913     uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 914     type = crm_meta_value(action->params, "stonith_action");
 915 
 916     CRM_CHECK(id != NULL, invalid_action = TRUE);
 917     CRM_CHECK(uuid != NULL, invalid_action = TRUE);
 918     CRM_CHECK(type != NULL, invalid_action = TRUE);
 919     CRM_CHECK(target != NULL, invalid_action = TRUE);
 920 
 921     if (invalid_action) {
 922         crm_log_xml_warn(action->xml, "BadAction");
 923         return FALSE;
 924     }
 925 
 926     priority_delay = crm_meta_value(action->params, XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);
 927 
 928     crm_notice("Requesting fencing (%s) of node %s "
 929                CRM_XS " action=%s timeout=%u%s%s",
 930                type, target, id, transition_graph->stonith_timeout,
 931                priority_delay ? " priority_delay=" : "",
 932                priority_delay ? priority_delay : "");
 933 
 934     /* Passing NULL means block until we can connect... */
 935     te_connect_stonith(NULL);
 936 
 937     rc = fence_with_delay(target, type, priority_delay);
 938     transition_key = pcmk__transition_key(transition_graph->id, action->id, 0,
 939                                           te_uuid),
 940     stonith_api->cmds->register_callback(stonith_api, rc,
 941                                          (int) (transition_graph->stonith_timeout / 1000),
 942                                          st_opt_timeout_updates, transition_key,
 943                                          "tengine_stonith_callback", tengine_stonith_callback);
 944 
 945     return TRUE;
 946 }
 947 
 948 bool
 949 controld_verify_stonith_watchdog_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
 950 {
 951     gboolean rv = TRUE;
 952 
 953     if (stonith_api && (stonith_api->state != stonith_disconnected) &&
 954         stonith__watchdog_fencing_enabled_for_node_api(stonith_api,
 955                                                        fsa_our_uname)) {
 956         rv = pcmk__valid_sbd_timeout(value);
 957     }
 958     return rv;
 959 }
 960 
 961 /* end stonith API client functions */
 962 
 963 
 964 /*
 965  * stonith history synchronization
 966  *
 967  * Each node's fencer keeps track of a cluster-wide fencing history. When a node
 968  * joins or leaves, we need to synchronize the history across all nodes.
 969  */
 970 
 971 static crm_trigger_t *stonith_history_sync_trigger = NULL; // created on first te_trigger_stonith_history_sync() call; runs do_stonith_history_sync
 972 static mainloop_timer_t *stonith_history_sync_timer_short = NULL; // "history_sync_short" timer (5000), created on demand; sets the trigger above
 973 static mainloop_timer_t *stonith_history_sync_timer_long = NULL; // "history_sync_long" timer (30000), created on demand; sets the trigger above
 974 
 975 void
 976 te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
     /* [previous][next][first][last][top][bottom][index][help] */
 977 {
 978     if (free_timers) {
 979         mainloop_timer_del(stonith_history_sync_timer_short);
 980         stonith_history_sync_timer_short = NULL;
 981         mainloop_timer_del(stonith_history_sync_timer_long);
 982         stonith_history_sync_timer_long = NULL;
 983     } else {
 984         mainloop_timer_stop(stonith_history_sync_timer_short);
 985         mainloop_timer_stop(stonith_history_sync_timer_long);
 986     }
 987 
 988     if (st) {
 989         st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED);
 990     }
 991 }
 992 
 993 static void
 994 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event)
     /* [previous][next][first][last][top][bottom][index][help] */
 995 {
 996     te_cleanup_stonith_history_sync(st, FALSE);
 997     crm_debug("Fence-history synced - cancel all timers");
 998 }
 999 
1000 static gboolean
1001 stonith_history_sync_set_trigger(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
1002 {
1003     mainloop_set_trigger(stonith_history_sync_trigger);
1004     return FALSE;
1005 }
1006 
1007 void
1008 te_trigger_stonith_history_sync(bool long_timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
1009 {
1010     /* trigger a sync in 5s to give more nodes the
1011      * chance to show up so that we don't create
1012      * unnecessary stonith-history-sync traffic
1013      *
1014      * the long timeout of 30s is there as a fallback
1015      * so that after a successful connection to fenced
1016      * we will wait for 30s for the DC to trigger a
1017      * history-sync
1018      * if this doesn't happen we trigger a sync locally
1019      * (e.g. fenced segfaults and is restarted by pacemakerd)
1020      */
1021 
1022     /* as we are finally checking the stonith-connection
1023      * in do_stonith_history_sync we should be fine
1024      * leaving stonith_history_sync_time & stonith_history_sync_trigger
1025      * around
1026      */
1027     if (stonith_history_sync_trigger == NULL) {
1028         stonith_history_sync_trigger =
1029             mainloop_add_trigger(G_PRIORITY_LOW,
1030                                  do_stonith_history_sync, NULL);
1031     }
1032 
1033     if (long_timeout) {
1034         if(stonith_history_sync_timer_long == NULL) {
1035             stonith_history_sync_timer_long =
1036                 mainloop_timer_add("history_sync_long", 30000,
1037                                    FALSE, stonith_history_sync_set_trigger,
1038                                    NULL);
1039         }
1040         crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
1041         mainloop_timer_start(stonith_history_sync_timer_long);
1042     } else {
1043         if(stonith_history_sync_timer_short == NULL) {
1044             stonith_history_sync_timer_short =
1045                 mainloop_timer_add("history_sync_short", 5000,
1046                                    FALSE, stonith_history_sync_set_trigger,
1047                                    NULL);
1048         }
1049         crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
1050         mainloop_timer_start(stonith_history_sync_timer_short);
1051     }
1052 
1053 }
1054 
1055 /* end stonith history synchronization functions */

/* [previous][next][first][last][top][bottom][index][help] */