root/daemons/controld/controld_fencing.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. update_stonith_max_attempts
  2. set_fence_reaction
  3. too_many_st_failures
  4. st_fail_count_reset
  5. st_fail_count_increment
  6. cib_fencing_updated
  7. send_stonith_update
  8. abort_for_stonith_failure
  9. add_stonith_cleanup
  10. remove_stonith_cleanup
  11. purge_stonith_cleanup
  12. execute_stonith_cleanup
  13. fail_incompletable_stonith
  14. tengine_stonith_connection_destroy
  15. tengine_stonith_notify
  16. te_connect_stonith
  17. controld_trigger_fencer_connect
  18. controld_disconnect_fencer
  19. do_stonith_history_sync
  20. tengine_stonith_callback
  21. fence_with_delay
  22. te_fence_node
  23. te_cleanup_stonith_history_sync
  24. tengine_stonith_history_synced
  25. stonith_history_sync_set_trigger
  26. te_trigger_stonith_history_sync

   1 /*
   2  * Copyright 2004-2020 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 #include <crm/crm.h>
  12 #include <crm/msg_xml.h>
  13 #include <crm/common/xml.h>
  14 #include <crm/fencing/internal.h>
  15 
  16 #include <pacemaker-controld.h>
  17 
  18 static void
  19 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event);
  20 
  21 /*
  22  * stonith failure counting
  23  *
  24  * We don't want to get stuck in a permanent fencing loop. Keep track of the
  25  * number of fencing failures for each target node, and the most we'll restart a
  26  * transition for.
  27  */
  28 
  29 struct st_fail_rec {
  30     int count; // Fencing failures recorded for one target (see st_fail_count_increment())
  31 };
  32 
  33 static bool fence_reaction_panic = FALSE; // Set by set_fence_reaction(); selects panic vs. exit when notified of our own fencing
  34 static unsigned long int stonith_max_attempts = 10; // Fail-count threshold checked by too_many_st_failures()
  35 static GHashTable *stonith_failures = NULL; // Target name -> struct st_fail_rec; created on first recorded failure
  36 
  37 // crmd_opts defines default for stonith-max-attempts, so value is never NULL
  38 void
  39 update_stonith_max_attempts(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
  40 {
  41     if (pcmk__str_eq(value, CRM_INFINITY_S, pcmk__str_casei)) {
  42        stonith_max_attempts = CRM_SCORE_INFINITY;
  43     } else {
  44        stonith_max_attempts = (unsigned long int) crm_parse_ll(value, NULL);
  45     }
  46 }
  47 
  48 void
  49 set_fence_reaction(const char *reaction_s)
     /* [previous][next][first][last][top][bottom][index][help] */
  50 {
  51     if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
  52         fence_reaction_panic = TRUE;
  53 
  54     } else {
  55         if (!pcmk__str_eq(reaction_s, "stop", pcmk__str_casei)) {
  56             crm_warn("Invalid value '%s' for %s, using 'stop'",
  57                      reaction_s, XML_CONFIG_ATTR_FENCE_REACTION);
  58         }
  59         fence_reaction_panic = FALSE;
  60     }
  61 }
  62 
  63 static gboolean
  64 too_many_st_failures(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
  65 {
  66     GHashTableIter iter;
  67     const char *key = NULL;
  68     struct st_fail_rec *value = NULL;
  69 
  70     if (stonith_failures == NULL) {
  71         return FALSE;
  72     }
  73 
  74     if (target == NULL) {
  75         g_hash_table_iter_init(&iter, stonith_failures);
  76         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
  77                (gpointer *) &value)) {
  78 
  79             if (value->count >= stonith_max_attempts) {
  80                 target = (const char*)key;
  81                 goto too_many;
  82             }
  83         }
  84     } else {
  85         value = g_hash_table_lookup(stonith_failures, target);
  86         if ((value != NULL) && (value->count >= stonith_max_attempts)) {
  87             goto too_many;
  88         }
  89     }
  90     return FALSE;
  91 
  92 too_many:
  93     crm_warn("Too many failures (%d) to fence %s, giving up",
  94              value->count, target);
  95     return TRUE;
  96 }
  97 
  98 /*!
  99  * \internal
 100  * \brief Reset a stonith fail count
 101  *
 102  * \param[in] target  Name of node to reset, or NULL for all
 103  */
 104 void
 105 st_fail_count_reset(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 106 {
 107     if (stonith_failures == NULL) {
 108         return;
 109     }
 110 
 111     if (target) {
 112         struct st_fail_rec *rec = NULL;
 113 
 114         rec = g_hash_table_lookup(stonith_failures, target);
 115         if (rec) {
 116             rec->count = 0;
 117         }
 118     } else {
 119         GHashTableIter iter;
 120         const char *key = NULL;
 121         struct st_fail_rec *rec = NULL;
 122 
 123         g_hash_table_iter_init(&iter, stonith_failures);
 124         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 125                                       (gpointer *) &rec)) {
 126             rec->count = 0;
 127         }
 128     }
 129 }
 130 
 131 static void
 132 st_fail_count_increment(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 133 {
 134     struct st_fail_rec *rec = NULL;
 135 
 136     if (stonith_failures == NULL) {
 137         stonith_failures = crm_str_table_new();
 138     }
 139 
 140     rec = g_hash_table_lookup(stonith_failures, target);
 141     if (rec) {
 142         rec->count++;
 143     } else {
 144         rec = malloc(sizeof(struct st_fail_rec));
 145         if(rec == NULL) {
 146             return;
 147         }
 148 
 149         rec->count = 1;
 150         g_hash_table_insert(stonith_failures, strdup(target), rec);
 151     }
 152 }
 153 
 154 /* end stonith fail count functions */
 155 
 156 
 157 static void
 158 cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
     /* [previous][next][first][last][top][bottom][index][help] */
 159                     void *user_data)
 160 {
 161     if (rc < pcmk_ok) {
 162         crm_err("Fencing update %d for %s: failed - %s (%d)",
 163                 call_id, (char *)user_data, pcmk_strerror(rc), rc);
 164         crm_log_xml_warn(msg, "Failed update");
 165         abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL);
 166 
 167     } else {
 168         crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
 169     }
 170 }
 171 
 172 static void
 173 send_stonith_update(crm_action_t *action, const char *target, const char *uuid)
     /* [previous][next][first][last][top][bottom][index][help] */
 174 {
 175     int rc = pcmk_ok;
 176     crm_node_t *peer = NULL;
 177 
 178     /* We (usually) rely on the membership layer to do node_update_cluster,
 179      * and the peer status callback to do node_update_peer, because the node
 180      * might have already rejoined before we get the stonith result here.
 181      */
 182     int flags = node_update_join | node_update_expected;
 183 
 184     /* zero out the node-status & remove all LRM status info */
 185     xmlNode *node_state = NULL;
 186 
 187     CRM_CHECK(target != NULL, return);
 188     CRM_CHECK(uuid != NULL, return);
 189 
 190     /* Make sure the membership and join caches are accurate */
 191     peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY);
 192 
 193     CRM_CHECK(peer != NULL, return);
 194 
 195     if (peer->state == NULL) {
 196         /* Usually, we rely on the membership layer to update the cluster state
 197          * in the CIB. However, if the node has never been seen, do it here, so
 198          * the node is not considered unclean.
 199          */
 200         flags |= node_update_cluster;
 201     }
 202 
 203     if (peer->uuid == NULL) {
 204         crm_info("Recording uuid '%s' for node '%s'", uuid, target);
 205         peer->uuid = strdup(uuid);
 206     }
 207 
 208     crmd_peer_down(peer, TRUE);
 209 
 210     /* Generate a node state update for the CIB */
 211     node_state = create_node_state_update(peer, flags, NULL, __func__);
 212 
 213     /* we have to mark whether or not remote nodes have already been fenced */
 214     if (peer->flags & crm_remote_node) {
 215         time_t now = time(NULL);
 216         char *now_s = crm_itoa(now);
 217         crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s);
 218         free(now_s);
 219     }
 220 
 221     /* Force our known ID */
 222     crm_xml_add(node_state, XML_ATTR_UUID, uuid);
 223 
 224     rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
 225                                     cib_quorum_override | cib_scope_local | cib_can_create);
 226 
 227     /* Delay processing the trigger until the update completes */
 228     crm_debug("Sending fencing update %d for %s", rc, target);
 229     fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated);
 230 
 231     /* Make sure it sticks */
 232     /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */
 233 
 234     controld_delete_node_state(peer->uname, controld_section_all,
 235                                cib_scope_local);
 236     free_xml(node_state);
 237     return;
 238 }
 239 
 240 /*!
 241  * \internal
 242  * \brief Abort transition due to stonith failure
 243  *
 244  * \param[in] abort_action  Whether to restart or stop transition
 245  * \param[in] target  Don't restart if this (NULL for any) has too many failures
 246  * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
 247  */
 248 static void
 249 abort_for_stonith_failure(enum transition_action abort_action,
     /* [previous][next][first][last][top][bottom][index][help] */
 250                           const char *target, xmlNode *reason)
 251 {
 252     /* If stonith repeatedly fails, we eventually give up on starting a new
 253      * transition for that reason.
 254      */
 255     if ((abort_action != tg_stop) && too_many_st_failures(target)) {
 256         abort_action = tg_stop;
 257     }
 258     abort_transition(INFINITY, abort_action, "Stonith failed", reason);
 259 }
 260 
 261 
 262 /*
 263  * stonith cleanup list
 264  *
 265  * If the DC is shot, proper notifications might not go out.
 266  * The stonith cleanup list allows the cluster to (re-)send
 267  * notifications once a new DC is elected.
 268  */
 269 
 270 static GListPtr stonith_cleanup_list = NULL; // Node names copied with strdup(); each is freed when its entry is removed
 271 
 272 /*!
 273  * \internal
 274  * \brief Add a node to the stonith cleanup list
 275  *
 276  * \param[in] target  Name of node to add
 277  */
 278 void
 279 add_stonith_cleanup(const char *target) {
     /* [previous][next][first][last][top][bottom][index][help] */
 280     stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target));
 281 }
 282 
 283 /*!
 284  * \internal
 285  * \brief Remove a node from the stonith cleanup list
 286  *
 287  * \param[in] Name of node to remove
 288  */
 289 void
 290 remove_stonith_cleanup(const char *target)
     /* [previous][next][first][last][top][bottom][index][help] */
 291 {
 292     GListPtr iter = stonith_cleanup_list;
 293 
 294     while (iter != NULL) {
 295         GListPtr tmp = iter;
 296         char *iter_name = tmp->data;
 297 
 298         iter = iter->next;
 299         if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
 300             crm_trace("Removing %s from the cleanup list", iter_name);
 301             stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
 302             free(iter_name);
 303         }
 304     }
 305 }
 306 
 307 /*!
 308  * \internal
 309  * \brief Purge all entries from the stonith cleanup list
 310  */
 311 void
 312 purge_stonith_cleanup()
     /* [previous][next][first][last][top][bottom][index][help] */
 313 {
 314     if (stonith_cleanup_list) {
 315         GListPtr iter = NULL;
 316 
 317         for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 318             char *target = iter->data;
 319 
 320             crm_info("Purging %s from stonith cleanup list", target);
 321             free(target);
 322         }
 323         g_list_free(stonith_cleanup_list);
 324         stonith_cleanup_list = NULL;
 325     }
 326 }
 327 
 328 /*!
 329  * \internal
 330  * \brief Send stonith updates for all entries in cleanup list, then purge it
 331  */
 332 void
 333 execute_stonith_cleanup()
     /* [previous][next][first][last][top][bottom][index][help] */
 334 {
 335     GListPtr iter;
 336 
 337     for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 338         char *target = iter->data;
 339         crm_node_t *target_node = crm_get_peer(0, target);
 340         const char *uuid = crm_peer_uuid(target_node);
 341 
 342         crm_notice("Marking %s, target of a previous stonith action, as clean", target);
 343         send_stonith_update(NULL, target, uuid);
 344         free(target);
 345     }
 346     g_list_free(stonith_cleanup_list);
 347     stonith_cleanup_list = NULL;
 348 }
 349 
 350 /* end stonith cleanup list functions */
 351 
 352 
 353 /* stonith API client
 354  *
 355  * Functions that need to interact directly with the fencer via its API
 356  */
 357 
 358 static stonith_t *stonith_api = NULL; // Fencer API object, allocated on first te_connect_stonith() call
 359 static crm_trigger_t *stonith_reconnect = NULL; // Main loop trigger that runs te_connect_stonith()
 360 static char *te_client_id = NULL; // "<crm_system_name>.<pid>", built on first fencing notification
 361 
 362 static gboolean
 363 fail_incompletable_stonith(crm_graph_t *graph)
     /* [previous][next][first][last][top][bottom][index][help] */
 364 {
 365     GListPtr lpc = NULL;
 366     const char *task = NULL;
 367     xmlNode *last_action = NULL;
 368 
 369     if (graph == NULL) {
 370         return FALSE;
 371     }
 372 
 373     for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
 374         GListPtr lpc2 = NULL;
 375         synapse_t *synapse = (synapse_t *) lpc->data;
 376 
 377         if (synapse->confirmed) {
 378             continue;
 379         }
 380 
 381         for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
 382             crm_action_t *action = (crm_action_t *) lpc2->data;
 383 
 384             if (action->type != action_type_crm || action->confirmed) {
 385                 continue;
 386             }
 387 
 388             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
 389             if (task && pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
 390                 action->failed = TRUE;
 391                 last_action = action->xml;
 392                 update_graph(graph, action);
 393                 crm_notice("Failing action %d (%s): fencer terminated",
 394                            action->id, ID(action->xml));
 395             }
 396         }
 397     }
 398 
 399     if (last_action != NULL) {
 400         crm_warn("Fencer failure resulted in unrunnable actions");
 401         abort_for_stonith_failure(tg_restart, NULL, last_action);
 402         return TRUE;
 403     }
 404 
 405     return FALSE;
 406 }
 407 
 408 static void
 409 tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
 410 {
 411     te_cleanup_stonith_history_sync(st, FALSE);
 412 
 413     if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) {
 414         crm_crit("Fencing daemon connection failed");
 415         mainloop_set_trigger(stonith_reconnect);
 416 
 417     } else {
 418         crm_info("Fencing daemon disconnected");
 419     }
 420 
 421     if (stonith_api) {
 422         /* the client API won't properly reconnect notifications
 423          * if they are still in the table - so remove them
 424          */
 425         if (stonith_api->state != stonith_disconnected) {
 426             stonith_api->cmds->disconnect(st);
 427         }
 428         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT);
 429         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE);
 430         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED);
 431     }
 432 
 433     if (AM_I_DC) {
 434         fail_incompletable_stonith(transition_graph);
 435         trigger_graph();
 436     }
 437 }
 438 
 439 static void
 440 tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event)
     /* [previous][next][first][last][top][bottom][index][help] */
 441 {
         /* Handle a fencing result notification (registered for
          * T_STONITH_NOTIFY_FENCE in te_connect_stonith()).
          *
          * st is not used here; st_event describes the fencing operation and
          * its result. Unfencing ("on") results are only logged. A successful
          * fencing of the local node makes this process panic or exit. For
          * other targets, the local fail count is updated when not DC, and on
          * success the peer is marked down and a stonith update may be sent.
          */

         /* Built on first use; compared with st_event->client_origin below to
          * recognize fencing that was not requested by this controller.
          */
 442     if (te_client_id == NULL) {
 443         te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
 444                                          (unsigned long) getpid());
 445     }
 446 
 447     if (st_event == NULL) {
 448         crm_err("Notify data not found");
 449         return;
 450     }
 451 
 452     crmd_alert_fencing_op(st_event);
 453 
         /* Unfencing results (action "on") are logged and need no further
          * processing here.
          */
 454     if ((st_event->result == pcmk_ok) && pcmk__str_eq("on", st_event->action, pcmk__str_casei)) {
 455         crm_notice("%s was successfully unfenced by %s (at the request of %s)",
 456                    st_event->target,
 457                    st_event->executioner? st_event->executioner : "<anyone>",
 458                    st_event->origin);
 459                 /* TODO: Hook up st_event->device */
 460         return;
 461 
 462     } else if (pcmk__str_eq("on", st_event->action, pcmk__str_casei)) {
 463         crm_err("Unfencing of %s by %s failed: %s (%d)",
 464                 st_event->target,
 465                 st_event->executioner? st_event->executioner : "<anyone>",
 466                 pcmk_strerror(st_event->result), st_event->result);
 467         return;
 468 
 469     } else if ((st_event->result == pcmk_ok)
 470                && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_none)) {
 471 
 472         /* We were notified of our own fencing. Most likely, either fencing was
 473          * misconfigured, or fabric fencing that doesn't cut cluster
 474          * communication is in use.
 475          *
 476          * Either way, shutting down the local host is a good idea, to require
 477          * administrator intervention. Also, other nodes would otherwise likely
 478          * set our status to lost because of the fencing callback and discard
 479          * our subsequent election votes as "not part of our cluster".
 480          */
 481         crm_crit("We were allegedly just fenced by %s for %s!",
 482                  st_event->executioner? st_event->executioner : "the cluster",
 483                  st_event->origin); /* Dumps blackbox if enabled */
 484         if (fence_reaction_panic) {
 485             pcmk__panic(__func__);
 486         } else {
 487             crm_exit(CRM_EX_FATAL);
 488         }
 489         return;
 490     }
 491 
 492     /* Update the count of stonith failures for this target, in case we become
 493      * DC later. The current DC has already updated its fail count in
 494      * tengine_stonith_callback().
 495      */
 496     if (!AM_I_DC && pcmk__str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) {
 497         if (st_event->result == pcmk_ok) {
 498             st_fail_count_reset(st_event->target);
 499         } else {
 500             st_fail_count_increment(st_event->target);
 501         }
 502     }
 503 
 504     crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s "
 505                CRM_XS " initiator=%s ref=%s",
 506                st_event->target, st_event->result == pcmk_ok ? "" : " not",
 507                st_event->action,
 508                st_event->executioner ? st_event->executioner : "<anyone>",
 509                (st_event->client_origin? st_event->client_origin : "<unknown>"),
 510                pcmk_strerror(st_event->result),
 511                st_event->origin, st_event->id);
 512 
         /* Only a successful fencing changes peer state; failures were fully
          * handled by the logging and fail-count update above.
          */
 513     if (st_event->result == pcmk_ok) {
 514         crm_node_t *peer = crm_find_known_peer_full(0, st_event->target, CRM_GET_PEER_ANY);
 515         const char *uuid = NULL;
 516         gboolean we_are_executioner = pcmk__str_eq(st_event->executioner,
 517                                                    fsa_our_uname,
 518                                                    pcmk__str_casei);
 519 
             /* Without a matching known peer there is nothing to update */
 520         if (peer == NULL) {
 521             return;
 522         }
 523 
 524         uuid = crm_peer_uuid(peer);
 525 
 526         crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc);
 527         if(AM_I_DC) {
 528             /* The DC always sends updates */
 529             send_stonith_update(NULL, st_event->target, uuid);
 530 
 531             /* @TODO Ideally, at this point, we'd check whether the fenced node
 532              * hosted any guest nodes, and call remote_node_down() for them.
 533              * Unfortunately, the controller doesn't have a simple, reliable way
 534              * to map hosts to guests. It might be possible to track this in the
 535              * peer cache via crm_remote_peer_cache_refresh(). For now, we rely
 536              * on the scheduler creating fence pseudo-events for the guests.
 537              */
 538 
 539             if (st_event->client_origin
 540                 && !pcmk__str_eq(st_event->client_origin, te_client_id, pcmk__str_casei)) {
 541 
 542                 /* Abort the current transition graph if it wasn't us
 543                  * that invoked stonith to fence someone
 544                  */
 545                 crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target);
 546                 abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL);
 547             }
 548 
 549             /* Assume it was our leader if we don't currently have one */
 550         } else if (pcmk__str_eq(fsa_our_dc, st_event->target, pcmk__str_null_matches | pcmk__str_casei)
 551                    && !pcmk_is_set(peer->flags, crm_remote_node)) {
 552 
 553             crm_notice("Fencing target %s %s our leader",
 554                        st_event->target, (fsa_our_dc? "was" : "may have been"));
 555 
 556             /* Given the CIB resyncing that occurs around elections,
 557              * have one node update the CIB now and, if the new DC is different,
 558              * have them do so too after the election
 559              */
 560             if (we_are_executioner) {
 561                 send_stonith_update(NULL, st_event->target, uuid);
 562             }
 563             add_stonith_cleanup(st_event->target);
 564         }
 565 
 566         /* If the target is a remote node, and we host its connection,
 567          * immediately fail all monitors so it can be recovered quickly.
 568          * The connection won't necessarily drop when a remote node is fenced,
 569          * so the failure might not otherwise be detected until the next poke.
 570          */
 571         if (pcmk_is_set(peer->flags, crm_remote_node)) {
 572             remote_ra_fail(st_event->target);
 573         }
 574 
 575         crmd_peer_down(peer, TRUE);
 576      }
 577 }
 578 
 579 /*!
 580  * \brief Connect to fencer
 581  *
 582  * \param[in] user_data  If NULL, retry failures now, otherwise retry in main loop
 583  *
 584  * \return TRUE
 585  * \note If user_data is NULL, this will wait 2s between attempts, for up to
 586  *       30 attempts, meaning the controller could be blocked as long as 58s.
 587  */
 588 static gboolean
 589 te_connect_stonith(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 590 {
 591     int rc = pcmk_ok;
 592 
 593     if (stonith_api == NULL) {
 594         stonith_api = stonith_api_new();
 595         if (stonith_api == NULL) {
 596             crm_err("Could not connect to fencer: API memory allocation failed");
 597             return TRUE;
 598         }
 599     }
 600 
 601     if (stonith_api->state != stonith_disconnected) {
 602         crm_trace("Already connected to fencer, no need to retry");
 603         return TRUE;
 604     }
 605 
 606     if (user_data == NULL) {
 607         // Blocking (retry failures now until successful)
 608         rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
 609         if (rc != pcmk_ok) {
 610             crm_err("Could not connect to fencer in 30 attempts: %s "
 611                     CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 612         }
 613     } else {
 614         // Non-blocking (retry failures later in main loop)
 615         rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
 616         if (rc != pcmk_ok) {
 617             if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) {
 618                 crm_notice("Fencer connection failed (will retry): %s "
 619                            CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 620                 mainloop_set_trigger(stonith_reconnect);
 621             } else {
 622                 crm_info("Fencer connection failed (ignoring because no longer required): %s "
 623                          CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 624             }
 625             return TRUE;
 626         }
 627     }
 628 
 629     if (rc == pcmk_ok) {
 630         stonith_api->cmds->register_notification(stonith_api,
 631                                                  T_STONITH_NOTIFY_DISCONNECT,
 632                                                  tengine_stonith_connection_destroy);
 633         stonith_api->cmds->register_notification(stonith_api,
 634                                                  T_STONITH_NOTIFY_FENCE,
 635                                                  tengine_stonith_notify);
 636         stonith_api->cmds->register_notification(stonith_api,
 637                                                  T_STONITH_NOTIFY_HISTORY_SYNCED,
 638                                                  tengine_stonith_history_synced);
 639         te_trigger_stonith_history_sync(TRUE);
 640         crm_notice("Fencer successfully connected");
 641     }
 642 
 643     return TRUE;
 644 }
 645 
 646 /*!
 647     \internal
 648     \brief Schedule fencer connection attempt in main loop
 649 */
 650 void
 651 controld_trigger_fencer_connect()
     /* [previous][next][first][last][top][bottom][index][help] */
 652 {
 653     if (stonith_reconnect == NULL) {
 654         stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW,
 655                                                  te_connect_stonith,
 656                                                  GINT_TO_POINTER(TRUE));
 657     }
 658     controld_set_fsa_input_flags(R_ST_REQUIRED);
 659     mainloop_set_trigger(stonith_reconnect);
 660 }
 661 
 662 void
 663 controld_disconnect_fencer(bool destroy)
     /* [previous][next][first][last][top][bottom][index][help] */
 664 {
 665     if (stonith_api) {
 666         // Prevent fencer connection from coming up again
 667         controld_clear_fsa_input_flags(R_ST_REQUIRED);
 668 
 669         if (stonith_api->state != stonith_disconnected) {
 670             stonith_api->cmds->disconnect(stonith_api);
 671         }
 672         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT);
 673         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE);
 674         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED);
 675     }
 676     if (destroy) {
 677         if (stonith_api) {
 678             stonith_api->cmds->free(stonith_api);
 679             stonith_api = NULL;
 680         }
 681         if (stonith_reconnect) {
 682             mainloop_destroy_trigger(stonith_reconnect);
 683             stonith_reconnect = NULL;
 684         }
 685         if (te_client_id) {
 686             free(te_client_id);
 687             te_client_id = NULL;
 688         }
 689     }
 690 }
 691 
 692 static gboolean
 693 do_stonith_history_sync(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 694 {
 695     if (stonith_api && (stonith_api->state != stonith_disconnected)) {
 696         stonith_history_t *history = NULL;
 697 
 698         te_cleanup_stonith_history_sync(stonith_api, FALSE);
 699         stonith_api->cmds->history(stonith_api,
 700                                    st_opt_sync_call | st_opt_broadcast,
 701                                    NULL, &history, 5);
 702         stonith_history_free(history);
 703         return TRUE;
 704     } else {
 705         crm_info("Skip triggering stonith history-sync as stonith is disconnected");
 706         return FALSE;
 707     }
 708 }
 709 
 710 static void
 711 tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 712 {
 713     char *uuid = NULL;
 714     int stonith_id = -1;
 715     int transition_id = -1;
 716     crm_action_t *action = NULL;
 717     int call_id = data->call_id;
 718     int rc = data->rc;
 719     char *userdata = data->userdata;
 720 
 721     CRM_CHECK(userdata != NULL, return);
 722     crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata,
 723                pcmk_strerror(rc), rc);
 724 
 725     if (AM_I_DC == FALSE) {
 726         return;
 727     }
 728 
 729     /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */
 730     /*       op->call_id, op->optype, op->node_name, op->op_result, */
 731     /*       (char *)op->node_list, op->private_data); */
 732 
 733     /* filter out old STONITH actions */
 734     CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL),
 735               goto bail);
 736 
 737     if (transition_graph->complete || stonith_id < 0 || !pcmk__str_eq(uuid, te_uuid, pcmk__str_casei)
 738         || transition_graph->id != transition_id) {
 739         crm_info("Ignoring STONITH action initiated outside of the current transition");
 740         goto bail;
 741     }
 742 
 743     action = controld_get_action(stonith_id);
 744     if (action == NULL) {
 745         crm_err("Stonith action not matched");
 746         goto bail;
 747     }
 748 
 749     stop_te_timer(action->timer);
 750     if (rc == pcmk_ok) {
 751         const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 752         const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 753         const char *op = crm_meta_value(action->params, "stonith_action");
 754 
 755         crm_info("Stonith operation %d for %s passed", call_id, target);
 756         if (action->confirmed == FALSE) {
 757             te_action_confirmed(action, NULL);
 758             if (pcmk__str_eq("on", op, pcmk__str_casei)) {
 759                 const char *value = NULL;
 760                 char *now = crm_ttoa(time(NULL));
 761 
 762                 update_attrd(target, CRM_ATTR_UNFENCED, now, NULL, FALSE);
 763                 free(now);
 764 
 765                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
 766                 update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL, FALSE);
 767 
 768                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
 769                 update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL, FALSE);
 770 
 771             } else if (action->sent_update == FALSE) {
 772                 send_stonith_update(action, target, uuid);
 773                 action->sent_update = TRUE;
 774             }
 775         }
 776         st_fail_count_reset(target);
 777 
 778     } else {
 779         const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 780         enum transition_action abort_action = tg_restart;
 781 
 782         action->failed = TRUE;
 783         crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
 784                    call_id, target, pcmk_strerror(rc));
 785 
 786         /* If no fence devices were available, there's no use in immediately
 787          * checking again, so don't start a new transition in that case.
 788          */
 789         if (rc == -ENODEV) {
 790             crm_warn("No devices found in cluster to fence %s, giving up",
 791                      target);
 792             abort_action = tg_stop;
 793         }
 794 
 795         /* Increment the fail count now, so abort_for_stonith_failure() can
 796          * check it. Non-DC nodes will increment it in tengine_stonith_notify().
 797          */
 798         st_fail_count_increment(target);
 799         abort_for_stonith_failure(abort_action, target, NULL);
 800     }
 801 
 802     update_graph(transition_graph, action);
 803     trigger_graph();
 804 
 805   bail:
 806     free(userdata);
 807     free(uuid);
 808     return;
 809 }
 810 
 811 static int
 812 fence_with_delay(const char *target, const char *type, const char *delay)
     /* [previous][next][first][last][top][bottom][index][help] */
 813 {
 814     uint32_t options = st_opt_none; // Group of enum stonith_call_options
 815     int timeout_sec = (int) (transition_graph->stonith_timeout / 1000);
 816 
 817     if (crmd_join_phase_count(crm_join_confirmed) == 1) {
 818         stonith__set_call_options(options, target, st_opt_allow_suicide);
 819     }
 820     return stonith_api->cmds->fence_with_delay(stonith_api, options, target,
 821                                                type, timeout_sec, 0,
 822                                                crm_atoi(delay, "0"));
 823 }
 824 
 825 gboolean
 826 te_fence_node(crm_graph_t *graph, crm_action_t *action)
     /* [previous][next][first][last][top][bottom][index][help] */
 827 {
 828     int rc = 0;
 829     const char *id = NULL;
 830     const char *uuid = NULL;
 831     const char *target = NULL;
 832     const char *type = NULL;
 833     char *transition_key = NULL;
 834     const char *priority_delay = NULL;
 835     gboolean invalid_action = FALSE;
 836 
 837     id = ID(action->xml);
 838     target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 839     uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 840     type = crm_meta_value(action->params, "stonith_action");
 841 
 842     CRM_CHECK(id != NULL, invalid_action = TRUE);
 843     CRM_CHECK(uuid != NULL, invalid_action = TRUE);
 844     CRM_CHECK(type != NULL, invalid_action = TRUE);
 845     CRM_CHECK(target != NULL, invalid_action = TRUE);
 846 
 847     if (invalid_action) {
 848         crm_log_xml_warn(action->xml, "BadAction");
 849         return FALSE;
 850     }
 851 
 852     priority_delay = crm_meta_value(action->params, XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);
 853 
 854     crm_notice("Requesting fencing (%s) of node %s "
 855                CRM_XS " action=%s timeout=%u%s%s",
 856                type, target, id, transition_graph->stonith_timeout,
 857                priority_delay ? " priority_delay=" : "",
 858                priority_delay ? priority_delay : "");
 859 
 860     /* Passing NULL means block until we can connect... */
 861     te_connect_stonith(NULL);
 862 
 863     rc = fence_with_delay(target, type, priority_delay);
 864     transition_key = pcmk__transition_key(transition_graph->id, action->id, 0,
 865                                           te_uuid),
 866     stonith_api->cmds->register_callback(stonith_api, rc,
 867                                          (int) (transition_graph->stonith_timeout / 1000),
 868                                          st_opt_timeout_updates, transition_key,
 869                                          "tengine_stonith_callback", tengine_stonith_callback);
 870 
 871     return TRUE;
 872 }
 873 
 874 /* end stonith API client functions */
 875 
 876 
 877 /*
 878  * stonith history synchronization
 879  *
 880  * Each node's fencer keeps track of a cluster-wide fencing history. When a node
 881  * joins or leaves, we need to synchronize the history across all nodes.
 882  */
 883 
 884 static crm_trigger_t *stonith_history_sync_trigger = NULL;
 885 static mainloop_timer_t *stonith_history_sync_timer_short = NULL;
 886 static mainloop_timer_t *stonith_history_sync_timer_long = NULL;
 887 
 888 void
 889 te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
     /* [previous][next][first][last][top][bottom][index][help] */
 890 {
 891     if (free_timers) {
 892         mainloop_timer_del(stonith_history_sync_timer_short);
 893         stonith_history_sync_timer_short = NULL;
 894         mainloop_timer_del(stonith_history_sync_timer_long);
 895         stonith_history_sync_timer_long = NULL;
 896     } else {
 897         mainloop_timer_stop(stonith_history_sync_timer_short);
 898         mainloop_timer_stop(stonith_history_sync_timer_long);
 899     }
 900 
 901     if (st) {
 902         st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED);
 903     }
 904 }
 905 
 906 static void
 907 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event)
     /* [previous][next][first][last][top][bottom][index][help] */
 908 {
 909     te_cleanup_stonith_history_sync(st, FALSE);
 910     crm_debug("Fence-history synced - cancel all timers");
 911 }
 912 
 913 static gboolean
 914 stonith_history_sync_set_trigger(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 915 {
 916     mainloop_set_trigger(stonith_history_sync_trigger);
 917     return FALSE;
 918 }
 919 
 920 void
 921 te_trigger_stonith_history_sync(bool long_timeout)
     /* [previous][next][first][last][top][bottom][index][help] */
 922 {
 923     /* trigger a sync in 5s to give more nodes the
 924      * chance to show up so that we don't create
 925      * unnecessary stonith-history-sync traffic
 926      *
 927      * the long timeout of 30s is there as a fallback
 928      * so that after a successful connection to fenced
 929      * we will wait for 30s for the DC to trigger a
 930      * history-sync
 931      * if this doesn't happen we trigger a sync locally
 932      * (e.g. fenced segfaults and is restarted by pacemakerd)
 933      */
 934 
 935     /* as we are finally checking the stonith-connection
 936      * in do_stonith_history_sync we should be fine
 937      * leaving stonith_history_sync_time & stonith_history_sync_trigger
 938      * around
 939      */
 940     if (stonith_history_sync_trigger == NULL) {
 941         stonith_history_sync_trigger =
 942             mainloop_add_trigger(G_PRIORITY_LOW,
 943                                  do_stonith_history_sync, NULL);
 944     }
 945 
 946     if (long_timeout) {
 947         if(stonith_history_sync_timer_long == NULL) {
 948             stonith_history_sync_timer_long =
 949                 mainloop_timer_add("history_sync_long", 30000,
 950                                    FALSE, stonith_history_sync_set_trigger,
 951                                    NULL);
 952         }
 953         crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
 954         mainloop_timer_start(stonith_history_sync_timer_long);
 955     } else {
 956         if(stonith_history_sync_timer_short == NULL) {
 957             stonith_history_sync_timer_short =
 958                 mainloop_timer_add("history_sync_short", 5000,
 959                                    FALSE, stonith_history_sync_set_trigger,
 960                                    NULL);
 961         }
 962         crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
 963         mainloop_timer_start(stonith_history_sync_timer_short);
 964     }
 965 
 966 }
 967 
 968 /* end stonith history synchronization functions */

/* [previous][next][first][last][top][bottom][index][help] */