root/daemons/controld/controld_fencing.c


DEFINITIONS

This source file includes the following definitions.
  1. update_stonith_max_attempts
  2. set_fence_reaction
  3. too_many_st_failures
  4. st_fail_count_reset
  5. st_fail_count_increment
  6. cib_fencing_updated
  7. send_stonith_update
  8. abort_for_stonith_failure
  9. add_stonith_cleanup
  10. remove_stonith_cleanup
  11. purge_stonith_cleanup
  12. execute_stonith_cleanup
  13. fail_incompletable_stonith
  14. tengine_stonith_connection_destroy
  15. tengine_stonith_notify
  16. te_connect_stonith
  17. controld_trigger_fencer_connect
  18. controld_disconnect_fencer
  19. do_stonith_history_sync
  20. tengine_stonith_callback
  21. fence_with_delay
  22. te_fence_node
  23. controld_verify_stonith_watchdog_timeout
  24. te_cleanup_stonith_history_sync
  25. tengine_stonith_history_synced
  26. stonith_history_sync_set_trigger
  27. te_trigger_stonith_history_sync

   1 /*
   2  * Copyright 2004-2021 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 #include <crm/crm.h>
  12 #include <crm/msg_xml.h>
  13 #include <crm/common/xml.h>
  14 #include <crm/stonith-ng.h>
  15 #include <crm/fencing/internal.h>
  16 
  17 #include <pacemaker-controld.h>
  18 
  19 static void
  20 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event);
  21 
  22 /*
  23  * stonith failure counting
  24  *
   25  * We don't want to get stuck in a permanent fencing loop. Keep track of the
   26  * number of fencing failures for each target node, and the maximum number of
   27  * times we'll restart a transition because of fencing failures.
  28  */
  29 
  30 struct st_fail_rec {
  31     int count;
  32 };
  33 
  34 static bool fence_reaction_panic = FALSE;
  35 static unsigned long int stonith_max_attempts = 10;
  36 static GHashTable *stonith_failures = NULL;
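      /* stonith_failures maps a node name (char *) to its struct st_fail_rec;
       * an entry is created on the first fencing failure for that node.
       */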
  37 
  38 void
  39 update_stonith_max_attempts(const char *value)
  40 {
  41     stonith_max_attempts = char2score(value);
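          /* Note: char2score() returns an int, so a negative configured value
           * wraps to a very large number when assigned to the unsigned
           * stonith_max_attempts, and the check below will not catch it.
           */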
  42     if (stonith_max_attempts < 1UL) {
  43         stonith_max_attempts = 10UL;
  44     }
  45 }
  46 
  47 void
  48 set_fence_reaction(const char *reaction_s)
  49 {
  50     if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
  51         fence_reaction_panic = TRUE;
  52 
  53     } else {
  54         if (!pcmk__str_eq(reaction_s, "stop", pcmk__str_casei)) {
  55             crm_warn("Invalid value '%s' for %s, using 'stop'",
  56                      reaction_s, XML_CONFIG_ATTR_FENCE_REACTION);
  57         }
  58         fence_reaction_panic = FALSE;
  59     }
  60 }
  61 
  62 static gboolean
  63 too_many_st_failures(const char *target)
  64 {
  65     GHashTableIter iter;
  66     const char *key = NULL;
  67     struct st_fail_rec *value = NULL;
  68 
  69     if (stonith_failures == NULL) {
  70         return FALSE;
  71     }
  72 
  73     if (target == NULL) {
  74         g_hash_table_iter_init(&iter, stonith_failures);
  75         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
  76                (gpointer *) &value)) {
  77 
  78             if (value->count >= stonith_max_attempts) {
  79                 target = (const char*)key;
  80                 goto too_many;
  81             }
  82         }
  83     } else {
  84         value = g_hash_table_lookup(stonith_failures, target);
  85         if ((value != NULL) && (value->count >= stonith_max_attempts)) {
  86             goto too_many;
  87         }
  88     }
  89     return FALSE;
  90 
  91 too_many:
  92     crm_warn("Too many failures (%d) to fence %s, giving up",
  93              value->count, target);
  94     return TRUE;
  95 }
  96 
  97 /*!
  98  * \internal
  99  * \brief Reset a stonith fail count
 100  *
 101  * \param[in] target  Name of node to reset, or NULL for all
 102  */
 103 void
 104 st_fail_count_reset(const char *target)
 105 {
 106     if (stonith_failures == NULL) {
 107         return;
 108     }
 109 
 110     if (target) {
 111         struct st_fail_rec *rec = NULL;
 112 
 113         rec = g_hash_table_lookup(stonith_failures, target);
 114         if (rec) {
 115             rec->count = 0;
 116         }
 117     } else {
 118         GHashTableIter iter;
 119         const char *key = NULL;
 120         struct st_fail_rec *rec = NULL;
 121 
 122         g_hash_table_iter_init(&iter, stonith_failures);
 123         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 124                                       (gpointer *) &rec)) {
 125             rec->count = 0;
 126         }
 127     }
 128 }
 129 
 130 static void
 131 st_fail_count_increment(const char *target)
 132 {
 133     struct st_fail_rec *rec = NULL;
 134 
 135     if (stonith_failures == NULL) {
 136         stonith_failures = pcmk__strkey_table(free, free);
 137     }
 138 
 139     rec = g_hash_table_lookup(stonith_failures, target);
 140     if (rec) {
 141         rec->count++;
 142     } else {
 143         rec = malloc(sizeof(struct st_fail_rec));
 144         if(rec == NULL) {
 145             return;
 146         }
 147 
 148         rec->count = 1;
 149         g_hash_table_insert(stonith_failures, strdup(target), rec);
 150     }
 151 }
 152 
 153 /* end stonith fail count functions */
 154 
 155 
 156 static void
 157 cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
 158                     void *user_data)
 159 {
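          /* user_data is the target node name duplicated by send_stonith_update()
           * when it registered this callback.
           */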
 160     if (rc < pcmk_ok) {
 161         crm_err("Fencing update %d for %s: failed - %s (%d)",
 162                 call_id, (char *)user_data, pcmk_strerror(rc), rc);
 163         crm_log_xml_warn(msg, "Failed update");
 164         abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL);
 165 
 166     } else {
 167         crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
 168     }
 169 }
 170 
 171 static void
 172 send_stonith_update(crm_action_t *action, const char *target, const char *uuid)
 173 {
 174     int rc = pcmk_ok;
 175     crm_node_t *peer = NULL;
 176 
 177     /* We (usually) rely on the membership layer to do node_update_cluster,
 178      * and the peer status callback to do node_update_peer, because the node
 179      * might have already rejoined before we get the stonith result here.
 180      */
 181     int flags = node_update_join | node_update_expected;
 182 
 183     /* zero out the node-status & remove all LRM status info */
 184     xmlNode *node_state = NULL;
 185 
 186     CRM_CHECK(target != NULL, return);
 187     CRM_CHECK(uuid != NULL, return);
 188 
 189     /* Make sure the membership and join caches are accurate */
 190     peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY);
 191 
 192     CRM_CHECK(peer != NULL, return);
 193 
 194     if (peer->state == NULL) {
 195         /* Usually, we rely on the membership layer to update the cluster state
 196          * in the CIB. However, if the node has never been seen, do it here, so
 197          * the node is not considered unclean.
 198          */
 199         flags |= node_update_cluster;
 200     }
 201 
 202     if (peer->uuid == NULL) {
 203         crm_info("Recording uuid '%s' for node '%s'", uuid, target);
 204         peer->uuid = strdup(uuid);
 205     }
 206 
 207     crmd_peer_down(peer, TRUE);
 208 
 209     /* Generate a node state update for the CIB */
 210     node_state = create_node_state_update(peer, flags, NULL, __func__);
 211 
 212     /* we have to mark whether or not remote nodes have already been fenced */
 213     if (peer->flags & crm_remote_node) {
 214         char *now_s = pcmk__ttoa(time(NULL));
 215 
 216         crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s);
 217         free(now_s);
 218     }
 219 
 220     /* Force our known ID */
 221     crm_xml_add(node_state, XML_ATTR_UUID, uuid);
 222 
 223     rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
 224                                     cib_quorum_override | cib_scope_local | cib_can_create);
 225 
 226     /* Delay processing the trigger until the update completes */
 227     crm_debug("Sending fencing update %d for %s", rc, target);
 228     fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated);
 229 
 230     /* Make sure it sticks */
 231     /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */
 232 
 233     controld_delete_node_state(peer->uname, controld_section_all,
 234                                cib_scope_local);
 235     free_xml(node_state);
 236     return;
 237 }
 238 
 239 /*!
 240  * \internal
 241  * \brief Abort transition due to stonith failure
 242  *
 243  * \param[in] abort_action  Whether to restart or stop transition
 244  * \param[in] target  Don't restart if this (NULL for any) has too many failures
 245  * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
 246  */
 247 static void
 248 abort_for_stonith_failure(enum transition_action abort_action,
 249                           const char *target, xmlNode *reason)
 250 {
 251     /* If stonith repeatedly fails, we eventually give up on starting a new
 252      * transition for that reason.
 253      */
 254     if ((abort_action != tg_stop) && too_many_st_failures(target)) {
 255         abort_action = tg_stop;
 256     }
 257     abort_transition(INFINITY, abort_action, "Stonith failed", reason);
 258 }
 259 
 260 
 261 /*
 262  * stonith cleanup list
 263  *
 264  * If the DC is shot, proper notifications might not go out.
 265  * The stonith cleanup list allows the cluster to (re-)send
 266  * notifications once a new DC is elected.
 267  */
 268 
 269 static GList *stonith_cleanup_list = NULL;
 270 
 271 /*!
 272  * \internal
 273  * \brief Add a node to the stonith cleanup list
 274  *
 275  * \param[in] target  Name of node to add
 276  */
 277 void
 278 add_stonith_cleanup(const char *target) {
 279     stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target));
 280 }
 281 
 282 /*!
 283  * \internal
 284  * \brief Remove a node from the stonith cleanup list
 285  *
  286  * \param[in] target  Name of node to remove
 287  */
 288 void
 289 remove_stonith_cleanup(const char *target)
 290 {
 291     GList *iter = stonith_cleanup_list;
 292 
 293     while (iter != NULL) {
 294         GList *tmp = iter;
 295         char *iter_name = tmp->data;
 296 
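              /* Advance the iterator before potentially deleting the current
               * link, so removal is safe while traversing the list.
               */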
 297         iter = iter->next;
 298         if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
 299             crm_trace("Removing %s from the cleanup list", iter_name);
 300             stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
 301             free(iter_name);
 302         }
 303     }
 304 }
 305 
 306 /*!
 307  * \internal
 308  * \brief Purge all entries from the stonith cleanup list
 309  */
 310 void
  311 purge_stonith_cleanup(void)
 312 {
 313     if (stonith_cleanup_list) {
 314         GList *iter = NULL;
 315 
 316         for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 317             char *target = iter->data;
 318 
 319             crm_info("Purging %s from stonith cleanup list", target);
 320             free(target);
 321         }
 322         g_list_free(stonith_cleanup_list);
 323         stonith_cleanup_list = NULL;
 324     }
 325 }
 326 
 327 /*!
 328  * \internal
 329  * \brief Send stonith updates for all entries in cleanup list, then purge it
 330  */
 331 void
  332 execute_stonith_cleanup(void)
 333 {
 334     GList *iter;
 335 
 336     for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 337         char *target = iter->data;
 338         crm_node_t *target_node = crm_get_peer(0, target);
 339         const char *uuid = crm_peer_uuid(target_node);
 340 
 341         crm_notice("Marking %s, target of a previous stonith action, as clean", target);
 342         send_stonith_update(NULL, target, uuid);
 343         free(target);
 344     }
 345     g_list_free(stonith_cleanup_list);
 346     stonith_cleanup_list = NULL;
 347 }
 348 
 349 /* end stonith cleanup list functions */
 350 
 351 
 352 /* stonith API client
 353  *
 354  * Functions that need to interact directly with the fencer via its API
 355  */
 356 
 357 static stonith_t *stonith_api = NULL;
 358 static crm_trigger_t *stonith_reconnect = NULL;
 359 static char *te_client_id = NULL;
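      /* te_client_id is this controller's fencer client name; it is compared
       * against st_event->client_origin in tengine_stonith_notify() to tell
       * our own fencing requests apart from externally initiated ones.
       */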
 360 
 361 static gboolean
 362 fail_incompletable_stonith(crm_graph_t *graph)
 363 {
 364     GList *lpc = NULL;
 365     const char *task = NULL;
 366     xmlNode *last_action = NULL;
 367 
 368     if (graph == NULL) {
 369         return FALSE;
 370     }
 371 
 372     for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
 373         GList *lpc2 = NULL;
 374         synapse_t *synapse = (synapse_t *) lpc->data;
 375 
 376         if (pcmk_is_set(synapse->flags, pcmk__synapse_confirmed)) {
 377             continue;
 378         }
 379 
 380         for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
 381             crm_action_t *action = (crm_action_t *) lpc2->data;
 382 
 383             if (action->type != action_type_crm || pcmk_is_set(action->flags, pcmk__graph_action_confirmed)) {
 384                 continue;
 385             }
 386 
 387             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
 388             if (task && pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
 389                 crm__set_graph_action_flags(action, pcmk__graph_action_failed);
 390                 last_action = action->xml;
 391                 pcmk__update_graph(graph, action);
 392                 crm_notice("Failing action %d (%s): fencer terminated",
 393                            action->id, ID(action->xml));
 394             }
 395         }
 396     }
 397 
 398     if (last_action != NULL) {
 399         crm_warn("Fencer failure resulted in unrunnable actions");
 400         abort_for_stonith_failure(tg_restart, NULL, last_action);
 401         return TRUE;
 402     }
 403 
 404     return FALSE;
 405 }
 406 
 407 static void
 408 tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
 409 {
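          /* Stop any pending history-sync timers, but keep them allocated so a
           * later reconnection can reuse them.
           */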
 410     te_cleanup_stonith_history_sync(st, FALSE);
 411 
 412     if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) {
 413         crm_crit("Fencing daemon connection failed");
 414         mainloop_set_trigger(stonith_reconnect);
 415 
 416     } else {
 417         crm_info("Fencing daemon disconnected");
 418     }
 419 
 420     if (stonith_api) {
 421         /* the client API won't properly reconnect notifications
 422          * if they are still in the table - so remove them
 423          */
 424         if (stonith_api->state != stonith_disconnected) {
 425             stonith_api->cmds->disconnect(st);
 426         }
 427         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT);
 428         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE);
 429         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED);
 430     }
 431 
 432     if (AM_I_DC) {
 433         fail_incompletable_stonith(transition_graph);
 434         trigger_graph();
 435     }
 436 }
 437 
 438 static void
 439 tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event)
 440 {
 441     if (te_client_id == NULL) {
 442         te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
 443                                          (unsigned long) getpid());
 444     }
 445 
 446     if (st_event == NULL) {
 447         crm_err("Notify data not found");
 448         return;
 449     }
 450 
 451     crmd_alert_fencing_op(st_event);
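          /* Forward the fencing event to any configured alert agents */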
 452 
 453     if ((st_event->result == pcmk_ok) && pcmk__str_eq("on", st_event->action, pcmk__str_casei)) {
 454         crm_notice("%s was successfully unfenced by %s (at the request of %s)",
 455                    st_event->target,
 456                    st_event->executioner? st_event->executioner : "<anyone>",
 457                    st_event->origin);
 458                 /* TODO: Hook up st_event->device */
 459         return;
 460 
 461     } else if (pcmk__str_eq("on", st_event->action, pcmk__str_casei)) {
 462         crm_err("Unfencing of %s by %s failed: %s (%d)",
 463                 st_event->target,
 464                 st_event->executioner? st_event->executioner : "<anyone>",
 465                 pcmk_strerror(st_event->result), st_event->result);
 466         return;
 467 
 468     } else if ((st_event->result == pcmk_ok)
 469                && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_none)) {
 470 
 471         /* We were notified of our own fencing. Most likely, either fencing was
 472          * misconfigured, or fabric fencing that doesn't cut cluster
 473          * communication is in use.
 474          *
 475          * Either way, shutting down the local host is a good idea, to require
 476          * administrator intervention. Also, other nodes would otherwise likely
 477          * set our status to lost because of the fencing callback and discard
 478          * our subsequent election votes as "not part of our cluster".
 479          */
 480         crm_crit("We were allegedly just fenced by %s for %s!",
 481                  st_event->executioner? st_event->executioner : "the cluster",
 482                  st_event->origin); /* Dumps blackbox if enabled */
 483         if (fence_reaction_panic) {
 484             pcmk__panic(__func__);
 485         } else {
 486             crm_exit(CRM_EX_FATAL);
 487         }
 488         return;
 489     }
 490 
 491     /* Update the count of stonith failures for this target, in case we become
 492      * DC later. The current DC has already updated its fail count in
 493      * tengine_stonith_callback().
 494      */
 495     if (!AM_I_DC && pcmk__str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) {
 496         if (st_event->result == pcmk_ok) {
 497             st_fail_count_reset(st_event->target);
 498         } else {
 499             st_fail_count_increment(st_event->target);
 500         }
 501     }
 502 
 503     crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s "
 504                CRM_XS " initiator=%s ref=%s",
 505                st_event->target, st_event->result == pcmk_ok ? "" : " not",
 506                st_event->action,
 507                st_event->executioner ? st_event->executioner : "<anyone>",
 508                (st_event->client_origin? st_event->client_origin : "<unknown>"),
 509                pcmk_strerror(st_event->result),
 510                st_event->origin, st_event->id);
 511 
 512     if (st_event->result == pcmk_ok) {
 513         crm_node_t *peer = pcmk__search_known_node_cache(0, st_event->target,
 514                                                          CRM_GET_PEER_ANY);
 515         const char *uuid = NULL;
 516         gboolean we_are_executioner = pcmk__str_eq(st_event->executioner,
 517                                                    fsa_our_uname,
 518                                                    pcmk__str_casei);
 519 
 520         if (peer == NULL) {
 521             return;
 522         }
 523 
 524         uuid = crm_peer_uuid(peer);
 525 
 526         crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc);
 527         if(AM_I_DC) {
 528             /* The DC always sends updates */
 529             send_stonith_update(NULL, st_event->target, uuid);
 530 
 531             /* @TODO Ideally, at this point, we'd check whether the fenced node
 532              * hosted any guest nodes, and call remote_node_down() for them.
 533              * Unfortunately, the controller doesn't have a simple, reliable way
 534              * to map hosts to guests. It might be possible to track this in the
 535              * peer cache via crm_remote_peer_cache_refresh(). For now, we rely
 536              * on the scheduler creating fence pseudo-events for the guests.
 537              */
 538 
 539             if (st_event->client_origin
 540                 && !pcmk__str_eq(st_event->client_origin, te_client_id, pcmk__str_casei)) {
 541 
 542                 /* Abort the current transition graph if it wasn't us
 543                  * that invoked stonith to fence someone
 544                  */
 545                 crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target);
 546                 abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL);
 547             }
 548 
 549             /* Assume it was our leader if we don't currently have one */
 550         } else if (pcmk__str_eq(fsa_our_dc, st_event->target, pcmk__str_null_matches | pcmk__str_casei)
 551                    && !pcmk_is_set(peer->flags, crm_remote_node)) {
 552 
 553             crm_notice("Fencing target %s %s our leader",
 554                        st_event->target, (fsa_our_dc? "was" : "may have been"));
 555 
 556             /* Given the CIB resyncing that occurs around elections,
 557              * have one node update the CIB now and, if the new DC is different,
 558              * have them do so too after the election
 559              */
 560             if (we_are_executioner) {
 561                 send_stonith_update(NULL, st_event->target, uuid);
 562             }
 563             add_stonith_cleanup(st_event->target);
 564         }
 565 
 566         /* If the target is a remote node, and we host its connection,
 567          * immediately fail all monitors so it can be recovered quickly.
 568          * The connection won't necessarily drop when a remote node is fenced,
 569          * so the failure might not otherwise be detected until the next poke.
 570          */
 571         if (pcmk_is_set(peer->flags, crm_remote_node)) {
 572             remote_ra_fail(st_event->target);
 573         }
 574 
 575         crmd_peer_down(peer, TRUE);
  576     }
 577 }
 578 
 579 /*!
 580  * \brief Connect to fencer
 581  *
 582  * \param[in] user_data  If NULL, retry failures now, otherwise retry in main loop
 583  *
 584  * \return TRUE
 585  * \note If user_data is NULL, this will wait 2s between attempts, for up to
 586  *       30 attempts, meaning the controller could be blocked as long as 58s.
 587  */
 588 static gboolean
 589 te_connect_stonith(gpointer user_data)
 590 {
 591     int rc = pcmk_ok;
 592 
 593     if (stonith_api == NULL) {
 594         stonith_api = stonith_api_new();
 595         if (stonith_api == NULL) {
 596             crm_err("Could not connect to fencer: API memory allocation failed");
 597             return TRUE;
 598         }
 599     }
 600 
 601     if (stonith_api->state != stonith_disconnected) {
 602         crm_trace("Already connected to fencer, no need to retry");
 603         return TRUE;
 604     }
 605 
 606     if (user_data == NULL) {
 607         // Blocking (retry failures now until successful)
 608         rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
 609         if (rc != pcmk_ok) {
 610             crm_err("Could not connect to fencer in 30 attempts: %s "
 611                     CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 612         }
 613     } else {
 614         // Non-blocking (retry failures later in main loop)
 615         rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
 616         if (rc != pcmk_ok) {
 617             if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) {
 618                 crm_notice("Fencer connection failed (will retry): %s "
 619                            CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 620                 mainloop_set_trigger(stonith_reconnect);
 621             } else {
 622                 crm_info("Fencer connection failed (ignoring because no longer required): %s "
 623                          CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 624             }
 625             return TRUE;
 626         }
 627     }
 628 
 629     if (rc == pcmk_ok) {
 630         stonith_api->cmds->register_notification(stonith_api,
 631                                                  T_STONITH_NOTIFY_DISCONNECT,
 632                                                  tengine_stonith_connection_destroy);
 633         stonith_api->cmds->register_notification(stonith_api,
 634                                                  T_STONITH_NOTIFY_FENCE,
 635                                                  tengine_stonith_notify);
 636         stonith_api->cmds->register_notification(stonith_api,
 637                                                  T_STONITH_NOTIFY_HISTORY_SYNCED,
 638                                                  tengine_stonith_history_synced);
 639         te_trigger_stonith_history_sync(TRUE);
 640         crm_notice("Fencer successfully connected");
 641     }
 642 
 643     return TRUE;
 644 }
 645 
  646 /*!
  647  * \internal
  648  * \brief Schedule fencer connection attempt in main loop
  649  */
 650 void
  651 controld_trigger_fencer_connect(void)
 652 {
 653     if (stonith_reconnect == NULL) {
 654         stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW,
 655                                                  te_connect_stonith,
 656                                                  GINT_TO_POINTER(TRUE));
 657     }
 658     controld_set_fsa_input_flags(R_ST_REQUIRED);
 659     mainloop_set_trigger(stonith_reconnect);
 660 }
 661 
 662 void
 663 controld_disconnect_fencer(bool destroy)
 664 {
 665     if (stonith_api) {
 666         // Prevent fencer connection from coming up again
 667         controld_clear_fsa_input_flags(R_ST_REQUIRED);
 668 
 669         if (stonith_api->state != stonith_disconnected) {
 670             stonith_api->cmds->disconnect(stonith_api);
 671         }
 672         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT);
 673         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE);
 674         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED);
 675     }
 676     if (destroy) {
 677         if (stonith_api) {
 678             stonith_api->cmds->free(stonith_api);
 679             stonith_api = NULL;
 680         }
 681         if (stonith_reconnect) {
 682             mainloop_destroy_trigger(stonith_reconnect);
 683             stonith_reconnect = NULL;
 684         }
 685         if (te_client_id) {
 686             free(te_client_id);
 687             te_client_id = NULL;
 688         }
 689     }
 690 }
 691 
 692 static gboolean
 693 do_stonith_history_sync(gpointer user_data)
 694 {
 695     if (stonith_api && (stonith_api->state != stonith_disconnected)) {
 696         stonith_history_t *history = NULL;
 697 
 698         te_cleanup_stonith_history_sync(stonith_api, FALSE);
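              /* Request a synchronous, cluster-wide broadcast of the fencing
               * history from the local fencer (5-second timeout); the returned
               * copy is not needed here, only the broadcast side effect.
               */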
 699         stonith_api->cmds->history(stonith_api,
 700                                    st_opt_sync_call | st_opt_broadcast,
 701                                    NULL, &history, 5);
 702         stonith_history_free(history);
 703         return TRUE;
 704     } else {
 705         crm_info("Skip triggering stonith history-sync as stonith is disconnected");
 706         return FALSE;
 707     }
 708 }
 709 
 710 static void
 711 tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
 712 {
 713     char *uuid = NULL;
 714     int stonith_id = -1;
 715     int transition_id = -1;
 716     crm_action_t *action = NULL;
 717     int call_id = data->call_id;
 718     int rc = data->rc;
 719     char *userdata = data->userdata;
 720 
 721     CRM_CHECK(userdata != NULL, return);
 722     crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata,
 723                pcmk_strerror(rc), rc);
 724 
 725     if (AM_I_DC == FALSE) {
 726         return;
 727     }
 728 
 729     /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */
 730     /*       op->call_id, op->optype, op->node_name, op->op_result, */
 731     /*       (char *)op->node_list, op->private_data); */
 732 
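          /* userdata is the transition key that te_fence_node() passed to
           * register_callback(); it encodes the transition and action IDs.
           */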
 733     /* filter out old STONITH actions */
 734     CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL),
 735               goto bail);
 736 
 737     if (transition_graph->complete || stonith_id < 0 || !pcmk__str_eq(uuid, te_uuid, pcmk__str_casei)
 738         || transition_graph->id != transition_id) {
 739         crm_info("Ignoring STONITH action initiated outside of the current transition");
 740         goto bail;
 741     }
 742 
 743     action = controld_get_action(stonith_id);
 744     if (action == NULL) {
 745         crm_err("Stonith action not matched");
 746         goto bail;
 747     }
 748 
 749     stop_te_timer(action->timer);
 750     if (rc == pcmk_ok) {
 751         const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 752         const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 753         const char *op = crm_meta_value(action->params, "stonith_action");
 754 
 755         crm_info("Stonith operation %d for %s passed", call_id, target);
 756         if (!(pcmk_is_set(action->flags, pcmk__graph_action_confirmed))) {
 757             te_action_confirmed(action, NULL);
 758             if (pcmk__str_eq("on", op, pcmk__str_casei)) {
 759                 const char *value = NULL;
 760                 char *now = pcmk__ttoa(time(NULL));
 761                 gboolean is_remote_node = FALSE;
 762 
 763                 /* This check is not 100% reliable, since this node is not
 764                  * guaranteed to have the remote node cached. However, it
 765                  * doesn't have to be reliable, since the attribute manager can
 766                  * learn a node's "remoteness" by other means sooner or later.
 767                  * This allows it to learn more quickly if this node does have
 768                  * the information.
 769                  */
 770                 if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) {
 771                     is_remote_node = TRUE;
 772                 }
 773 
 774                 update_attrd(target, CRM_ATTR_UNFENCED, now, NULL,
 775                              is_remote_node);
 776                 free(now);
 777 
 778                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
 779                 update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL,
 780                              is_remote_node);
 781 
 782                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
 783                 update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL,
 784                              is_remote_node);
 785 
 786             } else if (!(pcmk_is_set(action->flags, pcmk__graph_action_sent_update))) {
 787                 send_stonith_update(action, target, uuid);
 788                 crm__set_graph_action_flags(action, pcmk__graph_action_sent_update);
 789             }
 790         }
 791         st_fail_count_reset(target);
 792 
 793     } else {
 794         const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 795         enum transition_action abort_action = tg_restart;
 796 
 797         crm__set_graph_action_flags(action, pcmk__graph_action_failed);
 798         crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
 799                    call_id, target, pcmk_strerror(rc));
 800 
 801         /* If no fence devices were available, there's no use in immediately
 802          * checking again, so don't start a new transition in that case.
 803          */
 804         if (rc == -ENODEV) {
 805             crm_warn("No devices found in cluster to fence %s, giving up",
 806                      target);
 807             abort_action = tg_stop;
 808         }
 809 
 810         /* Increment the fail count now, so abort_for_stonith_failure() can
 811          * check it. Non-DC nodes will increment it in tengine_stonith_notify().
 812          */
 813         st_fail_count_increment(target);
 814         abort_for_stonith_failure(abort_action, target, NULL);
 815     }
 816 
 817     pcmk__update_graph(transition_graph, action);
 818     trigger_graph();
 819 
 820   bail:
 821     free(userdata);
 822     free(uuid);
 823     return;
 824 }
 825 
 826 static int
 827 fence_with_delay(const char *target, const char *type, const char *delay)
 828 {
 829     uint32_t options = st_opt_none; // Group of enum stonith_call_options
 830     int timeout_sec = (int) (transition_graph->stonith_timeout / 1000);
 831     int delay_i;
 832 
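          /* If we are the only node that has completed the join, the target may
           * be ourselves, so allow self-fencing.
           */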
 833     if (crmd_join_phase_count(crm_join_confirmed) == 1) {
 834         stonith__set_call_options(options, target, st_opt_allow_suicide);
 835     }
 836     pcmk__scan_min_int(delay, &delay_i, 0);
 837     return stonith_api->cmds->fence_with_delay(stonith_api, options, target,
 838                                                type, timeout_sec, 0, delay_i);
 839 }
 840 
 841 gboolean
 842 te_fence_node(crm_graph_t *graph, crm_action_t *action)
 843 {
 844     int rc = 0;
 845     const char *id = NULL;
 846     const char *uuid = NULL;
 847     const char *target = NULL;
 848     const char *type = NULL;
 849     char *transition_key = NULL;
 850     const char *priority_delay = NULL;
 851     gboolean invalid_action = FALSE;
 852 
 853     id = ID(action->xml);
 854     target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 855     uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 856     type = crm_meta_value(action->params, "stonith_action");
 857 
 858     CRM_CHECK(id != NULL, invalid_action = TRUE);
 859     CRM_CHECK(uuid != NULL, invalid_action = TRUE);
 860     CRM_CHECK(type != NULL, invalid_action = TRUE);
 861     CRM_CHECK(target != NULL, invalid_action = TRUE);
 862 
 863     if (invalid_action) {
 864         crm_log_xml_warn(action->xml, "BadAction");
 865         return FALSE;
 866     }
 867 
 868     priority_delay = crm_meta_value(action->params, XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);
 869 
 870     crm_notice("Requesting fencing (%s) of node %s "
 871                CRM_XS " action=%s timeout=%u%s%s",
 872                type, target, id, transition_graph->stonith_timeout,
 873                priority_delay ? " priority_delay=" : "",
 874                priority_delay ? priority_delay : "");
 875 
 876     /* Passing NULL means block until we can connect... */
 877     te_connect_stonith(NULL);
 878 
 879     rc = fence_with_delay(target, type, priority_delay);
 880     transition_key = pcmk__transition_key(transition_graph->id, action->id, 0,
  881                                           te_uuid);
 882     stonith_api->cmds->register_callback(stonith_api, rc,
 883                                          (int) (transition_graph->stonith_timeout / 1000),
 884                                          st_opt_timeout_updates, transition_key,
 885                                          "tengine_stonith_callback", tengine_stonith_callback);
 886 
 887     return TRUE;
 888 }
 889 
 890 bool
 891 controld_verify_stonith_watchdog_timeout(const char *value)
 892 {
 893     gboolean rv = TRUE;
 894 
 895     if (stonith_api && (stonith_api->state != stonith_disconnected) &&
 896         stonith__watchdog_fencing_enabled_for_node_api(stonith_api,
 897                                                        fsa_our_uname)) {
 898         rv = pcmk__valid_sbd_timeout(value);
 899     }
 900     return rv;
 901 }
 902 
 903 /* end stonith API client functions */
 904 
 905 
 906 /*
 907  * stonith history synchronization
 908  *
 909  * Each node's fencer keeps track of a cluster-wide fencing history. When a node
 910  * joins or leaves, we need to synchronize the history across all nodes.
 911  */
 912 
 913 static crm_trigger_t *stonith_history_sync_trigger = NULL;
 914 static mainloop_timer_t *stonith_history_sync_timer_short = NULL;
 915 static mainloop_timer_t *stonith_history_sync_timer_long = NULL;
 916 
 917 void
 918 te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
 919 {
 920     if (free_timers) {
 921         mainloop_timer_del(stonith_history_sync_timer_short);
 922         stonith_history_sync_timer_short = NULL;
 923         mainloop_timer_del(stonith_history_sync_timer_long);
 924         stonith_history_sync_timer_long = NULL;
 925     } else {
 926         mainloop_timer_stop(stonith_history_sync_timer_short);
 927         mainloop_timer_stop(stonith_history_sync_timer_long);
 928     }
 929 
 930     if (st) {
 931         st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED);
 932     }
 933 }
 934 
 935 static void
 936 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event)
 937 {
 938     te_cleanup_stonith_history_sync(st, FALSE);
 939     crm_debug("Fence-history synced - cancel all timers");
 940 }
 941 
 942 static gboolean
 943 stonith_history_sync_set_trigger(gpointer user_data)
 944 {
 945     mainloop_set_trigger(stonith_history_sync_trigger);
 946     return FALSE;
 947 }
 948 
 949 void
 950 te_trigger_stonith_history_sync(bool long_timeout)
 951 {
  952     /* Trigger a sync in 5s to give more nodes the
  953      * chance to show up, so that we don't create
  954      * unnecessary stonith-history-sync traffic.
  955      *
  956      * The long timeout of 30s is there as a fallback:
  957      * after a successful connection to the fencer, we
  958      * wait up to 30s for the DC to trigger a history-sync.
  959      * If that doesn't happen (e.g. the fencer segfaulted
  960      * and was restarted by pacemakerd), we trigger a
  961      * sync locally.
  962      */
  963 
  964     /* Since do_stonith_history_sync() checks the fencer
  965      * connection itself, it is safe to leave the
  966      * stonith_history_sync timers & stonith_history_sync_trigger
  967      * around.
  968      */
 969     if (stonith_history_sync_trigger == NULL) {
 970         stonith_history_sync_trigger =
 971             mainloop_add_trigger(G_PRIORITY_LOW,
 972                                  do_stonith_history_sync, NULL);
 973     }
 974 
 975     if (long_timeout) {
 976         if(stonith_history_sync_timer_long == NULL) {
 977             stonith_history_sync_timer_long =
 978                 mainloop_timer_add("history_sync_long", 30000,
 979                                    FALSE, stonith_history_sync_set_trigger,
 980                                    NULL);
 981         }
 982         crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
 983         mainloop_timer_start(stonith_history_sync_timer_long);
 984     } else {
 985         if(stonith_history_sync_timer_short == NULL) {
 986             stonith_history_sync_timer_short =
 987                 mainloop_timer_add("history_sync_short", 5000,
 988                                    FALSE, stonith_history_sync_set_trigger,
 989                                    NULL);
 990         }
 991         crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
 992         mainloop_timer_start(stonith_history_sync_timer_short);
 993     }
 994 
 995 }
 996 
 997 /* end stonith history synchronization functions */
