root/daemons/controld/controld_fencing.c


DEFINITIONS

This source file includes the following definitions.
  1. update_stonith_max_attempts
  2. set_fence_reaction
  3. too_many_st_failures
  4. st_fail_count_reset
  5. st_fail_count_increment
  6. cib_fencing_updated
  7. send_stonith_update
  8. abort_for_stonith_failure
  9. add_stonith_cleanup
  10. remove_stonith_cleanup
  11. purge_stonith_cleanup
  12. execute_stonith_cleanup
  13. fail_incompletable_stonith
  14. tengine_stonith_connection_destroy
  15. tengine_stonith_notify
  16. te_connect_stonith
  17. controld_trigger_fencer_connect
  18. controld_disconnect_fencer
  19. do_stonith_history_sync
  20. tengine_stonith_callback
  21. fence_with_delay
  22. te_fence_node
  23. te_cleanup_stonith_history_sync
  24. tengine_stonith_history_synced
  25. stonith_history_sync_set_trigger
  26. te_trigger_stonith_history_sync

   1 /*
   2  * Copyright 2004-2021 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 #include <crm/crm.h>
  12 #include <crm/msg_xml.h>
  13 #include <crm/common/xml.h>
  14 #include <crm/fencing/internal.h>
  15 
  16 #include <pacemaker-controld.h>
  17 
  18 static void
  19 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event);
  20 
  21 /*
  22  * stonith failure counting
  23  *
   24  * We don't want to get stuck in a permanent fencing loop. Keep track of the
   25  * number of fencing failures for each target node, and stop restarting the
   26  * transition once a target reaches the maximum number of attempts.
  27  */
  28 
  29 struct st_fail_rec {
  30     int count;
  31 };
  32 
  33 static bool fence_reaction_panic = FALSE;
  34 static unsigned long int stonith_max_attempts = 10;
  35 static GHashTable *stonith_failures = NULL;
  36 
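      /*!
       * \internal
       * \brief Update the maximum number of fencing attempts before giving up
       *
       * \param[in] value  New maximum as a string (invalid or nonpositive values
       *                   fall back to the default of 10)
       */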
  37 void
  38 update_stonith_max_attempts(const char *value)
   39 {
   40     int score = char2score(value);
   41 
   42     // char2score() may return a negative value, so validate before assigning
   43     stonith_max_attempts = (score < 1)? 10UL : (unsigned long int) score;
   44 }
  45 
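      /*!
       * \internal
       * \brief Configure how we react to notification of our own fencing
       *
       * \param[in] reaction_s  "panic" to panic the local host, otherwise stop
       *                        (the default, warning on any other value)
       */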
  46 void
  47 set_fence_reaction(const char *reaction_s)
  48 {
  49     if (pcmk__str_eq(reaction_s, "panic", pcmk__str_casei)) {
  50         fence_reaction_panic = TRUE;
  51 
  52     } else {
  53         if (!pcmk__str_eq(reaction_s, "stop", pcmk__str_casei)) {
  54             crm_warn("Invalid value '%s' for %s, using 'stop'",
  55                      reaction_s, XML_CONFIG_ATTR_FENCE_REACTION);
  56         }
  57         fence_reaction_panic = FALSE;
  58     }
  59 }
  60 
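      /*!
       * \internal
       * \brief Check whether a fencing target has failed too many times
       *
       * \param[in] target  Name of node to check, or NULL to check all nodes
       *
       * \return TRUE if the target (or any node, if \p target is NULL) has
       *         reached stonith_max_attempts failures, otherwise FALSE
       */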
  61 static gboolean
  62 too_many_st_failures(const char *target)
  63 {
  64     GHashTableIter iter;
  65     const char *key = NULL;
  66     struct st_fail_rec *value = NULL;
  67 
  68     if (stonith_failures == NULL) {
  69         return FALSE;
  70     }
  71 
  72     if (target == NULL) {
  73         g_hash_table_iter_init(&iter, stonith_failures);
  74         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
  75                (gpointer *) &value)) {
  76 
  77             if (value->count >= stonith_max_attempts) {
  78                 target = (const char*)key;
  79                 goto too_many;
  80             }
  81         }
  82     } else {
  83         value = g_hash_table_lookup(stonith_failures, target);
  84         if ((value != NULL) && (value->count >= stonith_max_attempts)) {
  85             goto too_many;
  86         }
  87     }
  88     return FALSE;
  89 
  90 too_many:
  91     crm_warn("Too many failures (%d) to fence %s, giving up",
  92              value->count, target);
  93     return TRUE;
  94 }
  95 
  96 /*!
  97  * \internal
  98  * \brief Reset a stonith fail count
  99  *
 100  * \param[in] target  Name of node to reset, or NULL for all
 101  */
 102 void
 103 st_fail_count_reset(const char *target)
 104 {
 105     if (stonith_failures == NULL) {
 106         return;
 107     }
 108 
 109     if (target) {
 110         struct st_fail_rec *rec = NULL;
 111 
 112         rec = g_hash_table_lookup(stonith_failures, target);
 113         if (rec) {
 114             rec->count = 0;
 115         }
 116     } else {
 117         GHashTableIter iter;
 118         const char *key = NULL;
 119         struct st_fail_rec *rec = NULL;
 120 
 121         g_hash_table_iter_init(&iter, stonith_failures);
 122         while (g_hash_table_iter_next(&iter, (gpointer *) &key,
 123                                       (gpointer *) &rec)) {
 124             rec->count = 0;
 125         }
 126     }
 127 }
 128 
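      /*!
       * \internal
       * \brief Increment the fencing failure count for a target node
       *
       * \param[in] target  Name of node whose failure count should be incremented
       */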
 129 static void
 130 st_fail_count_increment(const char *target)
 131 {
 132     struct st_fail_rec *rec = NULL;
 133 
 134     if (stonith_failures == NULL) {
 135         stonith_failures = pcmk__strkey_table(free, free);
 136     }
 137 
 138     rec = g_hash_table_lookup(stonith_failures, target);
 139     if (rec) {
 140         rec->count++;
 141     } else {
 142         rec = malloc(sizeof(struct st_fail_rec));
 143         if(rec == NULL) {
 144             return;
 145         }
 146 
 147         rec->count = 1;
 148         g_hash_table_insert(stonith_failures, strdup(target), rec);
 149     }
 150 }
 151 
 152 /* end stonith fail count functions */
 153 
 154 
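      /*!
       * \internal
       * \brief CIB callback for the node state update sent after fencing
       *
       * Log the result of the update (\p user_data is the name of the fenced
       * node), and abort the transition if the update failed.
       */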
 155 static void
 156 cib_fencing_updated(xmlNode *msg, int call_id, int rc, xmlNode *output,
 157                     void *user_data)
 158 {
 159     if (rc < pcmk_ok) {
 160         crm_err("Fencing update %d for %s: failed - %s (%d)",
 161                 call_id, (char *)user_data, pcmk_strerror(rc), rc);
 162         crm_log_xml_warn(msg, "Failed update");
 163         abort_transition(INFINITY, tg_shutdown, "CIB update failed", NULL);
 164 
 165     } else {
 166         crm_info("Fencing update %d for %s: complete", call_id, (char *)user_data);
 167     }
 168 }
 169 
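      /*!
       * \internal
       * \brief Update the CIB to reflect that a node was fenced
       *
       * \param[in] action  Fencing action that succeeded (may be NULL)
       * \param[in] target  Name of node that was fenced
       * \param[in] uuid    UUID of node that was fenced
       */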
 170 static void
 171 send_stonith_update(crm_action_t *action, const char *target, const char *uuid)
 172 {
 173     int rc = pcmk_ok;
 174     crm_node_t *peer = NULL;
 175 
 176     /* We (usually) rely on the membership layer to do node_update_cluster,
 177      * and the peer status callback to do node_update_peer, because the node
 178      * might have already rejoined before we get the stonith result here.
 179      */
 180     int flags = node_update_join | node_update_expected;
 181 
 182     /* zero out the node-status & remove all LRM status info */
 183     xmlNode *node_state = NULL;
 184 
 185     CRM_CHECK(target != NULL, return);
 186     CRM_CHECK(uuid != NULL, return);
 187 
 188     /* Make sure the membership and join caches are accurate */
 189     peer = crm_get_peer_full(0, target, CRM_GET_PEER_ANY);
 190 
 191     CRM_CHECK(peer != NULL, return);
 192 
 193     if (peer->state == NULL) {
 194         /* Usually, we rely on the membership layer to update the cluster state
 195          * in the CIB. However, if the node has never been seen, do it here, so
 196          * the node is not considered unclean.
 197          */
 198         flags |= node_update_cluster;
 199     }
 200 
 201     if (peer->uuid == NULL) {
 202         crm_info("Recording uuid '%s' for node '%s'", uuid, target);
 203         peer->uuid = strdup(uuid);
 204     }
 205 
 206     crmd_peer_down(peer, TRUE);
 207 
 208     /* Generate a node state update for the CIB */
 209     node_state = create_node_state_update(peer, flags, NULL, __func__);
 210 
 211     /* we have to mark whether or not remote nodes have already been fenced */
 212     if (peer->flags & crm_remote_node) {
 213         char *now_s = pcmk__ttoa(time(NULL));
 214 
 215         crm_xml_add(node_state, XML_NODE_IS_FENCED, now_s);
 216         free(now_s);
 217     }
 218 
 219     /* Force our known ID */
 220     crm_xml_add(node_state, XML_ATTR_UUID, uuid);
 221 
 222     rc = fsa_cib_conn->cmds->update(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state,
 223                                     cib_quorum_override | cib_scope_local | cib_can_create);
 224 
 225     /* Delay processing the trigger until the update completes */
 226     crm_debug("Sending fencing update %d for %s", rc, target);
 227     fsa_register_cib_callback(rc, FALSE, strdup(target), cib_fencing_updated);
 228 
 229     /* Make sure it sticks */
 230     /* fsa_cib_conn->cmds->bump_epoch(fsa_cib_conn, cib_quorum_override|cib_scope_local);    */
 231 
 232     controld_delete_node_state(peer->uname, controld_section_all,
 233                                cib_scope_local);
 234     free_xml(node_state);
 235     return;
 236 }
 237 
 238 /*!
 239  * \internal
 240  * \brief Abort transition due to stonith failure
 241  *
 242  * \param[in] abort_action  Whether to restart or stop transition
 243  * \param[in] target  Don't restart if this (NULL for any) has too many failures
 244  * \param[in] reason  Log this stonith action XML as abort reason (or NULL)
 245  */
 246 static void
 247 abort_for_stonith_failure(enum transition_action abort_action,
 248                           const char *target, xmlNode *reason)
 249 {
 250     /* If stonith repeatedly fails, we eventually give up on starting a new
 251      * transition for that reason.
 252      */
 253     if ((abort_action != tg_stop) && too_many_st_failures(target)) {
 254         abort_action = tg_stop;
 255     }
 256     abort_transition(INFINITY, abort_action, "Stonith failed", reason);
 257 }
 258 
 259 
 260 /*
 261  * stonith cleanup list
 262  *
 263  * If the DC is shot, proper notifications might not go out.
 264  * The stonith cleanup list allows the cluster to (re-)send
 265  * notifications once a new DC is elected.
 266  */
 267 
 268 static GList *stonith_cleanup_list = NULL;
 269 
 270 /*!
 271  * \internal
 272  * \brief Add a node to the stonith cleanup list
 273  *
 274  * \param[in] target  Name of node to add
 275  */
 276 void
 277 add_stonith_cleanup(const char *target) {
 278     stonith_cleanup_list = g_list_append(stonith_cleanup_list, strdup(target));
 279 }
 280 
 281 /*!
 282  * \internal
 283  * \brief Remove a node from the stonith cleanup list
 284  *
  285  * \param[in] target  Name of node to remove
 286  */
 287 void
 288 remove_stonith_cleanup(const char *target)
 289 {
 290     GList *iter = stonith_cleanup_list;
 291 
 292     while (iter != NULL) {
 293         GList *tmp = iter;
 294         char *iter_name = tmp->data;
 295 
 296         iter = iter->next;
 297         if (pcmk__str_eq(target, iter_name, pcmk__str_casei)) {
 298             crm_trace("Removing %s from the cleanup list", iter_name);
 299             stonith_cleanup_list = g_list_delete_link(stonith_cleanup_list, tmp);
 300             free(iter_name);
 301         }
 302     }
 303 }
 304 
 305 /*!
 306  * \internal
 307  * \brief Purge all entries from the stonith cleanup list
 308  */
 309 void
  310 purge_stonith_cleanup(void)
 311 {
 312     if (stonith_cleanup_list) {
 313         GList *iter = NULL;
 314 
 315         for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 316             char *target = iter->data;
 317 
 318             crm_info("Purging %s from stonith cleanup list", target);
 319             free(target);
 320         }
 321         g_list_free(stonith_cleanup_list);
 322         stonith_cleanup_list = NULL;
 323     }
 324 }
 325 
 326 /*!
 327  * \internal
 328  * \brief Send stonith updates for all entries in cleanup list, then purge it
 329  */
 330 void
  331 execute_stonith_cleanup(void)
 332 {
 333     GList *iter;
 334 
 335     for (iter = stonith_cleanup_list; iter != NULL; iter = iter->next) {
 336         char *target = iter->data;
 337         crm_node_t *target_node = crm_get_peer(0, target);
 338         const char *uuid = crm_peer_uuid(target_node);
 339 
 340         crm_notice("Marking %s, target of a previous stonith action, as clean", target);
 341         send_stonith_update(NULL, target, uuid);
 342         free(target);
 343     }
 344     g_list_free(stonith_cleanup_list);
 345     stonith_cleanup_list = NULL;
 346 }
 347 
 348 /* end stonith cleanup list functions */
 349 
 350 
 351 /* stonith API client
 352  *
 353  * Functions that need to interact directly with the fencer via its API
 354  */
 355 
 356 static stonith_t *stonith_api = NULL;
 357 static crm_trigger_t *stonith_reconnect = NULL;
 358 static char *te_client_id = NULL;
 359 
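      /*!
       * \internal
       * \brief Fail pending fencing actions after losing the fencer connection
       *
       * Mark all unconfirmed fencing actions in the transition graph as failed,
       * and abort the transition if any were found.
       *
       * \param[in] graph  Transition graph to check
       *
       * \return TRUE if any action was failed, otherwise FALSE
       */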
 360 static gboolean
 361 fail_incompletable_stonith(crm_graph_t *graph)
 362 {
 363     GList *lpc = NULL;
 364     const char *task = NULL;
 365     xmlNode *last_action = NULL;
 366 
 367     if (graph == NULL) {
 368         return FALSE;
 369     }
 370 
 371     for (lpc = graph->synapses; lpc != NULL; lpc = lpc->next) {
 372         GList *lpc2 = NULL;
 373         synapse_t *synapse = (synapse_t *) lpc->data;
 374 
 375         if (synapse->confirmed) {
 376             continue;
 377         }
 378 
 379         for (lpc2 = synapse->actions; lpc2 != NULL; lpc2 = lpc2->next) {
 380             crm_action_t *action = (crm_action_t *) lpc2->data;
 381 
 382             if (action->type != action_type_crm || action->confirmed) {
 383                 continue;
 384             }
 385 
 386             task = crm_element_value(action->xml, XML_LRM_ATTR_TASK);
 387             if (task && pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
 388                 action->failed = TRUE;
 389                 last_action = action->xml;
 390                 update_graph(graph, action);
 391                 crm_notice("Failing action %d (%s): fencer terminated",
 392                            action->id, ID(action->xml));
 393             }
 394         }
 395     }
 396 
 397     if (last_action != NULL) {
 398         crm_warn("Fencer failure resulted in unrunnable actions");
 399         abort_for_stonith_failure(tg_restart, NULL, last_action);
 400         return TRUE;
 401     }
 402 
 403     return FALSE;
 404 }
 405 
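      /*!
       * \internal
       * \brief Handle loss of the fencer connection
       *
       * Clean up history-sync state, schedule a reconnect if fencing is still
       * required, drop stale notification registrations, and (if we are the DC)
       * fail any pending fencing actions in the transition graph.
       */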
 406 static void
 407 tengine_stonith_connection_destroy(stonith_t *st, stonith_event_t *e)
 408 {
 409     te_cleanup_stonith_history_sync(st, FALSE);
 410 
 411     if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) {
 412         crm_crit("Fencing daemon connection failed");
 413         mainloop_set_trigger(stonith_reconnect);
 414 
 415     } else {
 416         crm_info("Fencing daemon disconnected");
 417     }
 418 
 419     if (stonith_api) {
 420         /* the client API won't properly reconnect notifications
 421          * if they are still in the table - so remove them
 422          */
 423         if (stonith_api->state != stonith_disconnected) {
 424             stonith_api->cmds->disconnect(st);
 425         }
 426         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT);
 427         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE);
 428         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED);
 429     }
 430 
 431     if (AM_I_DC) {
 432         fail_incompletable_stonith(transition_graph);
 433         trigger_graph();
 434     }
 435 }
 436 
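      /*!
       * \internal
       * \brief Handle an event notification from the fencer
       *
       * \param[in] st        Fencer connection
       * \param[in] st_event  Fencing event details
       */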
 437 static void
 438 tengine_stonith_notify(stonith_t *st, stonith_event_t *st_event)
 439 {
 440     if (te_client_id == NULL) {
 441         te_client_id = crm_strdup_printf("%s.%lu", crm_system_name,
 442                                          (unsigned long) getpid());
 443     }
 444 
 445     if (st_event == NULL) {
 446         crm_err("Notify data not found");
 447         return;
 448     }
 449 
 450     crmd_alert_fencing_op(st_event);
 451 
 452     if ((st_event->result == pcmk_ok) && pcmk__str_eq("on", st_event->action, pcmk__str_casei)) {
 453         crm_notice("%s was successfully unfenced by %s (at the request of %s)",
 454                    st_event->target,
 455                    st_event->executioner? st_event->executioner : "<anyone>",
 456                    st_event->origin);
  457         /* TODO: Hook up st_event->device */
 458         return;
 459 
 460     } else if (pcmk__str_eq("on", st_event->action, pcmk__str_casei)) {
 461         crm_err("Unfencing of %s by %s failed: %s (%d)",
 462                 st_event->target,
 463                 st_event->executioner? st_event->executioner : "<anyone>",
 464                 pcmk_strerror(st_event->result), st_event->result);
 465         return;
 466 
 467     } else if ((st_event->result == pcmk_ok)
 468                && pcmk__str_eq(st_event->target, fsa_our_uname, pcmk__str_none)) {
 469 
 470         /* We were notified of our own fencing. Most likely, either fencing was
 471          * misconfigured, or fabric fencing that doesn't cut cluster
 472          * communication is in use.
 473          *
 474          * Either way, shutting down the local host is a good idea, to require
 475          * administrator intervention. Also, other nodes would otherwise likely
 476          * set our status to lost because of the fencing callback and discard
 477          * our subsequent election votes as "not part of our cluster".
 478          */
 479         crm_crit("We were allegedly just fenced by %s for %s!",
 480                  st_event->executioner? st_event->executioner : "the cluster",
 481                  st_event->origin); /* Dumps blackbox if enabled */
 482         if (fence_reaction_panic) {
 483             pcmk__panic(__func__);
 484         } else {
 485             crm_exit(CRM_EX_FATAL);
 486         }
 487         return;
 488     }
 489 
 490     /* Update the count of stonith failures for this target, in case we become
 491      * DC later. The current DC has already updated its fail count in
 492      * tengine_stonith_callback().
 493      */
 494     if (!AM_I_DC && pcmk__str_eq(st_event->operation, T_STONITH_NOTIFY_FENCE, pcmk__str_casei)) {
 495         if (st_event->result == pcmk_ok) {
 496             st_fail_count_reset(st_event->target);
 497         } else {
 498             st_fail_count_increment(st_event->target);
 499         }
 500     }
 501 
 502     crm_notice("Peer %s was%s terminated (%s) by %s on behalf of %s: %s "
 503                CRM_XS " initiator=%s ref=%s",
 504                st_event->target, st_event->result == pcmk_ok ? "" : " not",
 505                st_event->action,
 506                st_event->executioner ? st_event->executioner : "<anyone>",
 507                (st_event->client_origin? st_event->client_origin : "<unknown>"),
 508                pcmk_strerror(st_event->result),
 509                st_event->origin, st_event->id);
 510 
 511     if (st_event->result == pcmk_ok) {
 512         crm_node_t *peer = pcmk__search_known_node_cache(0, st_event->target,
 513                                                          CRM_GET_PEER_ANY);
 514         const char *uuid = NULL;
 515         gboolean we_are_executioner = pcmk__str_eq(st_event->executioner,
 516                                                    fsa_our_uname,
 517                                                    pcmk__str_casei);
 518 
 519         if (peer == NULL) {
 520             return;
 521         }
 522 
 523         uuid = crm_peer_uuid(peer);
 524 
 525         crm_trace("target=%s dc=%s", st_event->target, fsa_our_dc);
 526         if(AM_I_DC) {
 527             /* The DC always sends updates */
 528             send_stonith_update(NULL, st_event->target, uuid);
 529 
 530             /* @TODO Ideally, at this point, we'd check whether the fenced node
 531              * hosted any guest nodes, and call remote_node_down() for them.
 532              * Unfortunately, the controller doesn't have a simple, reliable way
 533              * to map hosts to guests. It might be possible to track this in the
 534              * peer cache via crm_remote_peer_cache_refresh(). For now, we rely
 535              * on the scheduler creating fence pseudo-events for the guests.
 536              */
 537 
 538             if (st_event->client_origin
 539                 && !pcmk__str_eq(st_event->client_origin, te_client_id, pcmk__str_casei)) {
 540 
 541                 /* Abort the current transition graph if it wasn't us
 542                  * that invoked stonith to fence someone
 543                  */
 544                 crm_info("External fencing operation from %s fenced %s", st_event->client_origin, st_event->target);
 545                 abort_transition(INFINITY, tg_restart, "External Fencing Operation", NULL);
 546             }
 547 
 548             /* Assume it was our leader if we don't currently have one */
 549         } else if (pcmk__str_eq(fsa_our_dc, st_event->target, pcmk__str_null_matches | pcmk__str_casei)
 550                    && !pcmk_is_set(peer->flags, crm_remote_node)) {
 551 
 552             crm_notice("Fencing target %s %s our leader",
 553                        st_event->target, (fsa_our_dc? "was" : "may have been"));
 554 
 555             /* Given the CIB resyncing that occurs around elections,
 556              * have one node update the CIB now and, if the new DC is different,
 557              * have them do so too after the election
 558              */
 559             if (we_are_executioner) {
 560                 send_stonith_update(NULL, st_event->target, uuid);
 561             }
 562             add_stonith_cleanup(st_event->target);
 563         }
 564 
 565         /* If the target is a remote node, and we host its connection,
 566          * immediately fail all monitors so it can be recovered quickly.
 567          * The connection won't necessarily drop when a remote node is fenced,
 568          * so the failure might not otherwise be detected until the next poke.
 569          */
 570         if (pcmk_is_set(peer->flags, crm_remote_node)) {
 571             remote_ra_fail(st_event->target);
 572         }
 573 
 574         crmd_peer_down(peer, TRUE);
  575     }
 576 }
 577 
 578 /*!
 579  * \brief Connect to fencer
 580  *
 581  * \param[in] user_data  If NULL, retry failures now, otherwise retry in main loop
 582  *
 583  * \return TRUE
 584  * \note If user_data is NULL, this will wait 2s between attempts, for up to
 585  *       30 attempts, meaning the controller could be blocked as long as 58s.
 586  */
 587 static gboolean
 588 te_connect_stonith(gpointer user_data)
 589 {
 590     int rc = pcmk_ok;
 591 
 592     if (stonith_api == NULL) {
 593         stonith_api = stonith_api_new();
 594         if (stonith_api == NULL) {
 595             crm_err("Could not connect to fencer: API memory allocation failed");
 596             return TRUE;
 597         }
 598     }
 599 
 600     if (stonith_api->state != stonith_disconnected) {
 601         crm_trace("Already connected to fencer, no need to retry");
 602         return TRUE;
 603     }
 604 
 605     if (user_data == NULL) {
 606         // Blocking (retry failures now until successful)
 607         rc = stonith_api_connect_retry(stonith_api, crm_system_name, 30);
 608         if (rc != pcmk_ok) {
 609             crm_err("Could not connect to fencer in 30 attempts: %s "
 610                     CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 611         }
 612     } else {
 613         // Non-blocking (retry failures later in main loop)
 614         rc = stonith_api->cmds->connect(stonith_api, crm_system_name, NULL);
 615         if (rc != pcmk_ok) {
 616             if (pcmk_is_set(fsa_input_register, R_ST_REQUIRED)) {
 617                 crm_notice("Fencer connection failed (will retry): %s "
 618                            CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 619                 mainloop_set_trigger(stonith_reconnect);
 620             } else {
 621                 crm_info("Fencer connection failed (ignoring because no longer required): %s "
 622                          CRM_XS " rc=%d", pcmk_strerror(rc), rc);
 623             }
 624             return TRUE;
 625         }
 626     }
 627 
 628     if (rc == pcmk_ok) {
 629         stonith_api->cmds->register_notification(stonith_api,
 630                                                  T_STONITH_NOTIFY_DISCONNECT,
 631                                                  tengine_stonith_connection_destroy);
 632         stonith_api->cmds->register_notification(stonith_api,
 633                                                  T_STONITH_NOTIFY_FENCE,
 634                                                  tengine_stonith_notify);
 635         stonith_api->cmds->register_notification(stonith_api,
 636                                                  T_STONITH_NOTIFY_HISTORY_SYNCED,
 637                                                  tengine_stonith_history_synced);
 638         te_trigger_stonith_history_sync(TRUE);
 639         crm_notice("Fencer successfully connected");
 640     }
 641 
 642     return TRUE;
 643 }
 644 
  645 /*!
  646  * \internal
  647  * \brief Schedule fencer connection attempt in main loop
  648  */
 649 void
  650 controld_trigger_fencer_connect(void)
 651 {
 652     if (stonith_reconnect == NULL) {
 653         stonith_reconnect = mainloop_add_trigger(G_PRIORITY_LOW,
 654                                                  te_connect_stonith,
 655                                                  GINT_TO_POINTER(TRUE));
 656     }
 657     controld_set_fsa_input_flags(R_ST_REQUIRED);
 658     mainloop_set_trigger(stonith_reconnect);
 659 }
 660 
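      /*!
       * \internal
       * \brief Disconnect from the fencer
       *
       * \param[in] destroy  Whether to also free the connection and related state
       */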
 661 void
 662 controld_disconnect_fencer(bool destroy)
 663 {
 664     if (stonith_api) {
 665         // Prevent fencer connection from coming up again
 666         controld_clear_fsa_input_flags(R_ST_REQUIRED);
 667 
 668         if (stonith_api->state != stonith_disconnected) {
 669             stonith_api->cmds->disconnect(stonith_api);
 670         }
 671         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_DISCONNECT);
 672         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_FENCE);
 673         stonith_api->cmds->remove_notification(stonith_api, T_STONITH_NOTIFY_HISTORY_SYNCED);
 674     }
 675     if (destroy) {
 676         if (stonith_api) {
 677             stonith_api->cmds->free(stonith_api);
 678             stonith_api = NULL;
 679         }
 680         if (stonith_reconnect) {
 681             mainloop_destroy_trigger(stonith_reconnect);
 682             stonith_reconnect = NULL;
 683         }
 684         if (te_client_id) {
 685             free(te_client_id);
 686             te_client_id = NULL;
 687         }
 688     }
 689 }
 690 
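      /*!
       * \internal
       * \brief Ask the fencer to broadcast its fencing history
       *
       * \param[in] user_data  Ignored
       *
       * \return TRUE if a history synchronization was requested, otherwise FALSE
       */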
 691 static gboolean
 692 do_stonith_history_sync(gpointer user_data)
 693 {
 694     if (stonith_api && (stonith_api->state != stonith_disconnected)) {
 695         stonith_history_t *history = NULL;
 696 
 697         te_cleanup_stonith_history_sync(stonith_api, FALSE);
 698         stonith_api->cmds->history(stonith_api,
 699                                    st_opt_sync_call | st_opt_broadcast,
 700                                    NULL, &history, 5);
 701         stonith_history_free(history);
 702         return TRUE;
 703     } else {
  704         crm_info("Skipping fence history sync because fencer is disconnected");
 705         return FALSE;
 706     }
 707 }
 708 
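      /*!
       * \internal
       * \brief Handle the result of a fencing request made from this node
       *
       * Match the result to the corresponding action in the current transition
       * graph, record success or failure, and continue or abort the transition
       * accordingly.
       */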
 709 static void
 710 tengine_stonith_callback(stonith_t *stonith, stonith_callback_data_t *data)
 711 {
 712     char *uuid = NULL;
 713     int stonith_id = -1;
 714     int transition_id = -1;
 715     crm_action_t *action = NULL;
 716     int call_id = data->call_id;
 717     int rc = data->rc;
 718     char *userdata = data->userdata;
 719 
 720     CRM_CHECK(userdata != NULL, return);
 721     crm_notice("Stonith operation %d/%s: %s (%d)", call_id, (char *)userdata,
 722                pcmk_strerror(rc), rc);
 723 
 724     if (AM_I_DC == FALSE) {
 725         return;
 726     }
 727 
 728     /* crm_info("call=%d, optype=%d, node_name=%s, result=%d, node_list=%s, action=%s", */
 729     /*       op->call_id, op->optype, op->node_name, op->op_result, */
 730     /*       (char *)op->node_list, op->private_data); */
 731 
 732     /* filter out old STONITH actions */
 733     CRM_CHECK(decode_transition_key(userdata, &uuid, &transition_id, &stonith_id, NULL),
 734               goto bail);
 735 
 736     if (transition_graph->complete || stonith_id < 0 || !pcmk__str_eq(uuid, te_uuid, pcmk__str_casei)
 737         || transition_graph->id != transition_id) {
 738         crm_info("Ignoring STONITH action initiated outside of the current transition");
 739         goto bail;
 740     }
 741 
 742     action = controld_get_action(stonith_id);
 743     if (action == NULL) {
 744         crm_err("Stonith action not matched");
 745         goto bail;
 746     }
 747 
 748     stop_te_timer(action->timer);
 749     if (rc == pcmk_ok) {
 750         const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 751         const char *uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 752         const char *op = crm_meta_value(action->params, "stonith_action");
 753 
 754         crm_info("Stonith operation %d for %s passed", call_id, target);
 755         if (action->confirmed == FALSE) {
 756             te_action_confirmed(action, NULL);
 757             if (pcmk__str_eq("on", op, pcmk__str_casei)) {
 758                 const char *value = NULL;
 759                 char *now = pcmk__ttoa(time(NULL));
 760                 gboolean is_remote_node = FALSE;
 761 
 762                 /* This check is not 100% reliable, since this node is not
 763                  * guaranteed to have the remote node cached. However, it
 764                  * doesn't have to be reliable, since the attribute manager can
 765                  * learn a node's "remoteness" by other means sooner or later.
 766                  * This allows it to learn more quickly if this node does have
 767                  * the information.
 768                  */
 769                 if (g_hash_table_lookup(crm_remote_peer_cache, uuid) != NULL) {
 770                     is_remote_node = TRUE;
 771                 }
 772 
 773                 update_attrd(target, CRM_ATTR_UNFENCED, now, NULL,
 774                              is_remote_node);
 775                 free(now);
 776 
 777                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_ALL);
 778                 update_attrd(target, CRM_ATTR_DIGESTS_ALL, value, NULL,
 779                              is_remote_node);
 780 
 781                 value = crm_meta_value(action->params, XML_OP_ATTR_DIGESTS_SECURE);
 782                 update_attrd(target, CRM_ATTR_DIGESTS_SECURE, value, NULL,
 783                              is_remote_node);
 784 
 785             } else if (action->sent_update == FALSE) {
 786                 send_stonith_update(action, target, uuid);
 787                 action->sent_update = TRUE;
 788             }
 789         }
 790         st_fail_count_reset(target);
 791 
 792     } else {
 793         const char *target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 794         enum transition_action abort_action = tg_restart;
 795 
 796         action->failed = TRUE;
 797         crm_notice("Stonith operation %d for %s failed (%s): aborting transition.",
 798                    call_id, target, pcmk_strerror(rc));
 799 
 800         /* If no fence devices were available, there's no use in immediately
 801          * checking again, so don't start a new transition in that case.
 802          */
 803         if (rc == -ENODEV) {
 804             crm_warn("No devices found in cluster to fence %s, giving up",
 805                      target);
 806             abort_action = tg_stop;
 807         }
 808 
 809         /* Increment the fail count now, so abort_for_stonith_failure() can
 810          * check it. Non-DC nodes will increment it in tengine_stonith_notify().
 811          */
 812         st_fail_count_increment(target);
 813         abort_for_stonith_failure(abort_action, target, NULL);
 814     }
 815 
 816     update_graph(transition_graph, action);
 817     trigger_graph();
 818 
 819   bail:
 820     free(userdata);
 821     free(uuid);
 822     return;
 823 }
 824 
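      /*!
       * \internal
       * \brief Ask the fencer to fence a node, with an optional delay
       *
       * \param[in] target  Name of node to fence
       * \param[in] type    Fencing action to request
       * \param[in] delay   Requested fencing delay as a string (NULL for none)
       *
       * \return Result of the asynchronous fencing request (as returned by the
       *         fencer API's fence_with_delay() method)
       */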
 825 static int
 826 fence_with_delay(const char *target, const char *type, const char *delay)
 827 {
 828     uint32_t options = st_opt_none; // Group of enum stonith_call_options
 829     int timeout_sec = (int) (transition_graph->stonith_timeout / 1000);
 830     int delay_i;
 831 
 832     if (crmd_join_phase_count(crm_join_confirmed) == 1) {
 833         stonith__set_call_options(options, target, st_opt_allow_suicide);
 834     }
 835     pcmk__scan_min_int(delay, &delay_i, 0);
 836     return stonith_api->cmds->fence_with_delay(stonith_api, options, target,
 837                                                type, timeout_sec, 0, delay_i);
 838 }
 839 
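      /*!
       * \internal
       * \brief Execute a fencing action from the transition graph
       *
       * \param[in] graph   Transition graph containing the action
       * \param[in] action  Fencing action to execute
       *
       * \return TRUE if the fencing request was initiated, or FALSE if the
       *         action was invalid
       */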
 840 gboolean
 841 te_fence_node(crm_graph_t *graph, crm_action_t *action)
 842 {
 843     int rc = 0;
 844     const char *id = NULL;
 845     const char *uuid = NULL;
 846     const char *target = NULL;
 847     const char *type = NULL;
 848     char *transition_key = NULL;
 849     const char *priority_delay = NULL;
 850     gboolean invalid_action = FALSE;
 851 
 852     id = ID(action->xml);
 853     target = crm_element_value(action->xml, XML_LRM_ATTR_TARGET);
 854     uuid = crm_element_value(action->xml, XML_LRM_ATTR_TARGET_UUID);
 855     type = crm_meta_value(action->params, "stonith_action");
 856 
 857     CRM_CHECK(id != NULL, invalid_action = TRUE);
 858     CRM_CHECK(uuid != NULL, invalid_action = TRUE);
 859     CRM_CHECK(type != NULL, invalid_action = TRUE);
 860     CRM_CHECK(target != NULL, invalid_action = TRUE);
 861 
 862     if (invalid_action) {
 863         crm_log_xml_warn(action->xml, "BadAction");
 864         return FALSE;
 865     }
 866 
 867     priority_delay = crm_meta_value(action->params, XML_CONFIG_ATTR_PRIORITY_FENCING_DELAY);
 868 
 869     crm_notice("Requesting fencing (%s) of node %s "
 870                CRM_XS " action=%s timeout=%u%s%s",
 871                type, target, id, transition_graph->stonith_timeout,
 872                priority_delay ? " priority_delay=" : "",
 873                priority_delay ? priority_delay : "");
 874 
 875     /* Passing NULL means block until we can connect... */
 876     te_connect_stonith(NULL);
 877 
 878     rc = fence_with_delay(target, type, priority_delay);
 879     transition_key = pcmk__transition_key(transition_graph->id, action->id, 0,
  880                                           te_uuid);
 881     stonith_api->cmds->register_callback(stonith_api, rc,
 882                                          (int) (transition_graph->stonith_timeout / 1000),
 883                                          st_opt_timeout_updates, transition_key,
 884                                          "tengine_stonith_callback", tengine_stonith_callback);
 885 
 886     return TRUE;
 887 }
 888 
 889 /* end stonith API client functions */
 890 
 891 
 892 /*
 893  * stonith history synchronization
 894  *
 895  * Each node's fencer keeps track of a cluster-wide fencing history. When a node
 896  * joins or leaves, we need to synchronize the history across all nodes.
 897  */
 898 
 899 static crm_trigger_t *stonith_history_sync_trigger = NULL;
 900 static mainloop_timer_t *stonith_history_sync_timer_short = NULL;
 901 static mainloop_timer_t *stonith_history_sync_timer_long = NULL;
 902 
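      /*!
       * \internal
       * \brief Stop or free the fence-history-sync timers, and drop the
       *        history-synced notification registration
       *
       * \param[in] st           Fencer connection (may be NULL)
       * \param[in] free_timers  Whether to free the timers instead of just
       *                         stopping them
       */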
 903 void
 904 te_cleanup_stonith_history_sync(stonith_t *st, bool free_timers)
 905 {
 906     if (free_timers) {
 907         mainloop_timer_del(stonith_history_sync_timer_short);
 908         stonith_history_sync_timer_short = NULL;
 909         mainloop_timer_del(stonith_history_sync_timer_long);
 910         stonith_history_sync_timer_long = NULL;
 911     } else {
 912         mainloop_timer_stop(stonith_history_sync_timer_short);
 913         mainloop_timer_stop(stonith_history_sync_timer_long);
 914     }
 915 
 916     if (st) {
 917         st->cmds->remove_notification(st, T_STONITH_NOTIFY_HISTORY_SYNCED);
 918     }
 919 }
 920 
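      /*!
       * \internal
       * \brief Handle notification that the fence history has been synchronized
       */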
 921 static void
 922 tengine_stonith_history_synced(stonith_t *st, stonith_event_t *st_event)
 923 {
 924     te_cleanup_stonith_history_sync(st, FALSE);
 925     crm_debug("Fence-history synced - cancel all timers");
 926 }
 927 
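      /*!
       * \internal
       * \brief Timer callback that fires the fence-history-sync trigger
       *
       * \return FALSE (do not reschedule the timer)
       */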
 928 static gboolean
 929 stonith_history_sync_set_trigger(gpointer user_data)
 930 {
 931     mainloop_set_trigger(stonith_history_sync_trigger);
 932     return FALSE;
 933 }
 934 
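      /*!
       * \internal
       * \brief Schedule a cluster-wide synchronization of the fence history
       *
       * \param[in] long_timeout  Whether to use the 30s fallback timer instead
       *                          of the 5s timer
       */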
 935 void
 936 te_trigger_stonith_history_sync(bool long_timeout)
 937 {
 938     /* trigger a sync in 5s to give more nodes the
 939      * chance to show up so that we don't create
 940      * unnecessary stonith-history-sync traffic
 941      *
 942      * the long timeout of 30s is there as a fallback
 943      * so that after a successful connection to fenced
 944      * we will wait for 30s for the DC to trigger a
 945      * history-sync
 946      * if this doesn't happen we trigger a sync locally
 947      * (e.g. fenced segfaults and is restarted by pacemakerd)
 948      */
 949 
 950     /* as we are finally checking the stonith-connection
 951      * in do_stonith_history_sync we should be fine
 952      * leaving stonith_history_sync_time & stonith_history_sync_trigger
 953      * around
 954      */
 955     if (stonith_history_sync_trigger == NULL) {
 956         stonith_history_sync_trigger =
 957             mainloop_add_trigger(G_PRIORITY_LOW,
 958                                  do_stonith_history_sync, NULL);
 959     }
 960 
 961     if (long_timeout) {
 962         if(stonith_history_sync_timer_long == NULL) {
 963             stonith_history_sync_timer_long =
 964                 mainloop_timer_add("history_sync_long", 30000,
 965                                    FALSE, stonith_history_sync_set_trigger,
 966                                    NULL);
 967         }
 968         crm_info("Fence history will be synchronized cluster-wide within 30 seconds");
 969         mainloop_timer_start(stonith_history_sync_timer_long);
 970     } else {
 971         if(stonith_history_sync_timer_short == NULL) {
 972             stonith_history_sync_timer_short =
 973                 mainloop_timer_add("history_sync_short", 5000,
 974                                    FALSE, stonith_history_sync_set_trigger,
 975                                    NULL);
 976         }
 977         crm_info("Fence history will be synchronized cluster-wide within 5 seconds");
 978         mainloop_timer_start(stonith_history_sync_timer_short);
 979     }
 980 
 981 }
 982 
 983 /* end stonith history synchronization functions */
