root/daemons/controld/controld_join_dc.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. join_phase_text
  2. controld_destroy_failed_sync_table
  3. controld_remove_failed_sync_node
  4. record_failed_sync_node
  5. lookup_failed_sync_node
  6. crm_update_peer_join
  7. start_join_round
  8. create_dc_message
  9. join_make_offer
  10. do_dc_join_offer_all
  11. do_dc_join_offer_one
  12. compare_int_fields
  13. do_dc_join_filter_offer
  14. do_dc_join_finalize
  15. free_max_generation
  16. finalize_sync_callback
  17. join_node_state_commit_callback
  18. do_dc_join_ack
  19. finalize_join_for
  20. check_join_state
  21. do_dc_join_final
  22. crmd_join_phase_count
  23. crmd_join_phase_log

   1 /*
   2  * Copyright 2004-2024 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <inttypes.h>               // PRIu32
  13 #include <stdbool.h>                // bool, true, false
  14 #include <stdio.h>                  // NULL
  15 #include <stdlib.h>                 // free(), etc.
  16 
  17 #include <glib.h>                   // gboolean, etc.
  18 #include <libxml/tree.h>            // xmlNode
  19 
  20 #include <crm/crm.h>
  21 
  22 #include <crm/common/xml.h>
  23 #include <crm/cluster.h>
  24 
  25 #include <pacemaker-controld.h>
  26 
  27 static char *max_generation_from = NULL;
  28 static xmlNodePtr max_generation_xml = NULL;
  29 
  30 /*!
  31  * \internal
  32  * \brief Nodes from which a CIB sync has failed since the peer joined
  33  *
  34  * This table is of the form (<tt>node_name -> join_id</tt>). \p node_name is
  35  * the name of a client node from which a CIB \p sync_from() call has failed in
  36  * \p do_dc_join_finalize() since the client joined the cluster as a peer.
  37  * \p join_id is the ID of the join round in which the \p sync_from() failed,
  38  * and is intended for use in nack log messages.
  39  */
  40 static GHashTable *failed_sync_nodes = NULL;
  41 
  42 void finalize_join_for(gpointer key, gpointer value, gpointer user_data);
  43 void finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data);
  44 gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source);
  45 
  46 /* Numeric counter used to identify join rounds (an unsigned int would be
  47  * appropriate, except we get and set it in XML as int)
  48  */
  49 static int current_join_id = 0;
  50 
  51 /*!
  52  * \internal
  53  * \brief Get log-friendly string equivalent of a controller group join phase
  54  *
  55  * \param[in] phase  Join phase
  56  *
  57  * \return Log-friendly string equivalent of \p phase
  58  */
  59 static const char *
  60 join_phase_text(enum controld_join_phase phase)
     /* [previous][next][first][last][top][bottom][index][help] */
  61 {
  62     switch (phase) {
  63         case controld_join_nack:
  64             return "nack";
  65         case controld_join_none:
  66             return "none";
  67         case controld_join_welcomed:
  68             return "welcomed";
  69         case controld_join_integrated:
  70             return "integrated";
  71         case controld_join_finalized:
  72             return "finalized";
  73         case controld_join_confirmed:
  74             return "confirmed";
  75         default:
  76             return "invalid";
  77     }
  78 }
  79 
  80 /*!
  81  * \internal
  82  * \brief Destroy the hash table containing failed sync nodes
  83  */
  84 void
  85 controld_destroy_failed_sync_table(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  86 {
  87     if (failed_sync_nodes != NULL) {
  88         g_hash_table_destroy(failed_sync_nodes);
  89         failed_sync_nodes = NULL;
  90     }
  91 }
  92 
  93 /*!
  94  * \internal
  95  * \brief Remove a node from the failed sync nodes table if present
  96  *
  97  * \param[in] node_name  Node name to remove
  98  */
  99 void
 100 controld_remove_failed_sync_node(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
 101 {
 102     if (failed_sync_nodes != NULL) {
 103         g_hash_table_remove(failed_sync_nodes, (gchar *) node_name);
 104     }
 105 }
 106 
 107 /*!
 108  * \internal
 109  * \brief Add to a hash table a node whose CIB failed to sync
 110  *
 111  * \param[in] node_name  Name of node whose CIB failed to sync
 112  * \param[in] join_id    Join round when the failure occurred
 113  */
 114 static void
 115 record_failed_sync_node(const char *node_name, gint join_id)
     /* [previous][next][first][last][top][bottom][index][help] */
 116 {
 117     if (failed_sync_nodes == NULL) {
 118         failed_sync_nodes = pcmk__strikey_table(g_free, NULL);
 119     }
 120 
 121     /* If the node is already in the table then we failed to nack it during the
 122      * filter offer step
 123      */
 124     CRM_LOG_ASSERT(g_hash_table_insert(failed_sync_nodes, g_strdup(node_name),
 125                                        GINT_TO_POINTER(join_id)));
 126 }
 127 
 128 /*!
 129  * \internal
 130  * \brief Look up a node name in the failed sync table
 131  *
 132  * \param[in]  node_name  Name of node to look up
 133  * \param[out] join_id    Where to store the join ID of when the sync failed
 134  *
 135  * \return Standard Pacemaker return code. Specifically, \p pcmk_rc_ok if the
 136  *         node name was found, or \p pcmk_rc_node_unknown otherwise.
 137  * \note \p *join_id is set to -1 if the node is not found.
 138  */
 139 static int
 140 lookup_failed_sync_node(const char *node_name, gint *join_id)
     /* [previous][next][first][last][top][bottom][index][help] */
 141 {
 142     *join_id = -1;
 143 
 144     if (failed_sync_nodes != NULL) {
 145         gpointer result = g_hash_table_lookup(failed_sync_nodes,
 146                                               (gchar *) node_name);
 147         if (result != NULL) {
 148             *join_id = GPOINTER_TO_INT(result);
 149             return pcmk_rc_ok;
 150         }
 151     }
 152     return pcmk_rc_node_unknown;
 153 }
 154 
 155 void
 156 crm_update_peer_join(const char *source, pcmk__node_status_t *node,
     /* [previous][next][first][last][top][bottom][index][help] */
 157                      enum controld_join_phase phase)
 158 {
 159     enum controld_join_phase last = controld_get_join_phase(node);
 160 
 161     CRM_CHECK(node != NULL, return);
 162 
 163     /* Remote nodes do not participate in joins */
 164     if (pcmk_is_set(node->flags, pcmk__node_status_remote)) {
 165         return;
 166     }
 167 
 168     if (phase == last) {
 169         crm_trace("Node %s join-%d phase is still %s "
 170                   QB_XS " nodeid=%" PRIu32 " source=%s",
 171                   node->name, current_join_id, join_phase_text(last),
 172                   node->cluster_layer_id, source);
 173         return;
 174     }
 175 
 176     if ((phase <= controld_join_none) || (phase == (last + 1))) {
 177         struct controld_node_status_data *data = NULL;
 178 
 179         if (node->user_data == NULL) {
 180             node->user_data =
 181                 pcmk__assert_alloc(1, sizeof(struct controld_node_status_data));
 182         }
 183         data = node->user_data;
 184         data->join_phase = phase;
 185 
 186         crm_trace("Node %s join-%d phase is now %s (was %s) "
 187                   QB_XS " nodeid=%" PRIu32 " source=%s",
 188                   node->name, current_join_id, join_phase_text(phase),
 189                   join_phase_text(last), node->cluster_layer_id,
 190                   source);
 191         return;
 192     }
 193 
 194     crm_warn("Rejecting join-%d phase update for node %s because can't go from "
 195              "%s to %s " QB_XS " nodeid=%" PRIu32 " source=%s",
 196              current_join_id, node->name, join_phase_text(last),
 197              join_phase_text(phase), node->cluster_layer_id, source);
 198 }
 199 
 200 static void
 201 start_join_round(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 202 {
 203     GHashTableIter iter;
 204     pcmk__node_status_t *peer = NULL;
 205 
 206     crm_debug("Starting new join round join-%d", current_join_id);
 207 
 208     g_hash_table_iter_init(&iter, pcmk__peer_cache);
 209     while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) {
 210         crm_update_peer_join(__func__, peer, controld_join_none);
 211     }
 212     if (max_generation_from != NULL) {
 213         free(max_generation_from);
 214         max_generation_from = NULL;
 215     }
 216     if (max_generation_xml != NULL) {
 217         pcmk__xml_free(max_generation_xml);
 218         max_generation_xml = NULL;
 219     }
 220     controld_clear_fsa_input_flags(R_HAVE_CIB);
 221 }
 222 
 223 /*!
 224  * \internal
 225  * \brief Create a join message from the DC
 226  *
 227  * \param[in] join_op  Join operation name
 228  * \param[in] host_to  Recipient of message
 229  */
 230 static xmlNode *
 231 create_dc_message(const char *join_op, const char *host_to)
     /* [previous][next][first][last][top][bottom][index][help] */
 232 {
 233     xmlNode *msg = pcmk__new_request(pcmk_ipc_controld, CRM_SYSTEM_DC, host_to,
 234                                      CRM_SYSTEM_CRMD, join_op, NULL);
 235 
 236     /* Identify which election this is a part of */
 237     crm_xml_add_int(msg, PCMK__XA_JOIN_ID, current_join_id);
 238 
 239     /* Add a field specifying whether the DC is shutting down. This keeps the
 240      * joining node from fencing the old DC if it becomes the new DC.
 241      */
 242     pcmk__xe_set_bool_attr(msg, PCMK__XA_DC_LEAVING,
 243                            pcmk_is_set(controld_globals.fsa_input_register,
 244                                        R_SHUTDOWN));
 245     return msg;
 246 }
 247 
 248 static void
 249 join_make_offer(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 250 {
 251     /* @TODO We don't use user_data except to distinguish one particular call
 252      * from others. Make this clearer.
 253      */
 254     xmlNode *offer = NULL;
 255     pcmk__node_status_t *member = (pcmk__node_status_t *) value;
 256 
 257     pcmk__assert(member != NULL);
 258     if (!pcmk__cluster_is_node_active(member)) {
 259         crm_info("Not making join-%d offer to inactive node %s",
 260                  current_join_id, pcmk__s(member->name, "with unknown name"));
 261         if ((member->expected == NULL)
 262             && pcmk__str_eq(member->state, PCMK__VALUE_LOST, pcmk__str_none)) {
 263             /* You would think this unsafe, but in fact this plus an
 264              * active resource is what causes it to be fenced.
 265              *
 266              * Yes, this does mean that any node that dies at the same
 267              * time as the old DC and is not running resource (still)
 268              * won't be fenced.
 269              *
 270              * I'm not happy about this either.
 271              */
 272             pcmk__update_peer_expected(__func__, member, CRMD_JOINSTATE_DOWN);
 273         }
 274         return;
 275     }
 276 
 277     if (member->name == NULL) {
 278         crm_info("Not making join-%d offer to node uuid %s with unknown name",
 279                  current_join_id, member->xml_id);
 280         return;
 281     }
 282 
 283     if (controld_globals.membership_id != controld_globals.peer_seq) {
 284         controld_globals.membership_id = controld_globals.peer_seq;
 285         crm_info("Making join-%d offers based on membership event %llu",
 286                  current_join_id, controld_globals.peer_seq);
 287     }
 288 
 289     if (user_data != NULL) {
 290         enum controld_join_phase phase = controld_get_join_phase(member);
 291 
 292         if (phase > controld_join_none) {
 293             crm_info("Not making join-%d offer to already known node %s (%s)",
 294                      current_join_id, member->name, join_phase_text(phase));
 295             return;
 296         }
 297     }
 298 
 299     crm_update_peer_join(__func__, (pcmk__node_status_t*) member,
 300                          controld_join_none);
 301 
 302     offer = create_dc_message(CRM_OP_JOIN_OFFER, member->name);
 303 
 304     // Advertise our feature set so the joining node can bail if not compatible
 305     crm_xml_add(offer, PCMK_XA_CRM_FEATURE_SET, CRM_FEATURE_SET);
 306 
 307     crm_info("Sending join-%d offer to %s", current_join_id, member->name);
 308     pcmk__cluster_send_message(member, pcmk_ipc_controld, offer);
 309     pcmk__xml_free(offer);
 310 
 311     crm_update_peer_join(__func__, member, controld_join_welcomed);
 312 }
 313 
 314 /*       A_DC_JOIN_OFFER_ALL    */
 315 void
 316 do_dc_join_offer_all(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
 317                      enum crmd_fsa_cause cause,
 318                      enum crmd_fsa_state cur_state,
 319                      enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 320 {
 321     int count;
 322 
 323     /* Reset everyone's status back to down or in_ccm in the CIB.
 324      * Any nodes that are active in the CIB but not in the cluster membership
 325      * will be seen as offline by the scheduler anyway.
 326      */
 327     current_join_id++;
 328     start_join_round();
 329 
 330     update_dc(NULL);
 331     if (cause == C_HA_MESSAGE && current_input == I_NODE_JOIN) {
 332         crm_info("A new node joined the cluster");
 333     }
 334     g_hash_table_foreach(pcmk__peer_cache, join_make_offer, NULL);
 335 
 336     count = crmd_join_phase_count(controld_join_welcomed);
 337     crm_info("Waiting on join-%d requests from %d outstanding node%s",
 338              current_join_id, count, pcmk__plural_s(count));
 339 
 340     // Don't waste time by invoking the scheduler yet
 341 }
 342 
 343 /*       A_DC_JOIN_OFFER_ONE    */
 344 void
 345 do_dc_join_offer_one(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
 346                      enum crmd_fsa_cause cause,
 347                      enum crmd_fsa_state cur_state,
 348                      enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 349 {
 350     pcmk__node_status_t *member = NULL;
 351     ha_msg_input_t *welcome = NULL;
 352     int count;
 353     const char *join_to = NULL;
 354 
 355     if (msg_data->data == NULL) {
 356         crm_info("Making join-%d offers to any unconfirmed nodes "
 357                  "because an unknown node joined", current_join_id);
 358         g_hash_table_foreach(pcmk__peer_cache, join_make_offer, &member);
 359         check_join_state(cur_state, __func__);
 360         return;
 361     }
 362 
 363     welcome = fsa_typed_data(fsa_dt_ha_msg);
 364     if (welcome == NULL) {
 365         // fsa_typed_data() already logged an error
 366         return;
 367     }
 368 
 369     join_to = crm_element_value(welcome->msg, PCMK__XA_SRC);
 370     if (join_to == NULL) {
 371         crm_err("Can't make join-%d offer to unknown node", current_join_id);
 372         return;
 373     }
 374     member = pcmk__get_node(0, join_to, NULL, pcmk__node_search_cluster_member);
 375 
 376     /* It is possible that a node will have been sick or starting up when the
 377      * original offer was made. However, it will either re-announce itself in
 378      * due course, or we can re-store the original offer on the client.
 379      */
 380 
 381     crm_update_peer_join(__func__, member, controld_join_none);
 382     join_make_offer(NULL, member, NULL);
 383 
 384     /* If the offer isn't to the local node, make an offer to the local node as
 385      * well, to ensure the correct value for max_generation_from.
 386      */
 387     if (!controld_is_local_node(join_to)) {
 388         member = controld_get_local_node_status();
 389         join_make_offer(NULL, member, NULL);
 390     }
 391 
 392     /* This was a genuine join request; cancel any existing transition and
 393      * invoke the scheduler.
 394      */
 395     abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart, "Node join",
 396                      NULL);
 397 
 398     count = crmd_join_phase_count(controld_join_welcomed);
 399     crm_info("Waiting on join-%d requests from %d outstanding node%s",
 400              current_join_id, count, pcmk__plural_s(count));
 401 
 402     // Don't waste time by invoking the scheduler yet
 403 }
 404 
 405 static int
 406 compare_int_fields(xmlNode * left, xmlNode * right, const char *field)
     /* [previous][next][first][last][top][bottom][index][help] */
 407 {
 408     const char *elem_l = crm_element_value(left, field);
 409     const char *elem_r = crm_element_value(right, field);
 410 
 411     long long int_elem_l;
 412     long long int_elem_r;
 413 
 414     int rc = pcmk_rc_ok;
 415 
 416     rc = pcmk__scan_ll(elem_l, &int_elem_l, -1LL);
 417     if (rc != pcmk_rc_ok) { // Shouldn't be possible
 418         crm_warn("Comparing current CIB %s as -1 "
 419                  "because '%s' is not an integer", field, elem_l);
 420     }
 421 
 422     rc = pcmk__scan_ll(elem_r, &int_elem_r, -1LL);
 423     if (rc != pcmk_rc_ok) { // Shouldn't be possible
 424         crm_warn("Comparing joining node's CIB %s as -1 "
 425                  "because '%s' is not an integer", field, elem_r);
 426     }
 427 
 428     if (int_elem_l < int_elem_r) {
 429         return -1;
 430 
 431     } else if (int_elem_l > int_elem_r) {
 432         return 1;
 433     }
 434 
 435     return 0;
 436 }
 437 
 438 /*       A_DC_JOIN_PROCESS_REQ  */
 439 void
 440 do_dc_join_filter_offer(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
 441                         enum crmd_fsa_cause cause,
 442                         enum crmd_fsa_state cur_state,
 443                         enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 444 {
 445     xmlNode *generation = NULL;
 446 
 447     int cmp = 0;
 448     int join_id = -1;
 449     int count = 0;
 450     gint value = 0;
 451     gboolean ack_nack_bool = TRUE;
 452     ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg);
 453 
 454     const char *join_from = crm_element_value(join_ack->msg, PCMK__XA_SRC);
 455     const char *ref = crm_element_value(join_ack->msg, PCMK_XA_REFERENCE);
 456     const char *join_version = crm_element_value(join_ack->msg,
 457                                                  PCMK_XA_CRM_FEATURE_SET);
 458     pcmk__node_status_t *join_node = NULL;
 459 
 460     if (join_from == NULL) {
 461         crm_err("Ignoring invalid join request without node name");
 462         return;
 463     }
 464     join_node = pcmk__get_node(0, join_from, NULL,
 465                                pcmk__node_search_cluster_member);
 466 
 467     crm_element_value_int(join_ack->msg, PCMK__XA_JOIN_ID, &join_id);
 468     if (join_id != current_join_id) {
 469         crm_debug("Ignoring join-%d request from %s because we are on join-%d",
 470                   join_id, join_from, current_join_id);
 471         check_join_state(cur_state, __func__);
 472         return;
 473     }
 474 
 475     generation = join_ack->xml;
 476     if (max_generation_xml != NULL && generation != NULL) {
 477         int lpc = 0;
 478 
 479         const char *attributes[] = {
 480             PCMK_XA_ADMIN_EPOCH,
 481             PCMK_XA_EPOCH,
 482             PCMK_XA_NUM_UPDATES,
 483         };
 484 
 485         /* It's not obvious that join_ack->xml is the PCMK__XE_GENERATION_TUPLE
 486          * element from the join client. The "if" guard is for clarity.
 487          */
 488         if (pcmk__xe_is(generation, PCMK__XE_GENERATION_TUPLE)) {
 489             for (lpc = 0; cmp == 0 && lpc < PCMK__NELEM(attributes); lpc++) {
 490                 cmp = compare_int_fields(max_generation_xml, generation,
 491                                          attributes[lpc]);
 492             }
 493 
 494         } else {    // Should always be PCMK__XE_GENERATION_TUPLE
 495             CRM_LOG_ASSERT(false);
 496         }
 497     }
 498 
 499     if (ref == NULL) {
 500         ref = "none"; // for logging only
 501     }
 502 
 503     if (lookup_failed_sync_node(join_from, &value) == pcmk_rc_ok) {
 504         crm_err("Rejecting join-%d request from node %s because we failed to "
 505                 "sync its CIB in join-%d " QB_XS " ref=%s",
 506                 join_id, join_from, value, ref);
 507         ack_nack_bool = FALSE;
 508 
 509     } else if (!pcmk__cluster_is_node_active(join_node)) {
 510         if (match_down_event(join_from) != NULL) {
 511             /* The join request was received after the node was fenced or
 512              * otherwise shutdown in a way that we're aware of. No need to log
 513              * an error in this rare occurrence; we know the client was recently
 514              * shut down, and receiving a lingering in-flight request is not
 515              * cause for alarm.
 516              */
 517             crm_debug("Rejecting join-%d request from inactive node %s "
 518                       QB_XS " ref=%s", join_id, join_from, ref);
 519         } else {
 520             crm_err("Rejecting join-%d request from inactive node %s "
 521                     QB_XS " ref=%s", join_id, join_from, ref);
 522         }
 523         ack_nack_bool = FALSE;
 524 
 525     } else if (generation == NULL) {
 526         crm_err("Rejecting invalid join-%d request from node %s "
 527                 "missing CIB generation " QB_XS " ref=%s",
 528                 join_id, join_from, ref);
 529         ack_nack_bool = FALSE;
 530 
 531     } else if ((join_version == NULL)
 532                || !feature_set_compatible(CRM_FEATURE_SET, join_version)) {
 533         crm_err("Rejecting join-%d request from node %s because feature set %s"
 534                 " is incompatible with ours (%s) " QB_XS " ref=%s",
 535                 join_id, join_from, (join_version? join_version : "pre-3.1.0"),
 536                 CRM_FEATURE_SET, ref);
 537         ack_nack_bool = FALSE;
 538 
 539     } else if (max_generation_xml == NULL) {
 540         const char *validation = crm_element_value(generation,
 541                                                    PCMK_XA_VALIDATE_WITH);
 542 
 543         if (pcmk__get_schema(validation) == NULL) {
 544             crm_err("Rejecting join-%d request from %s (with first CIB "
 545                     "generation) due to %s schema version %s " QB_XS " ref=%s",
 546                     join_id, join_from,
 547                     ((validation == NULL)? "missing" : "unknown"),
 548                     pcmk__s(validation, ""), ref);
 549             ack_nack_bool = FALSE;
 550 
 551         } else {
 552             crm_debug("Accepting join-%d request from %s (with first CIB "
 553                       "generation) " QB_XS " ref=%s",
 554                       join_id, join_from, ref);
 555             max_generation_xml = pcmk__xml_copy(NULL, generation);
 556             pcmk__str_update(&max_generation_from, join_from);
 557         }
 558 
 559     } else if ((cmp < 0)
 560                || ((cmp == 0) && controld_is_local_node(join_from))) {
 561         const char *validation = crm_element_value(generation,
 562                                                    PCMK_XA_VALIDATE_WITH);
 563 
 564         if (pcmk__get_schema(validation) == NULL) {
 565             crm_err("Rejecting join-%d request from %s (with better CIB "
 566                     "generation than current best from %s) due to %s "
 567                     "schema version %s " QB_XS " ref=%s",
 568                     join_id, join_from, max_generation_from,
 569                     ((validation == NULL)? "missing" : "unknown"),
 570                     pcmk__s(validation, ""), ref);
 571             ack_nack_bool = FALSE;
 572 
 573         } else {
 574             crm_debug("Accepting join-%d request from %s (with better CIB "
 575                       "generation than current best from %s) " QB_XS " ref=%s",
 576                       join_id, join_from, max_generation_from, ref);
 577             crm_log_xml_debug(max_generation_xml, "Old max generation");
 578             crm_log_xml_debug(generation, "New max generation");
 579 
 580             pcmk__xml_free(max_generation_xml);
 581             max_generation_xml = pcmk__xml_copy(NULL, join_ack->xml);
 582             pcmk__str_update(&max_generation_from, join_from);
 583         }
 584 
 585     } else {
 586         crm_debug("Accepting join-%d request from %s " QB_XS " ref=%s",
 587                   join_id, join_from, ref);
 588     }
 589 
 590     if (!ack_nack_bool) {
 591         crm_update_peer_join(__func__, join_node, controld_join_nack);
 592         pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_NACK);
 593 
 594     } else {
 595         crm_update_peer_join(__func__, join_node, controld_join_integrated);
 596         pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_MEMBER);
 597     }
 598 
 599     count = crmd_join_phase_count(controld_join_integrated);
 600     crm_debug("%d node%s currently integrated in join-%d",
 601               count, pcmk__plural_s(count), join_id);
 602 
 603     if (check_join_state(cur_state, __func__) == FALSE) {
 604         // Don't waste time by invoking the scheduler yet
 605         count = crmd_join_phase_count(controld_join_welcomed);
 606         crm_debug("Waiting on join-%d requests from %d outstanding node%s",
 607                   join_id, count, pcmk__plural_s(count));
 608     }
 609 }
 610 
 611 /*      A_DC_JOIN_FINALIZE      */
 612 void
 613 do_dc_join_finalize(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
 614                     enum crmd_fsa_cause cause,
 615                     enum crmd_fsa_state cur_state,
 616                     enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 617 {
 618     char *sync_from = NULL;
 619     int rc = pcmk_ok;
 620     int count_welcomed = crmd_join_phase_count(controld_join_welcomed);
 621     int count_finalizable = crmd_join_phase_count(controld_join_integrated)
 622                             + crmd_join_phase_count(controld_join_nack);
 623 
 624     /* This we can do straight away and avoid clients timing us out
 625      *  while we compute the latest CIB
 626      */
 627     if (count_welcomed != 0) {
 628         crm_debug("Waiting on join-%d requests from %d outstanding node%s "
 629                   "before finalizing join", current_join_id, count_welcomed,
 630                   pcmk__plural_s(count_welcomed));
 631         crmd_join_phase_log(LOG_DEBUG);
 632         /* crmd_fsa_stall(FALSE); Needed? */
 633         return;
 634 
 635     } else if (count_finalizable == 0) {
 636         crm_debug("Finalization not needed for join-%d at the current time",
 637                   current_join_id);
 638         crmd_join_phase_log(LOG_DEBUG);
 639         check_join_state(controld_globals.fsa_state, __func__);
 640         return;
 641     }
 642 
 643     controld_clear_fsa_input_flags(R_HAVE_CIB);
 644     if ((max_generation_from == NULL)
 645         || controld_is_local_node(max_generation_from)) {
 646         controld_set_fsa_input_flags(R_HAVE_CIB);
 647     }
 648 
 649     if (!controld_globals.transition_graph->complete) {
 650         crm_warn("Delaying join-%d finalization while transition in progress",
 651                  current_join_id);
 652         crmd_join_phase_log(LOG_DEBUG);
 653         crmd_fsa_stall(FALSE);
 654         return;
 655     }
 656 
 657     if (pcmk_is_set(controld_globals.fsa_input_register, R_HAVE_CIB)) {
 658         // Send our CIB out to everyone
 659         sync_from = pcmk__str_copy(controld_globals.cluster->priv->node_name);
 660     } else {
 661         // Ask for the agreed best CIB
 662         sync_from = pcmk__str_copy(max_generation_from);
 663     }
 664     crm_notice("Finalizing join-%d for %d node%s (sync'ing CIB %s.%s.%s "
 665                "with schema %s and feature set %s from %s)",
 666                current_join_id, count_finalizable,
 667                pcmk__plural_s(count_finalizable),
 668                crm_element_value(max_generation_xml, PCMK_XA_ADMIN_EPOCH),
 669                crm_element_value(max_generation_xml, PCMK_XA_EPOCH),
 670                crm_element_value(max_generation_xml, PCMK_XA_NUM_UPDATES),
 671                crm_element_value(max_generation_xml, PCMK_XA_VALIDATE_WITH),
 672                crm_element_value(max_generation_xml, PCMK_XA_CRM_FEATURE_SET),
 673                sync_from);
 674     crmd_join_phase_log(LOG_DEBUG);
 675 
 676     rc = controld_globals.cib_conn->cmds->sync_from(controld_globals.cib_conn,
 677                                                     sync_from, NULL, cib_none);
 678     fsa_register_cib_callback(rc, sync_from, finalize_sync_callback);
 679 }
 680 
 681 void
 682 free_max_generation(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 683 {
 684     free(max_generation_from);
 685     max_generation_from = NULL;
 686 
 687     pcmk__xml_free(max_generation_xml);
 688     max_generation_xml = NULL;
 689 }
 690 
 691 void
 692 finalize_sync_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 693 {
 694     CRM_LOG_ASSERT(-EPERM != rc);
 695 
 696     if (rc != pcmk_ok) {
 697         const char *sync_from = (const char *) user_data;
 698 
 699         do_crm_log(((rc == -pcmk_err_old_data)? LOG_WARNING : LOG_ERR),
 700                    "Could not sync CIB from %s in join-%d: %s",
 701                    sync_from, current_join_id, pcmk_strerror(rc));
 702 
 703         if (rc != -pcmk_err_old_data) {
 704             record_failed_sync_node(sync_from, current_join_id);
 705         }
 706 
 707         /* restart the whole join process */
 708         register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION_DC, NULL, NULL,
 709                                __func__);
 710 
 711     } else if (!AM_I_DC) {
 712         crm_debug("Sync'ed CIB for join-%d but no longer DC", current_join_id);
 713 
 714     } else if (controld_globals.fsa_state != S_FINALIZE_JOIN) {
 715         crm_debug("Sync'ed CIB for join-%d but no longer in S_FINALIZE_JOIN "
 716                   "(%s)", current_join_id,
 717                   fsa_state2string(controld_globals.fsa_state));
 718 
 719     } else {
 720         controld_set_fsa_input_flags(R_HAVE_CIB);
 721 
 722         /* make sure dc_uuid is re-set to us */
 723         if (!check_join_state(controld_globals.fsa_state, __func__)) {
 724             int count_finalizable = 0;
 725 
 726             count_finalizable = crmd_join_phase_count(controld_join_integrated)
 727                                 + crmd_join_phase_count(controld_join_nack);
 728 
 729             crm_debug("Notifying %d node%s of join-%d results",
 730                       count_finalizable, pcmk__plural_s(count_finalizable),
 731                       current_join_id);
 732             g_hash_table_foreach(pcmk__peer_cache, finalize_join_for, NULL);
 733         }
 734     }
 735 }
 736 
 737 static void
 738 join_node_state_commit_callback(xmlNode *msg, int call_id, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 739                                 xmlNode *output, void *user_data)
 740 {
 741     const char *node = user_data;
 742 
 743     if (rc != pcmk_ok) {
 744         fsa_data_t *msg_data = NULL;    // for register_fsa_error() macro
 745 
 746         crm_crit("join-%d node history update (via CIB call %d) for node %s "
 747                  "failed: %s",
 748                  current_join_id, call_id, node, pcmk_strerror(rc));
 749         crm_log_xml_debug(msg, "failed");
 750         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 751     }
 752 
 753     crm_debug("join-%d node history update (via CIB call %d) for node %s "
 754               "complete",
 755               current_join_id, call_id, node);
 756     check_join_state(controld_globals.fsa_state, __func__);
 757 }
 758 
 759 /*      A_DC_JOIN_PROCESS_ACK   */
 760 void
 761 do_dc_join_ack(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
 762                enum crmd_fsa_cause cause,
 763                enum crmd_fsa_state cur_state,
 764                enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 765 {
 766     int join_id = -1;
 767     ha_msg_input_t *join_ack = fsa_typed_data(fsa_dt_ha_msg);
 768 
 769     const char *op = crm_element_value(join_ack->msg, PCMK__XA_CRM_TASK);
 770     char *join_from = crm_element_value_copy(join_ack->msg, PCMK__XA_SRC);
 771     pcmk__node_status_t *peer = NULL;
 772     enum controld_join_phase phase = controld_join_none;
 773 
 774     enum controld_section_e section = controld_section_lrm;
 775     char *xpath = NULL;
 776     xmlNode *state = join_ack->xml;
 777     xmlNode *execd_state = NULL;
 778 
 779     cib_t *cib = controld_globals.cib_conn;
 780     int rc = pcmk_ok;
 781 
 782     // Sanity checks
 783     if (join_from == NULL) {
 784         crm_warn("Ignoring message received without node identification");
 785         goto done;
 786     }
 787     if (op == NULL) {
 788         crm_warn("Ignoring message received from %s without task", join_from);
 789         goto done;
 790     }
 791 
 792     if (strcmp(op, CRM_OP_JOIN_CONFIRM)) {
 793         crm_debug("Ignoring '%s' message from %s while waiting for '%s'",
 794                   op, join_from, CRM_OP_JOIN_CONFIRM);
 795         goto done;
 796     }
 797 
 798     if (crm_element_value_int(join_ack->msg, PCMK__XA_JOIN_ID, &join_id) != 0) {
 799         crm_warn("Ignoring join confirmation from %s without valid join ID",
 800                  join_from);
 801         goto done;
 802     }
 803 
 804     peer = pcmk__get_node(0, join_from, NULL, pcmk__node_search_cluster_member);
 805     phase = controld_get_join_phase(peer);
 806     if (phase != controld_join_finalized) {
 807         crm_info("Ignoring out-of-sequence join-%d confirmation from %s "
 808                  "(currently %s not %s)",
 809                  join_id, join_from, join_phase_text(phase),
 810                  join_phase_text(controld_join_finalized));
 811         goto done;
 812     }
 813 
 814     if (join_id != current_join_id) {
 815         crm_err("Rejecting join-%d confirmation from %s "
 816                 "because currently on join-%d",
 817                 join_id, join_from, current_join_id);
 818         crm_update_peer_join(__func__, peer, controld_join_nack);
 819         goto done;
 820     }
 821 
 822     crm_update_peer_join(__func__, peer, controld_join_confirmed);
 823 
 824     /* Update CIB with node's current executor state. A new transition will be
 825      * triggered later, when the CIB manager notifies us of the change.
 826      *
 827      * The delete and modify requests are part of an atomic transaction.
 828      */
 829     rc = cib->cmds->init_transaction(cib);
 830     if (rc != pcmk_ok) {
 831         goto done;
 832     }
 833 
 834     // Delete relevant parts of node's current executor state from CIB
 835     if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
 836         section = controld_section_lrm_unlocked;
 837     }
 838     controld_node_state_deletion_strings(join_from, section, &xpath, NULL);
 839 
 840     rc = cib->cmds->remove(cib, xpath, NULL,
 841                            cib_xpath|cib_multiple|cib_transaction);
 842     if (rc != pcmk_ok) {
 843         goto done;
 844     }
 845 
 846     // Update CIB with node's latest known executor state
 847     if (controld_is_local_node(join_from)) {
 848 
 849         // Use the latest possible state if processing our own join ack
 850         execd_state = controld_query_executor_state();
 851 
 852         if (execd_state != NULL) {
 853             crm_debug("Updating local node history for join-%d from query "
 854                       "result",
 855                       current_join_id);
 856             state = execd_state;
 857 
 858         } else {
 859             crm_warn("Updating local node history from join-%d confirmation "
 860                      "because query failed",
 861                      current_join_id);
 862         }
 863 
 864     } else {
 865         crm_debug("Updating node history for %s from join-%d confirmation",
 866                   join_from, current_join_id);
 867     }
 868 
 869     rc = cib->cmds->modify(cib, PCMK_XE_STATUS, state,
 870                            cib_can_create|cib_transaction);
 871     pcmk__xml_free(execd_state);
 872     if (rc != pcmk_ok) {
 873         goto done;
 874     }
 875 
 876     // Commit the transaction
 877     rc = cib->cmds->end_transaction(cib, true, cib_none);
 878     fsa_register_cib_callback(rc, join_from, join_node_state_commit_callback);
 879 
 880     if (rc > 0) {
 881         // join_from will be freed after callback
 882         join_from = NULL;
 883         rc = pcmk_ok;
 884     }
 885 
 886 done:
 887     if (rc != pcmk_ok) {
 888         crm_crit("join-%d node history update for node %s failed: %s",
 889                  current_join_id, join_from, pcmk_strerror(rc));
 890         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 891     }
 892     free(join_from);
 893     free(xpath);
 894 }
 895 
 896 void
 897 finalize_join_for(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 898 {
 899     xmlNode *acknak = NULL;
 900     xmlNode *tmp1 = NULL;
 901     pcmk__node_status_t *join_node = value;
 902     const char *join_to = join_node->name;
 903     enum controld_join_phase phase = controld_get_join_phase(join_node);
 904     bool integrated = false;
 905 
 906     switch (phase) {
 907         case controld_join_integrated:
 908             integrated = true;
 909             break;
 910         case controld_join_nack:
 911             break;
 912         default:
 913             crm_trace("Not updating non-integrated and non-nacked node %s (%s) "
 914                       "for join-%d",
 915                       join_to, join_phase_text(phase), current_join_id);
 916             return;
 917     }
 918 
 919     /* Update the <node> element with the node's name and UUID, in case they
 920      * weren't known before
 921      */
 922     crm_trace("Updating node name and UUID in CIB for %s", join_to);
 923     tmp1 = pcmk__xe_create(NULL, PCMK_XE_NODE);
 924     crm_xml_add(tmp1, PCMK_XA_ID, pcmk__cluster_get_xml_id(join_node));
 925     crm_xml_add(tmp1, PCMK_XA_UNAME, join_to);
 926     fsa_cib_anon_update(PCMK_XE_NODES, tmp1);
 927     pcmk__xml_free(tmp1);
 928 
 929     join_node = pcmk__get_node(0, join_to, NULL,
 930                                pcmk__node_search_cluster_member);
 931     if (!pcmk__cluster_is_node_active(join_node)) {
 932         /*
 933          * NACK'ing nodes that the membership layer doesn't know about yet
 934          * simply creates more churn
 935          *
 936          * Better to leave them waiting and let the join restart when
 937          * the new membership event comes in
 938          *
 939          * All other NACKs (due to versions etc) should still be processed
 940          */
 941         pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_PENDING);
 942         return;
 943     }
 944 
 945     // Acknowledge or nack node's join request
 946     crm_debug("%sing join-%d request from %s",
 947               integrated? "Acknowledg" : "Nack", current_join_id, join_to);
 948     acknak = create_dc_message(CRM_OP_JOIN_ACKNAK, join_to);
 949     pcmk__xe_set_bool_attr(acknak, CRM_OP_JOIN_ACKNAK, integrated);
 950 
 951     if (integrated) {
 952         // No change needed for a nacked node
 953         crm_update_peer_join(__func__, join_node, controld_join_finalized);
 954         pcmk__update_peer_expected(__func__, join_node, CRMD_JOINSTATE_MEMBER);
 955 
 956         /* Iterate through the remote peer cache and add information on which
 957          * node hosts each to the ACK message.  This keeps new controllers in
 958          * sync with what has already happened.
 959          */
 960         if (pcmk__cluster_num_remote_nodes() > 0) {
 961             GHashTableIter iter;
 962             pcmk__node_status_t *node = NULL;
 963             xmlNode *remotes = pcmk__xe_create(acknak, PCMK_XE_NODES);
 964 
 965             g_hash_table_iter_init(&iter, pcmk__remote_peer_cache);
 966             while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 967                 xmlNode *remote = NULL;
 968 
 969                 if (!node->conn_host) {
 970                     continue;
 971                 }
 972 
 973                 remote = pcmk__xe_create(remotes, PCMK_XE_NODE);
 974                 pcmk__xe_set_props(remote,
 975                                    PCMK_XA_ID, node->name,
 976                                    PCMK__XA_NODE_STATE, node->state,
 977                                    PCMK__XA_CONNECTION_HOST, node->conn_host,
 978                                    NULL);
 979             }
 980         }
 981     }
 982     pcmk__cluster_send_message(join_node, pcmk_ipc_controld, acknak);
 983     pcmk__xml_free(acknak);
 984     return;
 985 }
 986 
 987 gboolean
 988 check_join_state(enum crmd_fsa_state cur_state, const char *source)
     /* [previous][next][first][last][top][bottom][index][help] */
 989 {
 990     static unsigned long long highest_seq = 0;
 991 
 992     if (controld_globals.membership_id != controld_globals.peer_seq) {
 993         crm_debug("join-%d: Membership changed from %llu to %llu "
 994                   QB_XS " highest=%llu state=%s for=%s",
 995                   current_join_id, controld_globals.membership_id,
 996                   controld_globals.peer_seq, highest_seq,
 997                   fsa_state2string(cur_state), source);
 998         if (highest_seq < controld_globals.peer_seq) {
 999             /* Don't spam the FSA with duplicates */
1000             highest_seq = controld_globals.peer_seq;
1001             register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL);
1002         }
1003 
1004     } else if (cur_state == S_INTEGRATION) {
1005         if (crmd_join_phase_count(controld_join_welcomed) == 0) {
1006             int count = crmd_join_phase_count(controld_join_integrated);
1007 
1008             crm_debug("join-%d: Integration of %d peer%s complete "
1009                       QB_XS " state=%s for=%s",
1010                       current_join_id, count, pcmk__plural_s(count),
1011                       fsa_state2string(cur_state), source);
1012             register_fsa_input_before(C_FSA_INTERNAL, I_INTEGRATED, NULL);
1013             return TRUE;
1014         }
1015 
1016     } else if (cur_state == S_FINALIZE_JOIN) {
1017         if (!pcmk_is_set(controld_globals.fsa_input_register, R_HAVE_CIB)) {
1018             crm_debug("join-%d: Delaying finalization until we have CIB "
1019                       QB_XS " state=%s for=%s",
1020                       current_join_id, fsa_state2string(cur_state), source);
1021             return TRUE;
1022 
1023         } else if (crmd_join_phase_count(controld_join_welcomed) != 0) {
1024             int count = crmd_join_phase_count(controld_join_welcomed);
1025 
1026             crm_debug("join-%d: Still waiting on %d welcomed node%s "
1027                       QB_XS " state=%s for=%s",
1028                       current_join_id, count, pcmk__plural_s(count),
1029                       fsa_state2string(cur_state), source);
1030             crmd_join_phase_log(LOG_DEBUG);
1031 
1032         } else if (crmd_join_phase_count(controld_join_integrated) != 0) {
1033             int count = crmd_join_phase_count(controld_join_integrated);
1034 
1035             crm_debug("join-%d: Still waiting on %d integrated node%s "
1036                       QB_XS " state=%s for=%s",
1037                       current_join_id, count, pcmk__plural_s(count),
1038                       fsa_state2string(cur_state), source);
1039             crmd_join_phase_log(LOG_DEBUG);
1040 
1041         } else if (crmd_join_phase_count(controld_join_finalized) != 0) {
1042             int count = crmd_join_phase_count(controld_join_finalized);
1043 
1044             crm_debug("join-%d: Still waiting on %d finalized node%s "
1045                       QB_XS " state=%s for=%s",
1046                       current_join_id, count, pcmk__plural_s(count),
1047                       fsa_state2string(cur_state), source);
1048             crmd_join_phase_log(LOG_DEBUG);
1049 
1050         } else {
1051             crm_debug("join-%d: Complete " QB_XS " state=%s for=%s",
1052                       current_join_id, fsa_state2string(cur_state), source);
1053             register_fsa_input_later(C_FSA_INTERNAL, I_FINALIZED, NULL);
1054             return TRUE;
1055         }
1056     }
1057 
1058     return FALSE;
1059 }
1060 
1061 void
1062 do_dc_join_final(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
1063                  enum crmd_fsa_cause cause,
1064                  enum crmd_fsa_state cur_state,
1065                  enum crmd_fsa_input current_input, fsa_data_t * msg_data)
1066 {
1067     crm_debug("Ensuring DC, quorum and node attributes are up-to-date");
1068     crm_update_quorum(pcmk__cluster_has_quorum(), TRUE);
1069 }
1070 
1071 int crmd_join_phase_count(enum controld_join_phase phase)
     /* [previous][next][first][last][top][bottom][index][help] */
1072 {
1073     int count = 0;
1074     pcmk__node_status_t *peer;
1075     GHashTableIter iter;
1076 
1077     g_hash_table_iter_init(&iter, pcmk__peer_cache);
1078     while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) {
1079         if (controld_get_join_phase(peer) == phase) {
1080             count++;
1081         }
1082     }
1083     return count;
1084 }
1085 
1086 void crmd_join_phase_log(int level)
     /* [previous][next][first][last][top][bottom][index][help] */
1087 {
1088     pcmk__node_status_t *peer;
1089     GHashTableIter iter;
1090 
1091     g_hash_table_iter_init(&iter, pcmk__peer_cache);
1092     while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &peer)) {
1093         do_crm_log(level, "join-%d: %s=%s", current_join_id, peer->name,
1094                    join_phase_text(controld_get_join_phase(peer)));
1095     }
1096 }

/* [previous][next][first][last][top][bottom][index][help] */