root/daemons/controld/controld_membership.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. reap_dead_nodes
  2. post_cache_update
  3. crmd_node_update_complete
  4. create_node_state_update
  5. remove_conflicting_node_callback
  6. search_conflicting_node_callback
  7. node_list_update_callback
  8. populate_cib_nodes
  9. cib_quorum_update_complete
  10. crm_update_quorum

   1 /*
   2  * Copyright 2004-2024 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 /* put these first so that uuid_t is defined without conflicts */
  11 #include <crm_internal.h>
  12 
  13 #include <string.h>
  14 
  15 #include <crm/crm.h>
  16 #include <crm/common/xml.h>
  17 #include <crm/common/xml_internal.h>
  18 #include <crm/cluster/internal.h>
  19 
  20 #include <pacemaker-controld.h>
  21 
  22 void post_cache_update(int instance);
  23 
  24 extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source);
  25 
  26 static void
  27 reap_dead_nodes(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  28 {
  29     crm_node_t *node = value;
  30 
  31     if (!pcmk__cluster_is_node_active(node)) {
  32         crm_update_peer_join(__func__, node, crm_join_none);
  33 
  34         if(node && node->uname) {
  35             if (pcmk__str_eq(controld_globals.our_nodename, node->uname,
  36                              pcmk__str_casei)) {
  37                 crm_err("We're not part of the cluster anymore");
  38                 register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);
  39 
  40             } else if (!AM_I_DC
  41                        && pcmk__str_eq(node->uname, controld_globals.dc_name,
  42                                        pcmk__str_casei)) {
  43                 crm_warn("Our DC node (%s) left the cluster", node->uname);
  44                 register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL);
  45             }
  46         }
  47 
  48         if ((controld_globals.fsa_state == S_INTEGRATION)
  49             || (controld_globals.fsa_state == S_FINALIZE_JOIN)) {
  50             check_join_state(controld_globals.fsa_state, __func__);
  51         }
  52         if ((node != NULL) && (node->uuid != NULL)) {
  53             fail_incompletable_actions(controld_globals.transition_graph,
  54                                        node->uuid);
  55         }
  56     }
  57 }
  58 
  59 void
  60 post_cache_update(int instance)
     /* [previous][next][first][last][top][bottom][index][help] */
  61 {
  62     xmlNode *no_op = NULL;
  63 
  64     crm_peer_seq = instance;
  65     crm_debug("Updated cache after membership event %d.", instance);
  66 
  67     g_hash_table_foreach(crm_peer_cache, reap_dead_nodes, NULL);
  68     controld_set_fsa_input_flags(R_MEMBERSHIP);
  69 
  70     if (AM_I_DC) {
  71         populate_cib_nodes(node_update_quick | node_update_cluster | node_update_peer |
  72                            node_update_expected, __func__);
  73     }
  74 
  75     /*
  76      * If we lost nodes, we should re-check the election status
  77      * Safe to call outside of an election
  78      */
  79     controld_set_fsa_action_flags(A_ELECTION_CHECK);
  80     controld_trigger_fsa();
  81 
  82     /* Membership changed, remind everyone we're here.
  83      * This will aid detection of duplicate DCs
  84      */
  85     no_op = create_request(CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD,
  86                            AM_I_DC ? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD, NULL);
  87     pcmk__cluster_send_message(NULL, crm_msg_crmd, no_op);
  88     free_xml(no_op);
  89 }
  90 
  91 static void
  92 crmd_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  93 {
  94     fsa_data_t *msg_data = NULL;
  95 
  96     if (rc == pcmk_ok) {
  97         crm_trace("Node update %d complete", call_id);
  98 
  99     } else if(call_id < pcmk_ok) {
 100         crm_err("Node update failed: %s (%d)", pcmk_strerror(call_id), call_id);
 101         crm_log_xml_debug(msg, "failed");
 102         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 103 
 104     } else {
 105         crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 106         crm_log_xml_debug(msg, "failed");
 107         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 108     }
 109 }
 110 
 111 /*!
 112  * \internal
 113  * \brief Create an XML node state tag with updates
 114  *
 115  * \param[in,out] node    Node whose state will be used for update
 116  * \param[in]     flags   Bitmask of node_update_flags indicating what to update
 117  * \param[in,out] parent  XML node to contain update (or NULL)
 118  * \param[in]     source  Who requested the update (only used for logging)
 119  *
 120  * \return Pointer to created node state tag
 121  */
 122 xmlNode *
 123 create_node_state_update(crm_node_t *node, int flags, xmlNode *parent,
     /* [previous][next][first][last][top][bottom][index][help] */
 124                          const char *source)
 125 {
 126     const char *value = NULL;
 127     xmlNode *node_state;
 128 
 129     if (!node->state) {
 130         crm_info("Node update for %s cancelled: no state, not seen yet", node->uname);
 131        return NULL;
 132     }
 133 
 134     node_state = pcmk__xe_create(parent, PCMK__XE_NODE_STATE);
 135 
 136     if (pcmk_is_set(node->flags, crm_remote_node)) {
 137         pcmk__xe_set_bool_attr(node_state, PCMK_XA_REMOTE_NODE, true);
 138     }
 139 
 140     if (crm_xml_add(node_state, PCMK_XA_ID,
 141                     pcmk__cluster_node_uuid(node)) == NULL) {
 142         crm_info("Node update for %s cancelled: no ID", node->uname);
 143         free_xml(node_state);
 144         return NULL;
 145     }
 146 
 147     crm_xml_add(node_state, PCMK_XA_UNAME, node->uname);
 148 
 149     if ((flags & node_update_cluster) && node->state) {
 150         if (compare_version(controld_globals.dc_version, "3.18.0") >= 0) {
 151             // A value 0 means the node is not a cluster member.
 152             crm_xml_add_ll(node_state, PCMK__XA_IN_CCM, node->when_member);
 153 
 154         } else {
 155             pcmk__xe_set_bool_attr(node_state, PCMK__XA_IN_CCM,
 156                                    pcmk__str_eq(node->state, CRM_NODE_MEMBER,
 157                                                 pcmk__str_casei));
 158         }
 159     }
 160 
 161     if (!pcmk_is_set(node->flags, crm_remote_node)) {
 162         if (flags & node_update_peer) {
 163             if (compare_version(controld_globals.dc_version, "3.18.0") >= 0) {
 164                 // A value 0 means the peer is offline in CPG.
 165                 crm_xml_add_ll(node_state, PCMK_XA_CRMD, node->when_online);
 166 
 167             } else {
 168                 // @COMPAT DCs < 2.1.7 use online/offline rather than timestamp
 169                 value = PCMK_VALUE_OFFLINE;
 170                 if (pcmk_is_set(node->processes, crm_get_cluster_proc())) {
 171                     value = PCMK_VALUE_ONLINE;
 172                 }
 173                 crm_xml_add(node_state, PCMK_XA_CRMD, value);
 174             }
 175         }
 176 
 177         if (flags & node_update_join) {
 178             if (node->join <= crm_join_none) {
 179                 value = CRMD_JOINSTATE_DOWN;
 180             } else {
 181                 value = CRMD_JOINSTATE_MEMBER;
 182             }
 183             crm_xml_add(node_state, PCMK__XA_JOIN, value);
 184         }
 185 
 186         if (flags & node_update_expected) {
 187             crm_xml_add(node_state, PCMK_XA_EXPECTED, node->expected);
 188         }
 189     }
 190 
 191     crm_xml_add(node_state, PCMK_XA_CRM_DEBUG_ORIGIN, source);
 192 
 193     return node_state;
 194 }
 195 
 196 static void
 197 remove_conflicting_node_callback(xmlNode * msg, int call_id, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 198                                  xmlNode * output, void *user_data)
 199 {
 200     char *node_uuid = user_data;
 201 
 202     do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE,
 203                         "Deletion of the unknown conflicting node \"%s\": %s (rc=%d)",
 204                         node_uuid, pcmk_strerror(rc), rc);
 205 }
 206 
 207 static void
 208 search_conflicting_node_callback(xmlNode * msg, int call_id, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 209                                  xmlNode * output, void *user_data)
 210 {
 211     char *new_node_uuid = user_data;
 212     xmlNode *node_xml = NULL;
 213 
 214     if (rc != pcmk_ok) {
 215         if (rc != -ENXIO) {
 216             crm_notice("Searching conflicting nodes for %s failed: %s (%d)",
 217                        new_node_uuid, pcmk_strerror(rc), rc);
 218         }
 219         return;
 220 
 221     } else if (output == NULL) {
 222         return;
 223     }
 224 
 225     if (pcmk__xe_is(output, PCMK_XE_NODE)) {
 226         node_xml = output;
 227 
 228     } else {
 229         node_xml = pcmk__xe_first_child(output, PCMK_XE_NODE, NULL, NULL);
 230     }
 231 
 232     for (; node_xml != NULL; node_xml = pcmk__xe_next_same(node_xml)) {
 233         const char *node_uuid = NULL;
 234         const char *node_uname = NULL;
 235         GHashTableIter iter;
 236         crm_node_t *node = NULL;
 237         gboolean known = FALSE;
 238 
 239         node_uuid = crm_element_value(node_xml, PCMK_XA_ID);
 240         node_uname = crm_element_value(node_xml, PCMK_XA_UNAME);
 241 
 242         if (node_uuid == NULL || node_uname == NULL) {
 243             continue;
 244         }
 245 
 246         g_hash_table_iter_init(&iter, crm_peer_cache);
 247         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 248             if (node->uuid
 249                 && pcmk__str_eq(node->uuid, node_uuid, pcmk__str_casei)
 250                 && node->uname
 251                 && pcmk__str_eq(node->uname, node_uname, pcmk__str_casei)) {
 252 
 253                 known = TRUE;
 254                 break;
 255             }
 256         }
 257 
 258         if (known == FALSE) {
 259             cib_t *cib_conn = controld_globals.cib_conn;
 260             int delete_call_id = 0;
 261             xmlNode *node_state_xml = NULL;
 262 
 263             crm_notice("Deleting unknown node %s/%s which has conflicting uname with %s",
 264                        node_uuid, node_uname, new_node_uuid);
 265 
 266             delete_call_id = cib_conn->cmds->remove(cib_conn, PCMK_XE_NODES,
 267                                                     node_xml, cib_scope_local);
 268             fsa_register_cib_callback(delete_call_id, pcmk__str_copy(node_uuid),
 269                                       remove_conflicting_node_callback);
 270 
 271             node_state_xml = pcmk__xe_create(NULL, PCMK__XE_NODE_STATE);
 272             crm_xml_add(node_state_xml, PCMK_XA_ID, node_uuid);
 273             crm_xml_add(node_state_xml, PCMK_XA_UNAME, node_uname);
 274 
 275             delete_call_id = cib_conn->cmds->remove(cib_conn, PCMK_XE_STATUS,
 276                                                     node_state_xml,
 277                                                     cib_scope_local);
 278             fsa_register_cib_callback(delete_call_id, pcmk__str_copy(node_uuid),
 279                                       remove_conflicting_node_callback);
 280             free_xml(node_state_xml);
 281         }
 282     }
 283 }
 284 
 285 static void
 286 node_list_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 287 {
 288     fsa_data_t *msg_data = NULL;
 289 
 290     if(call_id < pcmk_ok) {
 291         crm_err("Node list update failed: %s (%d)", pcmk_strerror(call_id), call_id);
 292         crm_log_xml_debug(msg, "update:failed");
 293         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 294 
 295     } else if(rc < pcmk_ok) {
 296         crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 297         crm_log_xml_debug(msg, "update:failed");
 298         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 299     }
 300 }
 301 
 302 void
 303 populate_cib_nodes(enum node_update_flags flags, const char *source)
     /* [previous][next][first][last][top][bottom][index][help] */
 304 {
 305     cib_t *cib_conn = controld_globals.cib_conn;
 306 
 307     int call_id = 0;
 308     gboolean from_hashtable = TRUE;
 309     xmlNode *node_list = pcmk__xe_create(NULL, PCMK_XE_NODES);
 310 
 311 #if SUPPORT_COROSYNC
 312     if (!pcmk_is_set(flags, node_update_quick)
 313         && (pcmk_get_cluster_layer() == pcmk_cluster_layer_corosync)) {
 314 
 315         from_hashtable = pcmk__corosync_add_nodes(node_list);
 316     }
 317 #endif
 318 
 319     if (from_hashtable) {
 320         GHashTableIter iter;
 321         crm_node_t *node = NULL;
 322         GString *xpath = NULL;
 323 
 324         g_hash_table_iter_init(&iter, crm_peer_cache);
 325         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 326             xmlNode *new_node = NULL;
 327 
 328             if ((node->uuid != NULL) && (node->uname != NULL)) {
 329                 crm_trace("Creating node entry for %s/%s", node->uname, node->uuid);
 330                 if (xpath == NULL) {
 331                     xpath = g_string_sized_new(512);
 332                 } else {
 333                     g_string_truncate(xpath, 0);
 334                 }
 335 
 336                 /* We need both to be valid */
 337                 new_node = pcmk__xe_create(node_list, PCMK_XE_NODE);
 338                 crm_xml_add(new_node, PCMK_XA_ID, node->uuid);
 339                 crm_xml_add(new_node, PCMK_XA_UNAME, node->uname);
 340 
 341                 /* Search and remove unknown nodes with the conflicting uname from CIB */
 342                 pcmk__g_strcat(xpath,
 343                                "/" PCMK_XE_CIB "/" PCMK_XE_CONFIGURATION
 344                                "/" PCMK_XE_NODES "/" PCMK_XE_NODE
 345                                "[@" PCMK_XA_UNAME "='", node->uname, "']"
 346                                "[@" PCMK_XA_ID "!='", node->uuid, "']", NULL);
 347 
 348                 call_id = cib_conn->cmds->query(cib_conn,
 349                                                 (const char *) xpath->str,
 350                                                 NULL,
 351                                                 cib_scope_local|cib_xpath);
 352                 fsa_register_cib_callback(call_id, pcmk__str_copy(node->uuid),
 353                                           search_conflicting_node_callback);
 354             }
 355         }
 356 
 357         if (xpath != NULL) {
 358             g_string_free(xpath, TRUE);
 359         }
 360     }
 361 
 362     crm_trace("Populating <nodes> section from %s", from_hashtable ? "hashtable" : "cluster");
 363 
 364     if ((controld_update_cib(PCMK_XE_NODES, node_list, cib_scope_local,
 365                              node_list_update_callback) == pcmk_rc_ok)
 366          && (crm_peer_cache != NULL) && AM_I_DC) {
 367         /*
 368          * There is no need to update the local CIB with our values if
 369          * we've not seen valid membership data
 370          */
 371         GHashTableIter iter;
 372         crm_node_t *node = NULL;
 373 
 374         free_xml(node_list);
 375         node_list = pcmk__xe_create(NULL, PCMK_XE_STATUS);
 376 
 377         g_hash_table_iter_init(&iter, crm_peer_cache);
 378         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 379             create_node_state_update(node, flags, node_list, source);
 380         }
 381 
 382         if (crm_remote_peer_cache) {
 383             g_hash_table_iter_init(&iter, crm_remote_peer_cache);
 384             while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 385                 create_node_state_update(node, flags, node_list, source);
 386             }
 387         }
 388 
 389         controld_update_cib(PCMK_XE_STATUS, node_list, cib_scope_local,
 390                             crmd_node_update_complete);
 391     }
 392     free_xml(node_list);
 393 }
 394 
 395 static void
 396 cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 397 {
 398     fsa_data_t *msg_data = NULL;
 399 
 400     if (rc == pcmk_ok) {
 401         crm_trace("Quorum update %d complete", call_id);
 402 
 403     } else {
 404         crm_err("Quorum update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 405         crm_log_xml_debug(msg, "failed");
 406         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 407     }
 408 }
 409 
 410 void
 411 crm_update_quorum(gboolean quorum, gboolean force_update)
     /* [previous][next][first][last][top][bottom][index][help] */
 412 {
 413     bool has_quorum = pcmk_is_set(controld_globals.flags, controld_has_quorum);
 414 
 415     if (quorum) {
 416         controld_set_global_flags(controld_ever_had_quorum);
 417 
 418     } else if (pcmk_all_flags_set(controld_globals.flags,
 419                                   controld_ever_had_quorum
 420                                   |controld_no_quorum_panic)) {
 421         pcmk__panic(__func__);
 422     }
 423 
 424     if (AM_I_DC
 425         && ((has_quorum && !quorum) || (!has_quorum && quorum)
 426             || force_update)) {
 427         xmlNode *update = NULL;
 428 
 429         update = pcmk__xe_create(NULL, PCMK_XE_CIB);
 430         crm_xml_add_int(update, PCMK_XA_HAVE_QUORUM, quorum);
 431         crm_xml_add(update, PCMK_XA_DC_UUID, controld_globals.our_uuid);
 432 
 433         crm_debug("Updating quorum status to %s", pcmk__btoa(quorum));
 434         controld_update_cib(PCMK_XE_CIB, update, cib_scope_local,
 435                             cib_quorum_update_complete);
 436         free_xml(update);
 437 
 438         /* Quorum changes usually cause a new transition via other activity:
 439          * quorum gained via a node joining will abort via the node join,
 440          * and quorum lost via a node leaving will usually abort via resource
 441          * activity and/or fencing.
 442          *
 443          * However, it is possible that nothing else causes a transition (e.g.
 444          * someone forces quorum via corosync-cmaptcl, or quorum is lost due to
 445          * a node in standby shutting down cleanly), so here ensure a new
 446          * transition is triggered.
 447          */
 448         if (quorum) {
 449             /* If quorum was gained, abort after a short delay, in case multiple
 450              * nodes are joining around the same time, so the one that brings us
 451              * to quorum doesn't cause all the remaining ones to be fenced.
 452              */
 453             abort_after_delay(PCMK_SCORE_INFINITY, pcmk__graph_restart,
 454                               "Quorum gained", 5000);
 455         } else {
 456             abort_transition(PCMK_SCORE_INFINITY, pcmk__graph_restart,
 457                              "Quorum lost", NULL);
 458         }
 459     }
 460 
 461     if (quorum) {
 462         controld_set_global_flags(controld_has_quorum);
 463     } else {
 464         controld_clear_global_flags(controld_has_quorum);
 465     }
 466 }

/* [previous][next][first][last][top][bottom][index][help] */