root/daemons/controld/controld_membership.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. reap_dead_nodes
  2. post_cache_update
  3. crmd_node_update_complete
  4. create_node_state_update
  5. remove_conflicting_node_callback
  6. search_conflicting_node_callback
  7. node_list_update_callback
  8. populate_cib_nodes
  9. cib_quorum_update_complete
  10. crm_update_quorum

   1 /*
   2  * Copyright 2004-2023 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 /* put these first so that uuid_t is defined without conflicts */
  11 #include <crm_internal.h>
  12 
  13 #include <string.h>
  14 
  15 #include <crm/crm.h>
  16 #include <crm/msg_xml.h>
  17 #include <crm/common/xml.h>
  18 #include <crm/common/xml_internal.h>
  19 #include <crm/cluster/internal.h>
  20 
  21 #include <pacemaker-controld.h>
  22 
  23 void post_cache_update(int instance);
  24 
  25 extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source);
  26 
  27 static void
  28 reap_dead_nodes(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  29 {
  30     crm_node_t *node = value;
  31 
  32     if (crm_is_peer_active(node) == FALSE) {
  33         crm_update_peer_join(__func__, node, crm_join_none);
  34 
  35         if(node && node->uname) {
  36             if (pcmk__str_eq(controld_globals.our_nodename, node->uname,
  37                              pcmk__str_casei)) {
  38                 crm_err("We're not part of the cluster anymore");
  39                 register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);
  40 
  41             } else if (!AM_I_DC
  42                        && pcmk__str_eq(node->uname, controld_globals.dc_name,
  43                                        pcmk__str_casei)) {
  44                 crm_warn("Our DC node (%s) left the cluster", node->uname);
  45                 register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL);
  46             }
  47         }
  48 
  49         if ((controld_globals.fsa_state == S_INTEGRATION)
  50             || (controld_globals.fsa_state == S_FINALIZE_JOIN)) {
  51             check_join_state(controld_globals.fsa_state, __func__);
  52         }
  53         if ((node != NULL) && (node->uuid != NULL)) {
  54             fail_incompletable_actions(controld_globals.transition_graph,
  55                                        node->uuid);
  56         }
  57     }
  58 }
  59 
  60 void
  61 post_cache_update(int instance)
     /* [previous][next][first][last][top][bottom][index][help] */
  62 {
  63     xmlNode *no_op = NULL;
  64 
  65     crm_peer_seq = instance;
  66     crm_debug("Updated cache after membership event %d.", instance);
  67 
  68     g_hash_table_foreach(crm_peer_cache, reap_dead_nodes, NULL);
  69     controld_set_fsa_input_flags(R_MEMBERSHIP);
  70 
  71     if (AM_I_DC) {
  72         populate_cib_nodes(node_update_quick | node_update_cluster | node_update_peer |
  73                            node_update_expected, __func__);
  74     }
  75 
  76     /*
  77      * If we lost nodes, we should re-check the election status
  78      * Safe to call outside of an election
  79      */
  80     controld_set_fsa_action_flags(A_ELECTION_CHECK);
  81     controld_trigger_fsa();
  82 
  83     /* Membership changed, remind everyone we're here.
  84      * This will aid detection of duplicate DCs
  85      */
  86     no_op = create_request(CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD,
  87                            AM_I_DC ? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD, NULL);
  88     send_cluster_message(NULL, crm_msg_crmd, no_op, FALSE);
  89     free_xml(no_op);
  90 }
  91 
  92 static void
  93 crmd_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  94 {
  95     fsa_data_t *msg_data = NULL;
  96 
  97     if (rc == pcmk_ok) {
  98         crm_trace("Node update %d complete", call_id);
  99 
 100     } else if(call_id < pcmk_ok) {
 101         crm_err("Node update failed: %s (%d)", pcmk_strerror(call_id), call_id);
 102         crm_log_xml_debug(msg, "failed");
 103         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 104 
 105     } else {
 106         crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 107         crm_log_xml_debug(msg, "failed");
 108         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 109     }
 110 }
 111 
 112 /*!
 113  * \internal
 114  * \brief Create an XML node state tag with updates
 115  *
 116  * \param[in,out] node    Node whose state will be used for update
 117  * \param[in]     flags   Bitmask of node_update_flags indicating what to update
 118  * \param[in,out] parent  XML node to contain update (or NULL)
 119  * \param[in]     source  Who requested the update (only used for logging)
 120  *
 121  * \return Pointer to created node state tag
 122  */
 123 xmlNode *
 124 create_node_state_update(crm_node_t *node, int flags, xmlNode *parent,
     /* [previous][next][first][last][top][bottom][index][help] */
 125                          const char *source)
 126 {
 127     const char *value = NULL;
 128     xmlNode *node_state;
 129 
 130     if (!node->state) {
 131         crm_info("Node update for %s cancelled: no state, not seen yet", node->uname);
 132        return NULL;
 133     }
 134 
 135     node_state = create_xml_node(parent, XML_CIB_TAG_STATE);
 136 
 137     if (pcmk_is_set(node->flags, crm_remote_node)) {
 138         pcmk__xe_set_bool_attr(node_state, XML_NODE_IS_REMOTE, true);
 139     }
 140 
 141     set_uuid(node_state, XML_ATTR_ID, node);
 142 
 143     if (crm_element_value(node_state, XML_ATTR_ID) == NULL) {
 144         crm_info("Node update for %s cancelled: no id", node->uname);
 145         free_xml(node_state);
 146         return NULL;
 147     }
 148 
 149     crm_xml_add(node_state, XML_ATTR_UNAME, node->uname);
 150 
 151     if ((flags & node_update_cluster) && node->state) {
 152         pcmk__xe_set_bool_attr(node_state, XML_NODE_IN_CLUSTER,
 153                                pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei));
 154     }
 155 
 156     if (!pcmk_is_set(node->flags, crm_remote_node)) {
 157         if (flags & node_update_peer) {
 158             value = OFFLINESTATUS;
 159             if (pcmk_is_set(node->processes, crm_get_cluster_proc())) {
 160                 value = ONLINESTATUS;
 161             }
 162             crm_xml_add(node_state, XML_NODE_IS_PEER, value);
 163         }
 164 
 165         if (flags & node_update_join) {
 166             if (node->join <= crm_join_none) {
 167                 value = CRMD_JOINSTATE_DOWN;
 168             } else {
 169                 value = CRMD_JOINSTATE_MEMBER;
 170             }
 171             crm_xml_add(node_state, XML_NODE_JOIN_STATE, value);
 172         }
 173 
 174         if (flags & node_update_expected) {
 175             crm_xml_add(node_state, XML_NODE_EXPECTED, node->expected);
 176         }
 177     }
 178 
 179     crm_xml_add(node_state, XML_ATTR_ORIGIN, source);
 180 
 181     return node_state;
 182 }
 183 
 184 static void
 185 remove_conflicting_node_callback(xmlNode * msg, int call_id, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 186                                  xmlNode * output, void *user_data)
 187 {
 188     char *node_uuid = user_data;
 189 
 190     do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE,
 191                         "Deletion of the unknown conflicting node \"%s\": %s (rc=%d)",
 192                         node_uuid, pcmk_strerror(rc), rc);
 193 }
 194 
 195 static void
 196 search_conflicting_node_callback(xmlNode * msg, int call_id, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 197                                  xmlNode * output, void *user_data)
 198 {
 199     char *new_node_uuid = user_data;
 200     xmlNode *node_xml = NULL;
 201 
 202     if (rc != pcmk_ok) {
 203         if (rc != -ENXIO) {
 204             crm_notice("Searching conflicting nodes for %s failed: %s (%d)",
 205                        new_node_uuid, pcmk_strerror(rc), rc);
 206         }
 207         return;
 208 
 209     } else if (output == NULL) {
 210         return;
 211     }
 212 
 213     if (pcmk__str_eq(crm_element_name(output), XML_CIB_TAG_NODE, pcmk__str_casei)) {
 214         node_xml = output;
 215 
 216     } else {
 217         node_xml = pcmk__xml_first_child(output);
 218     }
 219 
 220     for (; node_xml != NULL; node_xml = pcmk__xml_next(node_xml)) {
 221         const char *node_uuid = NULL;
 222         const char *node_uname = NULL;
 223         GHashTableIter iter;
 224         crm_node_t *node = NULL;
 225         gboolean known = FALSE;
 226 
 227         if (!pcmk__str_eq(crm_element_name(node_xml), XML_CIB_TAG_NODE, pcmk__str_casei)) {
 228             continue;
 229         }
 230 
 231         node_uuid = crm_element_value(node_xml, XML_ATTR_ID);
 232         node_uname = crm_element_value(node_xml, XML_ATTR_UNAME);
 233 
 234         if (node_uuid == NULL || node_uname == NULL) {
 235             continue;
 236         }
 237 
 238         g_hash_table_iter_init(&iter, crm_peer_cache);
 239         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 240             if (node->uuid
 241                 && pcmk__str_eq(node->uuid, node_uuid, pcmk__str_casei)
 242                 && node->uname
 243                 && pcmk__str_eq(node->uname, node_uname, pcmk__str_casei)) {
 244 
 245                 known = TRUE;
 246                 break;
 247             }
 248         }
 249 
 250         if (known == FALSE) {
 251             cib_t *cib_conn = controld_globals.cib_conn;
 252             int delete_call_id = 0;
 253             xmlNode *node_state_xml = NULL;
 254 
 255             crm_notice("Deleting unknown node %s/%s which has conflicting uname with %s",
 256                        node_uuid, node_uname, new_node_uuid);
 257 
 258             delete_call_id = cib_conn->cmds->remove(cib_conn, XML_CIB_TAG_NODES,
 259                                                     node_xml, cib_scope_local);
 260             fsa_register_cib_callback(delete_call_id, strdup(node_uuid),
 261                                       remove_conflicting_node_callback);
 262 
 263             node_state_xml = create_xml_node(NULL, XML_CIB_TAG_STATE);
 264             crm_xml_add(node_state_xml, XML_ATTR_ID, node_uuid);
 265             crm_xml_add(node_state_xml, XML_ATTR_UNAME, node_uname);
 266 
 267             delete_call_id = cib_conn->cmds->remove(cib_conn,
 268                                                     XML_CIB_TAG_STATUS,
 269                                                     node_state_xml,
 270                                                     cib_scope_local);
 271             fsa_register_cib_callback(delete_call_id, strdup(node_uuid),
 272                                       remove_conflicting_node_callback);
 273             free_xml(node_state_xml);
 274         }
 275     }
 276 }
 277 
 278 static void
 279 node_list_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 280 {
 281     fsa_data_t *msg_data = NULL;
 282 
 283     if(call_id < pcmk_ok) {
 284         crm_err("Node list update failed: %s (%d)", pcmk_strerror(call_id), call_id);
 285         crm_log_xml_debug(msg, "update:failed");
 286         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 287 
 288     } else if(rc < pcmk_ok) {
 289         crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 290         crm_log_xml_debug(msg, "update:failed");
 291         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 292     }
 293 }
 294 
 295 void
 296 populate_cib_nodes(enum node_update_flags flags, const char *source)
     /* [previous][next][first][last][top][bottom][index][help] */
 297 {
 298     cib_t *cib_conn = controld_globals.cib_conn;
 299 
 300     int call_id = 0;
 301     gboolean from_hashtable = TRUE;
 302     xmlNode *node_list = create_xml_node(NULL, XML_CIB_TAG_NODES);
 303 
 304 #if SUPPORT_COROSYNC
 305     if (!pcmk_is_set(flags, node_update_quick) && is_corosync_cluster()) {
 306         from_hashtable = pcmk__corosync_add_nodes(node_list);
 307     }
 308 #endif
 309 
 310     if (from_hashtable) {
 311         GHashTableIter iter;
 312         crm_node_t *node = NULL;
 313         GString *xpath = NULL;
 314 
 315         g_hash_table_iter_init(&iter, crm_peer_cache);
 316         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 317             xmlNode *new_node = NULL;
 318 
 319             if ((node->uuid != NULL) && (node->uname != NULL)) {
 320                 crm_trace("Creating node entry for %s/%s", node->uname, node->uuid);
 321                 if (xpath == NULL) {
 322                     xpath = g_string_sized_new(512);
 323                 } else {
 324                     g_string_truncate(xpath, 0);
 325                 }
 326 
 327                 /* We need both to be valid */
 328                 new_node = create_xml_node(node_list, XML_CIB_TAG_NODE);
 329                 crm_xml_add(new_node, XML_ATTR_ID, node->uuid);
 330                 crm_xml_add(new_node, XML_ATTR_UNAME, node->uname);
 331 
 332                 /* Search and remove unknown nodes with the conflicting uname from CIB */
 333                 pcmk__g_strcat(xpath,
 334                                "/" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION
 335                                "/" XML_CIB_TAG_NODES "/" XML_CIB_TAG_NODE
 336                                "[@" XML_ATTR_UNAME "='", node->uname, "']"
 337                                "[@" XML_ATTR_ID "!='", node->uuid, "']", NULL);
 338 
 339                 call_id = cib_conn->cmds->query(cib_conn,
 340                                                 (const char *) xpath->str,
 341                                                 NULL,
 342                                                 cib_scope_local|cib_xpath);
 343                 fsa_register_cib_callback(call_id, strdup(node->uuid),
 344                                           search_conflicting_node_callback);
 345             }
 346         }
 347 
 348         if (xpath != NULL) {
 349             g_string_free(xpath, TRUE);
 350         }
 351     }
 352 
 353     crm_trace("Populating <nodes> section from %s", from_hashtable ? "hashtable" : "cluster");
 354 
 355     if ((controld_update_cib(XML_CIB_TAG_NODES, node_list, cib_scope_local,
 356                              node_list_update_callback) == pcmk_rc_ok)
 357          && (crm_peer_cache != NULL) && AM_I_DC) {
 358         /*
 359          * There is no need to update the local CIB with our values if
 360          * we've not seen valid membership data
 361          */
 362         GHashTableIter iter;
 363         crm_node_t *node = NULL;
 364 
 365         free_xml(node_list);
 366         node_list = create_xml_node(NULL, XML_CIB_TAG_STATUS);
 367 
 368         g_hash_table_iter_init(&iter, crm_peer_cache);
 369         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 370             create_node_state_update(node, flags, node_list, source);
 371         }
 372 
 373         if (crm_remote_peer_cache) {
 374             g_hash_table_iter_init(&iter, crm_remote_peer_cache);
 375             while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 376                 create_node_state_update(node, flags, node_list, source);
 377             }
 378         }
 379 
 380         controld_update_cib(XML_CIB_TAG_STATUS, node_list, cib_scope_local,
 381                             crmd_node_update_complete);
 382     }
 383     free_xml(node_list);
 384 }
 385 
 386 static void
 387 cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 388 {
 389     fsa_data_t *msg_data = NULL;
 390 
 391     if (rc == pcmk_ok) {
 392         crm_trace("Quorum update %d complete", call_id);
 393 
 394     } else {
 395         crm_err("Quorum update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 396         crm_log_xml_debug(msg, "failed");
 397         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 398     }
 399 }
 400 
 401 void
 402 crm_update_quorum(gboolean quorum, gboolean force_update)
     /* [previous][next][first][last][top][bottom][index][help] */
 403 {
 404     bool has_quorum = pcmk_is_set(controld_globals.flags, controld_has_quorum);
 405 
 406     if (quorum) {
 407         controld_set_global_flags(controld_ever_had_quorum);
 408 
 409     } else if (pcmk_all_flags_set(controld_globals.flags,
 410                                   controld_ever_had_quorum
 411                                   |controld_no_quorum_suicide)) {
 412         pcmk__panic(__func__);
 413     }
 414 
 415     if (AM_I_DC
 416         && ((has_quorum && !quorum) || (!has_quorum && quorum)
 417             || force_update)) {
 418         xmlNode *update = NULL;
 419 
 420         update = create_xml_node(NULL, XML_TAG_CIB);
 421         crm_xml_add_int(update, XML_ATTR_HAVE_QUORUM, quorum);
 422         crm_xml_add(update, XML_ATTR_DC_UUID, controld_globals.our_uuid);
 423 
 424         crm_debug("Updating quorum status to %s", pcmk__btoa(quorum));
 425         controld_update_cib(XML_TAG_CIB, update, cib_scope_local,
 426                             cib_quorum_update_complete);
 427         free_xml(update);
 428 
 429         /* Quorum changes usually cause a new transition via other activity:
 430          * quorum gained via a node joining will abort via the node join,
 431          * and quorum lost via a node leaving will usually abort via resource
 432          * activity and/or fencing.
 433          *
 434          * However, it is possible that nothing else causes a transition (e.g.
 435          * someone forces quorum via corosync-cmaptcl, or quorum is lost due to
 436          * a node in standby shutting down cleanly), so here ensure a new
 437          * transition is triggered.
 438          */
 439         if (quorum) {
 440             /* If quorum was gained, abort after a short delay, in case multiple
 441              * nodes are joining around the same time, so the one that brings us
 442              * to quorum doesn't cause all the remaining ones to be fenced.
 443              */
 444             abort_after_delay(INFINITY, pcmk__graph_restart, "Quorum gained",
 445                               5000);
 446         } else {
 447             abort_transition(INFINITY, pcmk__graph_restart, "Quorum lost",
 448                              NULL);
 449         }
 450     }
 451 
 452     if (quorum) {
 453         controld_set_global_flags(controld_has_quorum);
 454     } else {
 455         controld_clear_global_flags(controld_has_quorum);
 456     }
 457 }

/* [previous][next][first][last][top][bottom][index][help] */