root/daemons/controld/controld_membership.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. reap_dead_nodes
  2. post_cache_update
  3. crmd_node_update_complete
  4. create_node_state_update
  5. remove_conflicting_node_callback
  6. search_conflicting_node_callback
  7. node_list_update_callback
  8. populate_cib_nodes
  9. cib_quorum_update_complete
  10. crm_update_quorum

   1 /*
   2  * Copyright 2004-2020 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 /* put these first so that uuid_t is defined without conflicts */
  11 #include <crm_internal.h>
  12 
  13 #include <string.h>
  14 
  15 #include <crm/crm.h>
  16 #include <crm/msg_xml.h>
  17 #include <crm/common/xml.h>
  18 #include <crm/common/xml_internal.h>
  19 #include <crm/cluster/internal.h>
  20 
  21 #include <pacemaker-controld.h>
  22 
  23 gboolean membership_flux_hack = FALSE;
  24 void post_cache_update(int instance);
  25 
  26 int last_peer_update = 0;
  27 guint highest_born_on = -1;
  28 
  29 extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source);
  30 
  31 static void
  32 reap_dead_nodes(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  33 {
  34     crm_node_t *node = value;
  35 
  36     if (crm_is_peer_active(node) == FALSE) {
  37         crm_update_peer_join(__func__, node, crm_join_none);
  38 
  39         if(node && node->uname) {
  40             if (pcmk__str_eq(fsa_our_uname, node->uname, pcmk__str_casei)) {
  41                 crm_err("We're not part of the cluster anymore");
  42                 register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);
  43 
  44             } else if (AM_I_DC == FALSE && pcmk__str_eq(node->uname, fsa_our_dc, pcmk__str_casei)) {
  45                 crm_warn("Our DC node (%s) left the cluster", node->uname);
  46                 register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL);
  47             }
  48         }
  49 
  50         if (fsa_state == S_INTEGRATION || fsa_state == S_FINALIZE_JOIN) {
  51             check_join_state(fsa_state, __func__);
  52         }
  53         if(node && node->uuid) {
  54             fail_incompletable_actions(transition_graph, node->uuid);
  55         }
  56     }
  57 }
  58 
  59 gboolean ever_had_quorum = FALSE;
  60 
  61 void
  62 post_cache_update(int instance)
     /* [previous][next][first][last][top][bottom][index][help] */
  63 {
  64     xmlNode *no_op = NULL;
  65 
  66     crm_peer_seq = instance;
  67     crm_debug("Updated cache after membership event %d.", instance);
  68 
  69     g_hash_table_foreach(crm_peer_cache, reap_dead_nodes, NULL);
  70     controld_set_fsa_input_flags(R_MEMBERSHIP);
  71 
  72     if (AM_I_DC) {
  73         populate_cib_nodes(node_update_quick | node_update_cluster | node_update_peer |
  74                            node_update_expected, __func__);
  75     }
  76 
  77     /*
  78      * If we lost nodes, we should re-check the election status
  79      * Safe to call outside of an election
  80      */
  81     controld_set_fsa_action_flags(A_ELECTION_CHECK);
  82     trigger_fsa();
  83 
  84     /* Membership changed, remind everyone we're here.
  85      * This will aid detection of duplicate DCs
  86      */
  87     no_op = create_request(CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD,
  88                            AM_I_DC ? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD, NULL);
  89     send_cluster_message(NULL, crm_msg_crmd, no_op, FALSE);
  90     free_xml(no_op);
  91 }
  92 
  93 static void
  94 crmd_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  95 {
  96     fsa_data_t *msg_data = NULL;
  97 
  98     last_peer_update = 0;
  99 
 100     if (rc == pcmk_ok) {
 101         crm_trace("Node update %d complete", call_id);
 102 
 103     } else if(call_id < pcmk_ok) {
 104         crm_err("Node update failed: %s (%d)", pcmk_strerror(call_id), call_id);
 105         crm_log_xml_debug(msg, "failed");
 106         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 107 
 108     } else {
 109         crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 110         crm_log_xml_debug(msg, "failed");
 111         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 112     }
 113 }
 114 
 115 /*!
 116  * \internal
 117  * \brief Create an XML node state tag with updates
 118  *
 119  * \param[in,out] node    Node whose state will be used for update
 120  * \param[in]     flags   Bitmask of node_update_flags indicating what to update
 121  * \param[in,out] parent  XML node to contain update (or NULL)
 122  * \param[in]     source  Who requested the update (only used for logging)
 123  *
 124  * \return Pointer to created node state tag
 125  */
 126 xmlNode *
 127 create_node_state_update(crm_node_t *node, int flags, xmlNode *parent,
     /* [previous][next][first][last][top][bottom][index][help] */
 128                          const char *source)
 129 {
 130     const char *value = NULL;
 131     xmlNode *node_state;
 132 
 133     if (!node->state) {
 134         crm_info("Node update for %s cancelled: no state, not seen yet", node->uname);
 135        return NULL;
 136     }
 137 
 138     node_state = create_xml_node(parent, XML_CIB_TAG_STATE);
 139 
 140     if (pcmk_is_set(node->flags, crm_remote_node)) {
 141         crm_xml_add(node_state, XML_NODE_IS_REMOTE, XML_BOOLEAN_TRUE);
 142     }
 143 
 144     set_uuid(node_state, XML_ATTR_UUID, node);
 145 
 146     if (crm_element_value(node_state, XML_ATTR_UUID) == NULL) {
 147         crm_info("Node update for %s cancelled: no id", node->uname);
 148         free_xml(node_state);
 149         return NULL;
 150     }
 151 
 152     crm_xml_add(node_state, XML_ATTR_UNAME, node->uname);
 153 
 154     if ((flags & node_update_cluster) && node->state) {
 155         crm_xml_add_boolean(node_state, XML_NODE_IN_CLUSTER,
 156                             pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei));
 157     }
 158 
 159     if (!pcmk_is_set(node->flags, crm_remote_node)) {
 160         if (flags & node_update_peer) {
 161             value = OFFLINESTATUS;
 162             if (pcmk_is_set(node->processes, crm_get_cluster_proc())) {
 163                 value = ONLINESTATUS;
 164             }
 165             crm_xml_add(node_state, XML_NODE_IS_PEER, value);
 166         }
 167 
 168         if (flags & node_update_join) {
 169             if (node->join <= crm_join_none) {
 170                 value = CRMD_JOINSTATE_DOWN;
 171             } else {
 172                 value = CRMD_JOINSTATE_MEMBER;
 173             }
 174             crm_xml_add(node_state, XML_NODE_JOIN_STATE, value);
 175         }
 176 
 177         if (flags & node_update_expected) {
 178             crm_xml_add(node_state, XML_NODE_EXPECTED, node->expected);
 179         }
 180     }
 181 
 182     crm_xml_add(node_state, XML_ATTR_ORIGIN, source);
 183 
 184     return node_state;
 185 }
 186 
 187 static void
 188 remove_conflicting_node_callback(xmlNode * msg, int call_id, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 189                                  xmlNode * output, void *user_data)
 190 {
 191     char *node_uuid = user_data;
 192 
 193     do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE,
 194                         "Deletion of the unknown conflicting node \"%s\": %s (rc=%d)",
 195                         node_uuid, pcmk_strerror(rc), rc);
 196 }
 197 
 198 static void
 199 search_conflicting_node_callback(xmlNode * msg, int call_id, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 200                                  xmlNode * output, void *user_data)
 201 {
 202     char *new_node_uuid = user_data;
 203     xmlNode *node_xml = NULL;
 204 
 205     if (rc != pcmk_ok) {
 206         if (rc != -ENXIO) {
 207             crm_notice("Searching conflicting nodes for %s failed: %s (%d)",
 208                        new_node_uuid, pcmk_strerror(rc), rc);
 209         }
 210         return;
 211 
 212     } else if (output == NULL) {
 213         return;
 214     }
 215 
 216     if (pcmk__str_eq(crm_element_name(output), XML_CIB_TAG_NODE, pcmk__str_casei)) {
 217         node_xml = output;
 218 
 219     } else {
 220         node_xml = pcmk__xml_first_child(output);
 221     }
 222 
 223     for (; node_xml != NULL; node_xml = pcmk__xml_next(node_xml)) {
 224         const char *node_uuid = NULL;
 225         const char *node_uname = NULL;
 226         GHashTableIter iter;
 227         crm_node_t *node = NULL;
 228         gboolean known = FALSE;
 229 
 230         if (!pcmk__str_eq(crm_element_name(node_xml), XML_CIB_TAG_NODE, pcmk__str_casei)) {
 231             continue;
 232         }
 233 
 234         node_uuid = crm_element_value(node_xml, XML_ATTR_ID);
 235         node_uname = crm_element_value(node_xml, XML_ATTR_UNAME);
 236 
 237         if (node_uuid == NULL || node_uname == NULL) {
 238             continue;
 239         }
 240 
 241         g_hash_table_iter_init(&iter, crm_peer_cache);
 242         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 243             if (node->uuid
 244                 && pcmk__str_eq(node->uuid, node_uuid, pcmk__str_casei)
 245                 && node->uname
 246                 && pcmk__str_eq(node->uname, node_uname, pcmk__str_casei)) {
 247 
 248                 known = TRUE;
 249                 break;
 250             }
 251         }
 252 
 253         if (known == FALSE) {
 254             int delete_call_id = 0;
 255             xmlNode *node_state_xml = NULL;
 256 
 257             crm_notice("Deleting unknown node %s/%s which has conflicting uname with %s",
 258                        node_uuid, node_uname, new_node_uuid);
 259 
 260             delete_call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, XML_CIB_TAG_NODES, node_xml,
 261                                                         cib_scope_local | cib_quorum_override);
 262             fsa_register_cib_callback(delete_call_id, FALSE, strdup(node_uuid),
 263                                       remove_conflicting_node_callback);
 264 
 265             node_state_xml = create_xml_node(NULL, XML_CIB_TAG_STATE);
 266             crm_xml_add(node_state_xml, XML_ATTR_ID, node_uuid);
 267             crm_xml_add(node_state_xml, XML_ATTR_UNAME, node_uname);
 268 
 269             delete_call_id = fsa_cib_conn->cmds->remove(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state_xml,
 270                                                         cib_scope_local | cib_quorum_override);
 271             fsa_register_cib_callback(delete_call_id, FALSE, strdup(node_uuid),
 272                                       remove_conflicting_node_callback);
 273             free_xml(node_state_xml);
 274         }
 275     }
 276 }
 277 
 278 static void
 279 node_list_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 280 {
 281     fsa_data_t *msg_data = NULL;
 282 
 283     if(call_id < pcmk_ok) {
 284         crm_err("Node list update failed: %s (%d)", pcmk_strerror(call_id), call_id);
 285         crm_log_xml_debug(msg, "update:failed");
 286         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 287 
 288     } else if(rc < pcmk_ok) {
 289         crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 290         crm_log_xml_debug(msg, "update:failed");
 291         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 292     }
 293 }
 294 
 295 #define NODE_PATH_MAX 512
 296 
 297 void
 298 populate_cib_nodes(enum node_update_flags flags, const char *source)
     /* [previous][next][first][last][top][bottom][index][help] */
 299 {
 300     int call_id = 0;
 301     gboolean from_hashtable = TRUE;
 302     int call_options = cib_scope_local | cib_quorum_override;
 303     xmlNode *node_list = create_xml_node(NULL, XML_CIB_TAG_NODES);
 304 
 305 #if SUPPORT_COROSYNC
 306     if (!pcmk_is_set(flags, node_update_quick) && is_corosync_cluster()) {
 307         from_hashtable = corosync_initialize_nodelist(NULL, FALSE, node_list);
 308     }
 309 #endif
 310 
 311     if (from_hashtable) {
 312         GHashTableIter iter;
 313         crm_node_t *node = NULL;
 314 
 315         g_hash_table_iter_init(&iter, crm_peer_cache);
 316         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 317             xmlNode *new_node = NULL;
 318 
 319             crm_trace("Creating node entry for %s/%s", node->uname, node->uuid);
 320             if(node->uuid && node->uname) {
 321                 char xpath[NODE_PATH_MAX];
 322 
 323                 /* We need both to be valid */
 324                 new_node = create_xml_node(node_list, XML_CIB_TAG_NODE);
 325                 crm_xml_add(new_node, XML_ATTR_ID, node->uuid);
 326                 crm_xml_add(new_node, XML_ATTR_UNAME, node->uname);
 327 
 328                 /* Search and remove unknown nodes with the conflicting uname from CIB */
 329                 snprintf(xpath, NODE_PATH_MAX,
 330                          "/" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION "/" XML_CIB_TAG_NODES
 331                          "/" XML_CIB_TAG_NODE "[@uname='%s'][@id!='%s']",
 332                          node->uname, node->uuid);
 333 
 334                 call_id = fsa_cib_conn->cmds->query(fsa_cib_conn, xpath, NULL,
 335                                                     cib_scope_local | cib_xpath);
 336                 fsa_register_cib_callback(call_id, FALSE, strdup(node->uuid),
 337                                           search_conflicting_node_callback);
 338             }
 339         }
 340     }
 341 
 342     crm_trace("Populating <nodes> section from %s", from_hashtable ? "hashtable" : "cluster");
 343 
 344     fsa_cib_update(XML_CIB_TAG_NODES, node_list, call_options, call_id, NULL);
 345     fsa_register_cib_callback(call_id, FALSE, NULL, node_list_update_callback);
 346 
 347     free_xml(node_list);
 348 
 349     if (call_id >= pcmk_ok && crm_peer_cache != NULL && AM_I_DC) {
 350         /*
 351          * There is no need to update the local CIB with our values if
 352          * we've not seen valid membership data
 353          */
 354         GHashTableIter iter;
 355         crm_node_t *node = NULL;
 356 
 357         node_list = create_xml_node(NULL, XML_CIB_TAG_STATUS);
 358 
 359         g_hash_table_iter_init(&iter, crm_peer_cache);
 360         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 361             create_node_state_update(node, flags, node_list, source);
 362         }
 363 
 364         if (crm_remote_peer_cache) {
 365             g_hash_table_iter_init(&iter, crm_remote_peer_cache);
 366             while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 367                 create_node_state_update(node, flags, node_list, source);
 368             }
 369         }
 370 
 371         fsa_cib_update(XML_CIB_TAG_STATUS, node_list, call_options, call_id, NULL);
 372         fsa_register_cib_callback(call_id, FALSE, NULL, crmd_node_update_complete);
 373         last_peer_update = call_id;
 374 
 375         free_xml(node_list);
 376     }
 377 }
 378 
 379 static void
 380 cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 381 {
 382     fsa_data_t *msg_data = NULL;
 383 
 384     if (rc == pcmk_ok) {
 385         crm_trace("Quorum update %d complete", call_id);
 386 
 387     } else {
 388         crm_err("Quorum update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 389         crm_log_xml_debug(msg, "failed");
 390         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 391     }
 392 }
 393 
 394 void
 395 crm_update_quorum(gboolean quorum, gboolean force_update)
     /* [previous][next][first][last][top][bottom][index][help] */
 396 {
 397     ever_had_quorum |= quorum;
 398 
 399     if(ever_had_quorum && quorum == FALSE && no_quorum_suicide_escalation) {
 400         pcmk__panic(__func__);
 401     }
 402 
 403     if (AM_I_DC && (force_update || fsa_has_quorum != quorum)) {
 404         int call_id = 0;
 405         xmlNode *update = NULL;
 406         int call_options = cib_scope_local | cib_quorum_override;
 407 
 408         update = create_xml_node(NULL, XML_TAG_CIB);
 409         crm_xml_add_int(update, XML_ATTR_HAVE_QUORUM, quorum);
 410         crm_xml_add(update, XML_ATTR_DC_UUID, fsa_our_uuid);
 411 
 412         fsa_cib_update(XML_TAG_CIB, update, call_options, call_id, NULL);
 413         crm_debug("Updating quorum status to %s (call=%d)",
 414                   pcmk__btoa(quorum), call_id);
 415         fsa_register_cib_callback(call_id, FALSE, NULL, cib_quorum_update_complete);
 416         free_xml(update);
 417 
 418         /* Quorum changes usually cause a new transition via other activity:
 419          * quorum gained via a node joining will abort via the node join,
 420          * and quorum lost via a node leaving will usually abort via resource
 421          * activity and/or fencing.
 422          *
 423          * However, it is possible that nothing else causes a transition (e.g.
 424          * someone forces quorum via corosync-cmaptcl, or quorum is lost due to
 425          * a node in standby shutting down cleanly), so here ensure a new
 426          * transition is triggered.
 427          */
 428         if (quorum) {
 429             /* If quorum was gained, abort after a short delay, in case multiple
 430              * nodes are joining around the same time, so the one that brings us
 431              * to quorum doesn't cause all the remaining ones to be fenced.
 432              */
 433             abort_after_delay(INFINITY, tg_restart, "Quorum gained", 5000);
 434         } else {
 435             abort_transition(INFINITY, tg_restart, "Quorum lost", NULL);
 436         }
 437     }
 438     fsa_has_quorum = quorum;
 439 }

/* [previous][next][first][last][top][bottom][index][help] */