root/crmd/membership.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. reap_dead_nodes
  2. post_cache_update
  3. crmd_node_update_complete
  4. create_node_state_update
  5. remove_conflicting_node_callback
  6. search_conflicting_node_callback
  7. node_list_update_callback
  8. populate_cib_nodes
  9. cib_quorum_update_complete
  10. crm_update_quorum

   1 /*
   2  * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This software is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 
  19 /* put these first so that uuid_t is defined without conflicts */
  20 #include <crm_internal.h>
  21 
  22 #include <string.h>
  23 
  24 #include <crm/crm.h>
  25 
  26 #include <crm/msg_xml.h>
  27 #include <crm/common/xml.h>
  28 #include <crm/cluster/internal.h>
  29 #include <crmd_messages.h>
  30 #include <crmd_fsa.h>
  31 #include <crmd_lrm.h>
  32 #include <fsa_proto.h>
  33 #include <crmd_callbacks.h>
  34 #include <tengine.h>
  35 #include <membership.h>
  36 #include <crmd.h>
  37 
  38 gboolean membership_flux_hack = FALSE;
  39 void post_cache_update(int instance);
  40 
  41 int last_peer_update = 0;
  42 guint highest_born_on = -1;
  43 
  44 extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source);
  45 
  46 static void
  47 reap_dead_nodes(gpointer key, gpointer value, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  48 {
  49     crm_node_t *node = value;
  50 
  51     if (crm_is_peer_active(node) == FALSE) {
  52         crm_update_peer_join(__FUNCTION__, node, crm_join_none);
  53 
  54         if(node && node->uname) {
  55             election_remove(fsa_election, node->uname);
  56 
  57             if (safe_str_eq(fsa_our_uname, node->uname)) {
  58                 crm_err("We're not part of the cluster anymore");
  59                 register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);
  60 
  61             } else if (AM_I_DC == FALSE && safe_str_eq(node->uname, fsa_our_dc)) {
  62                 crm_warn("Our DC node (%s) left the cluster", node->uname);
  63                 register_fsa_input(C_FSA_INTERNAL, I_ELECTION, NULL);
  64             }
  65         }
  66 
  67         if (fsa_state == S_INTEGRATION || fsa_state == S_FINALIZE_JOIN) {
  68             check_join_state(fsa_state, __FUNCTION__);
  69         }
  70         if(node && node->uuid) {
  71             fail_incompletable_actions(transition_graph, node->uuid);
  72         }
  73     }
  74 }
  75 
  76 gboolean ever_had_quorum = FALSE;
  77 
  78 void
  79 post_cache_update(int instance)
     /* [previous][next][first][last][top][bottom][index][help] */
  80 {
  81     xmlNode *no_op = NULL;
  82 
  83     crm_peer_seq = instance;
  84     crm_debug("Updated cache after membership event %d.", instance);
  85 
  86     g_hash_table_foreach(crm_peer_cache, reap_dead_nodes, NULL);
  87     set_bit(fsa_input_register, R_MEMBERSHIP);
  88 
  89     if (AM_I_DC) {
  90         populate_cib_nodes(node_update_quick | node_update_cluster | node_update_peer |
  91                            node_update_expected, __FUNCTION__);
  92     }
  93 
  94     /*
  95      * If we lost nodes, we should re-check the election status
  96      * Safe to call outside of an election
  97      */
  98     register_fsa_action(A_ELECTION_CHECK);
  99 
 100     /* Membership changed, remind everyone we're here.
 101      * This will aid detection of duplicate DCs
 102      */
 103     no_op = create_request(CRM_OP_NOOP, NULL, NULL, CRM_SYSTEM_CRMD,
 104                            AM_I_DC ? CRM_SYSTEM_DC : CRM_SYSTEM_CRMD, NULL);
 105     send_cluster_message(NULL, crm_msg_crmd, no_op, FALSE);
 106     free_xml(no_op);
 107 }
 108 
 109 static void
 110 crmd_node_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 111 {
 112     fsa_data_t *msg_data = NULL;
 113 
 114     last_peer_update = 0;
 115 
 116     if (rc == pcmk_ok) {
 117         crm_trace("Node update %d complete", call_id);
 118 
 119     } else if(call_id < pcmk_ok) {
 120         crm_err("Node update failed: %s (%d)", pcmk_strerror(call_id), call_id);
 121         crm_log_xml_debug(msg, "failed");
 122         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 123 
 124     } else {
 125         crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 126         crm_log_xml_debug(msg, "failed");
 127         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 128     }
 129 }
 130 
 131 /*!
 132  * \internal
 133  * \brief Create an XML node state tag with updates
 134  *
 135  * \param[in,out] node    Node whose state will be used for update
 136  * \param[in]     flags   Bitmask of node_update_flags indicating what to update
 137  * \param[in,out] parent  XML node to contain update (or NULL)
 138  * \param[in]     source  Who requested the update (only used for logging)
 139  *
 140  * \return Pointer to created node state tag
 141  */
 142 xmlNode *
 143 create_node_state_update(crm_node_t *node, int flags, xmlNode *parent,
     /* [previous][next][first][last][top][bottom][index][help] */
 144                          const char *source)
 145 {
 146     const char *value = NULL;
 147     xmlNode *node_state;
 148 
 149     if (!node->state) {
 150         crm_info("Node update for %s cancelled: no state, not seen yet", node->uname);
 151        return NULL;
 152     }
 153 
 154     node_state = create_xml_node(parent, XML_CIB_TAG_STATE);
 155 
 156     if (is_set(node->flags, crm_remote_node)) {
 157         crm_xml_add(node_state, XML_NODE_IS_REMOTE, XML_BOOLEAN_TRUE);
 158     }
 159 
 160     set_uuid(node_state, XML_ATTR_UUID, node);
 161 
 162     if (crm_element_value(node_state, XML_ATTR_UUID) == NULL) {
 163         crm_info("Node update for %s cancelled: no id", node->uname);
 164         free_xml(node_state);
 165         return NULL;
 166     }
 167 
 168     crm_xml_add(node_state, XML_ATTR_UNAME, node->uname);
 169 
 170     if ((flags & node_update_cluster) && node->state) {
 171         crm_xml_add_boolean(node_state, XML_NODE_IN_CLUSTER,
 172                             safe_str_eq(node->state, CRM_NODE_MEMBER));
 173     }
 174 
 175     if (!is_set(node->flags, crm_remote_node)) {
 176         if (flags & node_update_peer) {
 177             value = OFFLINESTATUS;
 178             if (node->processes & proc_flags) {
 179                 value = ONLINESTATUS;
 180             }
 181             crm_xml_add(node_state, XML_NODE_IS_PEER, value);
 182         }
 183 
 184         if (flags & node_update_join) {
 185             if (node->join <= crm_join_none) {
 186                 value = CRMD_JOINSTATE_DOWN;
 187             } else {
 188                 value = CRMD_JOINSTATE_MEMBER;
 189             }
 190             crm_xml_add(node_state, XML_NODE_JOIN_STATE, value);
 191         }
 192 
 193         if (flags & node_update_expected) {
 194             crm_xml_add(node_state, XML_NODE_EXPECTED, node->expected);
 195         }
 196     }
 197 
 198     crm_xml_add(node_state, XML_ATTR_ORIGIN, source);
 199 
 200     return node_state;
 201 }
 202 
 203 static void
 204 remove_conflicting_node_callback(xmlNode * msg, int call_id, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 205                                  xmlNode * output, void *user_data)
 206 {
 207     char *node_uuid = user_data;
 208 
 209     do_crm_log_unlikely(rc == 0 ? LOG_DEBUG : LOG_NOTICE,
 210                         "Deletion of the unknown conflicting node \"%s\": %s (rc=%d)",
 211                         node_uuid, pcmk_strerror(rc), rc);
 212 }
 213 
 214 static void
 215 search_conflicting_node_callback(xmlNode * msg, int call_id, int rc,
     /* [previous][next][first][last][top][bottom][index][help] */
 216                                  xmlNode * output, void *user_data)
 217 {
 218     char *new_node_uuid = user_data;
 219     xmlNode *node_xml = NULL;
 220 
 221     if (rc != pcmk_ok) {
 222         if (rc != -ENXIO) {
 223             crm_notice("Searching conflicting nodes for %s failed: %s (%d)",
 224                        new_node_uuid, pcmk_strerror(rc), rc);
 225         }
 226         return;
 227 
 228     } else if (output == NULL) {
 229         return;
 230     }
 231 
 232     if (safe_str_eq(crm_element_name(output), XML_CIB_TAG_NODE)) {
 233         node_xml = output;
 234 
 235     } else {
 236         node_xml = __xml_first_child(output);
 237     }
 238 
 239     for (; node_xml != NULL; node_xml = __xml_next(node_xml)) {
 240         const char *node_uuid = NULL;
 241         const char *node_uname = NULL;
 242         GHashTableIter iter;
 243         crm_node_t *node = NULL;
 244         gboolean known = FALSE;
 245 
 246         if (safe_str_neq(crm_element_name(node_xml), XML_CIB_TAG_NODE)) {
 247             continue;
 248         }
 249 
 250         node_uuid = crm_element_value(node_xml, XML_ATTR_ID);
 251         node_uname = crm_element_value(node_xml, XML_ATTR_UNAME);
 252 
 253         if (node_uuid == NULL || node_uname == NULL) {
 254             continue;
 255         }
 256 
 257         g_hash_table_iter_init(&iter, crm_peer_cache);
 258         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 259             if (node->uuid
 260                 && safe_str_eq(node->uuid, node_uuid)
 261                 && node->uname
 262                 && safe_str_eq(node->uname, node_uname)) {
 263 
 264                 known = TRUE;
 265                 break;
 266             }
 267         }
 268 
 269         if (known == FALSE) {
 270             int delete_call_id = 0;
 271             xmlNode *node_state_xml = NULL;
 272 
 273             crm_notice("Deleting unknown node %s/%s which has conflicting uname with %s",
 274                        node_uuid, node_uname, new_node_uuid);
 275 
 276             delete_call_id = fsa_cib_conn->cmds->delete(fsa_cib_conn, XML_CIB_TAG_NODES, node_xml,
 277                                                         cib_scope_local | cib_quorum_override);
 278             fsa_register_cib_callback(delete_call_id, FALSE, strdup(node_uuid),
 279                                       remove_conflicting_node_callback);
 280 
 281             node_state_xml = create_xml_node(NULL, XML_CIB_TAG_STATE);
 282             crm_xml_add(node_state_xml, XML_ATTR_ID, node_uuid);
 283             crm_xml_add(node_state_xml, XML_ATTR_UNAME, node_uname);
 284 
 285             delete_call_id = fsa_cib_conn->cmds->delete(fsa_cib_conn, XML_CIB_TAG_STATUS, node_state_xml,
 286                                                         cib_scope_local | cib_quorum_override);
 287             fsa_register_cib_callback(delete_call_id, FALSE, strdup(node_uuid),
 288                                       remove_conflicting_node_callback);
 289             free_xml(node_state_xml);
 290         }
 291     }
 292 }
 293 
 294 static void
 295 node_list_update_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 296 {
 297     fsa_data_t *msg_data = NULL;
 298 
 299     if(call_id < pcmk_ok) {
 300         crm_err("Node list update failed: %s (%d)", pcmk_strerror(call_id), call_id);
 301         crm_log_xml_debug(msg, "update:failed");
 302         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 303 
 304     } else if(rc < pcmk_ok) {
 305         crm_err("Node update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 306         crm_log_xml_debug(msg, "update:failed");
 307         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 308     }
 309 }
 310 
 311 #define NODE_PATH_MAX 512
 312 
 313 void
 314 populate_cib_nodes(enum node_update_flags flags, const char *source)
     /* [previous][next][first][last][top][bottom][index][help] */
 315 {
 316     int call_id = 0;
 317     gboolean from_hashtable = TRUE;
 318     int call_options = cib_scope_local | cib_quorum_override;
 319     xmlNode *node_list = create_xml_node(NULL, XML_CIB_TAG_NODES);
 320 
 321 #if SUPPORT_HEARTBEAT
 322     if (is_not_set(flags, node_update_quick) && is_heartbeat_cluster()) {
 323         from_hashtable = heartbeat_initialize_nodelist(fsa_cluster_conn, FALSE, node_list);
 324     }
 325 #endif
 326 
 327 #if SUPPORT_COROSYNC
 328 #  if !SUPPORT_PLUGIN
 329     if (is_not_set(flags, node_update_quick) && is_corosync_cluster()) {
 330         from_hashtable = corosync_initialize_nodelist(NULL, FALSE, node_list);
 331     }
 332 #  endif
 333 #endif
 334 
 335     if (from_hashtable) {
 336         GHashTableIter iter;
 337         crm_node_t *node = NULL;
 338 
 339         g_hash_table_iter_init(&iter, crm_peer_cache);
 340         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 341             xmlNode *new_node = NULL;
 342 
 343             crm_trace("Creating node entry for %s/%s", node->uname, node->uuid);
 344             if(node->uuid && node->uname) {
 345                 char xpath[NODE_PATH_MAX];
 346 
 347                 /* We need both to be valid */
 348                 new_node = create_xml_node(node_list, XML_CIB_TAG_NODE);
 349                 crm_xml_add(new_node, XML_ATTR_ID, node->uuid);
 350                 crm_xml_add(new_node, XML_ATTR_UNAME, node->uname);
 351 
 352                 /* Search and remove unknown nodes with the conflicting uname from CIB */
 353                 snprintf(xpath, NODE_PATH_MAX,
 354                          "/" XML_TAG_CIB "/" XML_CIB_TAG_CONFIGURATION "/" XML_CIB_TAG_NODES
 355                          "/" XML_CIB_TAG_NODE "[@uname='%s'][@id!='%s']",
 356                          node->uname, node->uuid);
 357 
 358                 call_id = fsa_cib_conn->cmds->query(fsa_cib_conn, xpath, NULL,
 359                                                     cib_scope_local | cib_xpath);
 360                 fsa_register_cib_callback(call_id, FALSE, strdup(node->uuid),
 361                                           search_conflicting_node_callback);
 362             }
 363         }
 364     }
 365 
 366     crm_trace("Populating <nodes> section from %s", from_hashtable ? "hashtable" : "cluster");
 367 
 368     fsa_cib_update(XML_CIB_TAG_NODES, node_list, call_options, call_id, NULL);
 369     fsa_register_cib_callback(call_id, FALSE, NULL, node_list_update_callback);
 370 
 371     free_xml(node_list);
 372 
 373     if (call_id >= pcmk_ok && crm_peer_cache != NULL && AM_I_DC) {
 374         /*
 375          * There is no need to update the local CIB with our values if
 376          * we've not seen valid membership data
 377          */
 378         GHashTableIter iter;
 379         crm_node_t *node = NULL;
 380 
 381         node_list = create_xml_node(NULL, XML_CIB_TAG_STATUS);
 382 
 383         g_hash_table_iter_init(&iter, crm_peer_cache);
 384         while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 385             create_node_state_update(node, flags, node_list, source);
 386         }
 387 
 388         if (crm_remote_peer_cache) {
 389             g_hash_table_iter_init(&iter, crm_remote_peer_cache);
 390             while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 391                 create_node_state_update(node, flags, node_list, source);
 392             }
 393         }
 394 
 395         fsa_cib_update(XML_CIB_TAG_STATUS, node_list, call_options, call_id, NULL);
 396         fsa_register_cib_callback(call_id, FALSE, NULL, crmd_node_update_complete);
 397         last_peer_update = call_id;
 398 
 399         free_xml(node_list);
 400     }
 401 }
 402 
 403 static void
 404 cib_quorum_update_complete(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 405 {
 406     fsa_data_t *msg_data = NULL;
 407 
 408     if (rc == pcmk_ok) {
 409         crm_trace("Quorum update %d complete", call_id);
 410 
 411     } else {
 412         crm_err("Quorum update %d failed: %s (%d)", call_id, pcmk_strerror(rc), rc);
 413         crm_log_xml_debug(msg, "failed");
 414         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 415     }
 416 }
 417 
 418 void
 419 crm_update_quorum(gboolean quorum, gboolean force_update)
     /* [previous][next][first][last][top][bottom][index][help] */
 420 {
 421     ever_had_quorum |= quorum;
 422 
 423     if(ever_had_quorum && quorum == FALSE && no_quorum_suicide_escalation) {
 424         pcmk_panic(__FUNCTION__);
 425     }
 426 
 427     if (AM_I_DC && (force_update || fsa_has_quorum != quorum)) {
 428         int call_id = 0;
 429         xmlNode *update = NULL;
 430         int call_options = cib_scope_local | cib_quorum_override;
 431 
 432         update = create_xml_node(NULL, XML_TAG_CIB);
 433         crm_xml_add_int(update, XML_ATTR_HAVE_QUORUM, quorum);
 434         crm_xml_add(update, XML_ATTR_DC_UUID, fsa_our_uuid);
 435 
 436         fsa_cib_update(XML_TAG_CIB, update, call_options, call_id, NULL);
 437         crm_debug("Updating quorum status to %s (call=%d)", quorum ? "true" : "false", call_id);
 438         fsa_register_cib_callback(call_id, FALSE, NULL, cib_quorum_update_complete);
 439         free_xml(update);
 440 
 441         /* If a node not running any resources is cleanly shut down and drops us
 442          * below quorum, we won't necessarily abort the transition, so abort it
 443          * here to be safe.
 444          */
 445         if (quorum == FALSE) {
 446             abort_transition(INFINITY, tg_restart, "Quorum loss", NULL);
 447         }
 448     }
 449     fsa_has_quorum = quorum;
 450 }

/* [previous][next][first][last][top][bottom][index][help] */