root/daemons/controld/controld_join_client.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. update_dc_expected
  2. do_cl_join_query
  3. do_cl_join_announce
  4. do_cl_join_offer_respond
  5. join_query_callback
  6. set_join_state
  7. do_cl_join_finalize_respond

   1 /*
   2  * Copyright 2004-2022 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <crm/crm.h>
  13 #include <crm/cib.h>
  14 #include <crm/msg_xml.h>
  15 #include <crm/common/xml.h>
  16 
  17 #include <pacemaker-controld.h>
  18 
/* Output object defined in another translation unit; this file passes it to
 * cib__update_node_attr() when forcing a start state
 */
extern pcmk__output_t *logger_out;

/* NOTE(review): not referenced in the visible part of this file -- confirm
 * whether anything else still uses it
 */
int reannounce_count = 0;
/* Forward declaration: the callback is registered before it is defined */
void join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data);

/* NOTE(review): declared here but not called in the visible part of this file */
extern ha_msg_input_t *copy_ha_msg_input(ha_msg_input_t * orig);
  25 
  26 /*!
  27  * \internal
  28  * \brief Remember if DC is shutting down as we join
  29  *
  30  * If we're joining while the current DC is shutting down, update its expected
  31  * state, so we don't fence it if we become the new DC. (We weren't a peer
  32  * when it broadcast its shutdown request.)
  33  *
  34  * \param[in] msg  A join message from the DC
  35  */
  36 static void
  37 update_dc_expected(xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
  38 {
  39     if (fsa_our_dc && pcmk__xe_attr_is_true(msg, F_CRM_DC_LEAVING)) {
  40         crm_node_t *dc_node = crm_get_peer(0, fsa_our_dc);
  41 
  42         pcmk__update_peer_expected(__func__, dc_node, CRMD_JOINSTATE_DOWN);
  43     }
  44 }
  45 
  46 /*      A_CL_JOIN_QUERY         */
  47 /* is there a DC out there? */
  48 void
  49 do_cl_join_query(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
  50                  enum crmd_fsa_cause cause,
  51                  enum crmd_fsa_state cur_state,
  52                  enum crmd_fsa_input current_input, fsa_data_t * msg_data)
  53 {
  54     xmlNode *req = create_request(CRM_OP_JOIN_ANNOUNCE, NULL, NULL,
  55                                   CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);
  56 
  57     sleep(1);                   // Give the cluster layer time to propagate to the DC
  58     update_dc(NULL);            /* Unset any existing value so that the result is not discarded */
  59     crm_debug("Querying for a DC");
  60     send_cluster_message(NULL, crm_msg_crmd, req, FALSE);
  61     free_xml(req);
  62 }
  63 
  64 /*       A_CL_JOIN_ANNOUNCE     */
  65 
  66 /* this is kind of a workaround for the fact that we may not be around or
  67  * are otherwise unable to reply when the DC sends out A_DC_JOIN_OFFER_ALL
  68  */
  69 void
  70 do_cl_join_announce(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
  71                     enum crmd_fsa_cause cause,
  72                     enum crmd_fsa_state cur_state,
  73                     enum crmd_fsa_input current_input, fsa_data_t * msg_data)
  74 {
  75     /* don't announce if we're in one of these states */
  76     if (cur_state != S_PENDING) {
  77         crm_warn("Not announcing cluster join because in state %s",
  78                  fsa_state2string(cur_state));
  79         return;
  80     }
  81 
  82     if (AM_I_OPERATIONAL) {
  83         /* send as a broadcast */
  84         xmlNode *req = create_request(CRM_OP_JOIN_ANNOUNCE, NULL, NULL,
  85                                       CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);
  86 
  87         crm_debug("Announcing availability");
  88         update_dc(NULL);
  89         send_cluster_message(NULL, crm_msg_crmd, req, FALSE);
  90         free_xml(req);
  91 
  92     } else {
  93         /* Delay announce until we have finished local startup */
  94         crm_warn("Delaying announce of cluster join until local startup is complete");
  95         return;
  96     }
  97 }
  98 
  99 static int query_call_id = 0;
 100 
 101 /*       A_CL_JOIN_REQUEST      */
 102 /* aka. accept the welcome offer */
 103 void
 104 do_cl_join_offer_respond(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
 105                          enum crmd_fsa_cause cause,
 106                          enum crmd_fsa_state cur_state,
 107                          enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 108 {
 109     ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);
 110     const char *welcome_from;
 111     const char *join_id;
 112 
 113     CRM_CHECK(input != NULL, return);
 114 
 115 #if 0
 116     if (we are sick) {
 117         log error;
 118 
 119         /* save the request for later? */
 120         return;
 121     }
 122 #endif
 123 
 124     welcome_from = crm_element_value(input->msg, F_CRM_HOST_FROM);
 125     join_id = crm_element_value(input->msg, F_CRM_JOIN_ID);
 126     crm_trace("Accepting cluster join offer from node %s "CRM_XS" join-%s",
 127               welcome_from, crm_element_value(input->msg, F_CRM_JOIN_ID));
 128 
 129     /* we only ever want the last one */
 130     if (query_call_id > 0) {
 131         crm_trace("Cancelling previous join query: %d", query_call_id);
 132         remove_cib_op_callback(query_call_id, FALSE);
 133         query_call_id = 0;
 134     }
 135 
 136     if (update_dc(input->msg) == FALSE) {
 137         crm_warn("Discarding cluster join offer from node %s (expected %s)",
 138                  welcome_from, fsa_our_dc);
 139         return;
 140     }
 141 
 142     update_dc_expected(input->msg);
 143 
 144     query_call_id =
 145         fsa_cib_conn->cmds->query(fsa_cib_conn, NULL, NULL, cib_scope_local | cib_no_children);
 146     fsa_register_cib_callback(query_call_id, FALSE, strdup(join_id), join_query_callback);
 147     crm_trace("Registered join query callback: %d", query_call_id);
 148 
 149     controld_set_fsa_action_flags(A_DC_TIMER_STOP);
 150     trigger_fsa();
 151 }
 152 
 153 void
 154 join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 155 {
 156     char *join_id = user_data;
 157     xmlNode *generation = create_xml_node(NULL, XML_CIB_TAG_GENERATION_TUPPLE);
 158 
 159     CRM_LOG_ASSERT(join_id != NULL);
 160 
 161     if (query_call_id != call_id) {
 162         crm_trace("Query %d superseded", call_id);
 163         goto done;
 164     }
 165 
 166     query_call_id = 0;
 167     if(rc != pcmk_ok || output == NULL) {
 168         crm_err("Could not retrieve version details for join-%s: %s (%d)",
 169                 join_id, pcmk_strerror(rc), rc);
 170         register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __func__);
 171 
 172     } else if (fsa_our_dc == NULL) {
 173         crm_debug("Membership is in flux, not continuing join-%s", join_id);
 174 
 175     } else {
 176         xmlNode *reply = NULL;
 177 
 178         crm_debug("Respond to join offer join-%s from %s", join_id, fsa_our_dc);
 179         copy_in_properties(generation, output);
 180 
 181         reply = create_request(CRM_OP_JOIN_REQUEST, generation, fsa_our_dc,
 182                                CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);
 183 
 184         crm_xml_add(reply, F_CRM_JOIN_ID, join_id);
 185         crm_xml_add(reply, XML_ATTR_CRM_VERSION, CRM_FEATURE_SET);
 186         send_cluster_message(crm_get_peer(0, fsa_our_dc), crm_msg_crmd, reply, TRUE);
 187         free_xml(reply);
 188     }
 189 
 190   done:
 191     free_xml(generation);
 192 }
 193 
 194 static void
 195 set_join_state(const char * start_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 196 {
 197     if (pcmk__str_eq(start_state, "standby", pcmk__str_casei)) {
 198         crm_notice("Forcing node %s to join in %s state per configured environment",
 199                    fsa_our_uname, start_state);
 200         cib__update_node_attr(logger_out, fsa_cib_conn, cib_sync_call,
 201                               XML_CIB_TAG_NODES, fsa_our_uuid, NULL, NULL,
 202                               NULL, "standby", "on", NULL, NULL);
 203 
 204     } else if (pcmk__str_eq(start_state, "online", pcmk__str_casei)) {
 205         crm_notice("Forcing node %s to join in %s state per configured environment",
 206                    fsa_our_uname, start_state);
 207         cib__update_node_attr(logger_out, fsa_cib_conn, cib_sync_call,
 208                               XML_CIB_TAG_NODES, fsa_our_uuid, NULL, NULL,
 209                               NULL, "standby", "off", NULL, NULL);
 210 
 211     } else if (pcmk__str_eq(start_state, "default", pcmk__str_casei)) {
 212         crm_debug("Not forcing a starting state on node %s", fsa_our_uname);
 213 
 214     } else {
 215         crm_warn("Unrecognized start state '%s', using 'default' (%s)",
 216                  start_state, fsa_our_uname);
 217     }
 218 }
 219 
 220 /*      A_CL_JOIN_RESULT        */
 221 /* aka. this is notification that we have (or have not) been accepted */
 222 void
 223 do_cl_join_finalize_respond(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
 224                             enum crmd_fsa_cause cause,
 225                             enum crmd_fsa_state cur_state,
 226                             enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 227 {
 228     xmlNode *tmp1 = NULL;
 229     gboolean was_nack = TRUE;
 230     static gboolean first_join = TRUE;
 231     ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);
 232     const char *start_state = pcmk__env_option(PCMK__ENV_NODE_START_STATE);
 233 
 234     int join_id = -1;
 235     const char *op = crm_element_value(input->msg, F_CRM_TASK);
 236     const char *welcome_from = crm_element_value(input->msg, F_CRM_HOST_FROM);
 237 
 238     if (!pcmk__str_eq(op, CRM_OP_JOIN_ACKNAK, pcmk__str_casei)) {
 239         crm_trace("Ignoring op=%s message", op);
 240         return;
 241     }
 242 
 243     /* calculate if it was an ack or a nack */
 244     if (pcmk__xe_attr_is_true(input->msg, CRM_OP_JOIN_ACKNAK)) {
 245         was_nack = FALSE;
 246     }
 247 
 248     crm_element_value_int(input->msg, F_CRM_JOIN_ID, &join_id);
 249 
 250     if (was_nack) {
 251         crm_err("Shutting down because cluster join with leader %s failed "
 252                 CRM_XS" join-%d NACK'd", welcome_from, join_id);
 253         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 254         return;
 255     }
 256 
 257     if (AM_I_DC == FALSE && pcmk__str_eq(welcome_from, fsa_our_uname, pcmk__str_casei)) {
 258         crm_warn("Discarding our own welcome - we're no longer the DC");
 259         return;
 260     }
 261 
 262     if (update_dc(input->msg) == FALSE) {
 263         crm_warn("Discarding %s from node %s (expected from %s)",
 264                  op, welcome_from, fsa_our_dc);
 265         return;
 266     }
 267 
 268     update_dc_expected(input->msg);
 269 
 270     /* send our status section to the DC */
 271     tmp1 = controld_query_executor_state(fsa_our_uname);
 272     if (tmp1 != NULL) {
 273         xmlNode *reply = create_request(CRM_OP_JOIN_CONFIRM, tmp1, fsa_our_dc,
 274                                         CRM_SYSTEM_DC, CRM_SYSTEM_CRMD, NULL);
 275 
 276         crm_xml_add_int(reply, F_CRM_JOIN_ID, join_id);
 277 
 278         crm_debug("Confirming join-%d: sending local operation history to %s",
 279                   join_id, fsa_our_dc);
 280 
 281         /*
 282          * If this is the node's first join since the controller started on it,
 283          * set its initial state (standby or member) according to the user's
 284          * preference.
 285          *
 286          * We do not clear the LRM history here. Even if the DC failed to do it
 287          * when we last left, removing them here creates a race condition if the
 288          * controller is being recovered. Instead of a list of active resources
 289          * from the executor, we may end up with a blank status section. If we
 290          * are _NOT_ lucky, we will probe for the "wrong" instance of anonymous
 291          * clones and end up with multiple active instances on the machine.
 292          */
 293         if (first_join && !pcmk_is_set(fsa_input_register, R_SHUTDOWN)) {
 294             first_join = FALSE;
 295             if (start_state) {
 296                 set_join_state(start_state);
 297             }
 298         }
 299 
 300         send_cluster_message(crm_get_peer(0, fsa_our_dc), crm_msg_crmd, reply, TRUE);
 301         free_xml(reply);
 302 
 303         if (AM_I_DC == FALSE) {
 304             register_fsa_input_adv(cause, I_NOT_DC, NULL, A_NOTHING, TRUE,
 305                                    __func__);
 306         }
 307 
 308         free_xml(tmp1);
 309 
 310     } else {
 311         crm_err("Could not confirm join-%d with %s: Local operation history failed",
 312                 join_id, fsa_our_dc);
 313         register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
 314     }
 315 }

/* [previous][next][first][last][top][bottom][index][help] */