root/daemons/controld/controld_join_client.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. update_dc_expected
  2. do_cl_join_query
  3. do_cl_join_announce
  4. do_cl_join_offer_respond
  5. join_query_callback
  6. set_join_state
  7. update_conn_host_cache
  8. do_cl_join_finalize_respond

   1 /*
   2  * Copyright 2004-2024 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <crm/crm.h>
  13 #include <crm/cib.h>
  14 #include <crm/common/xml.h>
  15 
  16 #include <pacemaker-controld.h>
  17 
  18 void join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data);
  19 
  20 extern ha_msg_input_t *copy_ha_msg_input(ha_msg_input_t * orig);
  21 
  22 /*!
  23  * \internal
  24  * \brief Remember if DC is shutting down as we join
  25  *
  26  * If we're joining while the current DC is shutting down, update its expected
  27  * state, so we don't fence it if we become the new DC. (We weren't a peer
  28  * when it broadcast its shutdown request.)
  29  *
  30  * \param[in] msg  A join message from the DC
  31  */
  32 static void
  33 update_dc_expected(const xmlNode *msg)
     /* [previous][next][first][last][top][bottom][index][help] */
  34 {
  35     if ((controld_globals.dc_name != NULL)
  36         && pcmk__xe_attr_is_true(msg, PCMK__XA_DC_LEAVING)) {
  37         pcmk__node_status_t *dc_node =
  38             pcmk__get_node(0, controld_globals.dc_name, NULL,
  39                            pcmk__node_search_cluster_member);
  40 
  41         pcmk__update_peer_expected(__func__, dc_node, CRMD_JOINSTATE_DOWN);
  42     }
  43 }
  44 
  45 /*      A_CL_JOIN_QUERY         */
  46 /* is there a DC out there? */
  47 void
  48 do_cl_join_query(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
  49                  enum crmd_fsa_cause cause,
  50                  enum crmd_fsa_state cur_state,
  51                  enum crmd_fsa_input current_input, fsa_data_t * msg_data)
  52 {
  53     xmlNode *req = pcmk__new_request(pcmk_ipc_controld, CRM_SYSTEM_CRMD, NULL,
  54                                      CRM_SYSTEM_DC, CRM_OP_JOIN_ANNOUNCE, NULL);
  55 
  56     sleep(1);                   // Give the cluster layer time to propagate to the DC
  57     update_dc(NULL);            /* Unset any existing value so that the result is not discarded */
  58     crm_debug("Querying for a DC");
  59     pcmk__cluster_send_message(NULL, pcmk_ipc_controld, req);
  60     pcmk__xml_free(req);
  61 }
  62 
  63 /*       A_CL_JOIN_ANNOUNCE     */
  64 
  65 /* this is kind of a workaround for the fact that we may not be around or
  66  * are otherwise unable to reply when the DC sends out A_DC_JOIN_OFFER_ALL
  67  */
  68 void
  69 do_cl_join_announce(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
  70                     enum crmd_fsa_cause cause,
  71                     enum crmd_fsa_state cur_state,
  72                     enum crmd_fsa_input current_input, fsa_data_t * msg_data)
  73 {
  74     /* don't announce if we're in one of these states */
  75     if (cur_state != S_PENDING) {
  76         crm_warn("Not announcing cluster join because in state %s",
  77                  fsa_state2string(cur_state));
  78         return;
  79     }
  80 
  81     if (!pcmk_is_set(controld_globals.fsa_input_register, R_STARTING)) {
  82         /* send as a broadcast */
  83         xmlNode *req = pcmk__new_request(pcmk_ipc_controld, CRM_SYSTEM_CRMD,
  84                                          NULL, CRM_SYSTEM_DC,
  85                                          CRM_OP_JOIN_ANNOUNCE, NULL);
  86 
  87         crm_debug("Announcing availability");
  88         update_dc(NULL);
  89         pcmk__cluster_send_message(NULL, pcmk_ipc_controld, req);
  90         pcmk__xml_free(req);
  91 
  92     } else {
  93         /* Delay announce until we have finished local startup */
  94         crm_warn("Delaying announce of cluster join until local startup is complete");
  95         return;
  96     }
  97 }
  98 
  99 static int query_call_id = 0;
 100 
 101 /*       A_CL_JOIN_REQUEST      */
 102 /* aka. accept the welcome offer */
 103 void
 104 do_cl_join_offer_respond(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
 105                          enum crmd_fsa_cause cause,
 106                          enum crmd_fsa_state cur_state,
 107                          enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 108 {
 109     cib_t *cib_conn = controld_globals.cib_conn;
 110 
 111     ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);
 112     const char *welcome_from;
 113     const char *join_id;
 114 
 115     CRM_CHECK(input != NULL, return);
 116 
 117     welcome_from = crm_element_value(input->msg, PCMK__XA_SRC);
 118     join_id = crm_element_value(input->msg, PCMK__XA_JOIN_ID);
 119     crm_trace("Accepting cluster join offer from node %s " QB_XS " join-%s",
 120               welcome_from, crm_element_value(input->msg, PCMK__XA_JOIN_ID));
 121 
 122     /* we only ever want the last one */
 123     if (query_call_id > 0) {
 124         crm_trace("Cancelling previous join query: %d", query_call_id);
 125         remove_cib_op_callback(query_call_id, FALSE);
 126         query_call_id = 0;
 127     }
 128 
 129     if (update_dc(input->msg) == FALSE) {
 130         crm_warn("Discarding cluster join offer from node %s (expected %s)",
 131                  welcome_from, controld_globals.dc_name);
 132         return;
 133     }
 134 
 135     update_dc_expected(input->msg);
 136 
 137     query_call_id = cib_conn->cmds->query(cib_conn, NULL, NULL,
 138                                           cib_no_children);
 139     fsa_register_cib_callback(query_call_id, pcmk__str_copy(join_id),
 140                               join_query_callback);
 141     crm_trace("Registered join query callback: %d", query_call_id);
 142 
 143     controld_set_fsa_action_flags(A_DC_TIMER_STOP);
 144     controld_trigger_fsa();
 145 }
 146 
 147 void
 148 join_query_callback(xmlNode * msg, int call_id, int rc, xmlNode * output, void *user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 149 {
 150     char *join_id = user_data;
 151     xmlNode *generation = pcmk__xe_create(NULL, PCMK__XE_GENERATION_TUPLE);
 152 
 153     CRM_LOG_ASSERT(join_id != NULL);
 154 
 155     if (query_call_id != call_id) {
 156         crm_trace("Query %d superseded", call_id);
 157         goto done;
 158     }
 159 
 160     query_call_id = 0;
 161     if(rc != pcmk_ok || output == NULL) {
 162         crm_err("Could not retrieve version details for join-%s: %s (%d)",
 163                 join_id, pcmk_strerror(rc), rc);
 164         register_fsa_error_adv(C_FSA_INTERNAL, I_ERROR, NULL, NULL, __func__);
 165 
 166     } else if (controld_globals.dc_name == NULL) {
 167         crm_debug("Membership is in flux, not continuing join-%s", join_id);
 168 
 169     } else {
 170         xmlNode *join_request = NULL;
 171         const pcmk__node_status_t *dc_node =
 172             pcmk__get_node(0, controld_globals.dc_name, NULL,
 173                            pcmk__node_search_cluster_member);
 174 
 175         crm_debug("Respond to join offer join-%s from %s",
 176                   join_id, controld_globals.dc_name);
 177         pcmk__xe_copy_attrs(generation, output, pcmk__xaf_none);
 178 
 179         join_request = pcmk__new_request(pcmk_ipc_controld, CRM_SYSTEM_CRMD,
 180                                          controld_globals.dc_name,
 181                                          CRM_SYSTEM_DC, CRM_OP_JOIN_REQUEST,
 182                                          generation);
 183 
 184         crm_xml_add(join_request, PCMK__XA_JOIN_ID, join_id);
 185         crm_xml_add(join_request, PCMK_XA_CRM_FEATURE_SET, CRM_FEATURE_SET);
 186         pcmk__cluster_send_message(dc_node, pcmk_ipc_controld, join_request);
 187         pcmk__xml_free(join_request);
 188     }
 189 
 190   done:
 191     pcmk__xml_free(generation);
 192 }
 193 
 194 void
 195 set_join_state(const char *start_state, const char *node_name, const char *node_uuid,
     /* [previous][next][first][last][top][bottom][index][help] */
 196                bool remote)
 197 {
 198     if (pcmk__str_eq(start_state, PCMK_VALUE_STANDBY, pcmk__str_casei)) {
 199         crm_notice("Forcing node %s to join in %s state per configured "
 200                    "environment", node_name, start_state);
 201         cib__update_node_attr(controld_globals.logger_out,
 202                               controld_globals.cib_conn, cib_sync_call,
 203                               PCMK_XE_NODES, node_uuid,
 204                               NULL, NULL, NULL, PCMK_NODE_ATTR_STANDBY,
 205                               PCMK_VALUE_TRUE, NULL,
 206                               (remote? PCMK_VALUE_REMOTE : NULL));
 207 
 208     } else if (pcmk__str_eq(start_state, PCMK_VALUE_ONLINE, pcmk__str_casei)) {
 209         crm_notice("Forcing node %s to join in %s state per configured "
 210                    "environment", node_name, start_state);
 211         cib__update_node_attr(controld_globals.logger_out,
 212                               controld_globals.cib_conn, cib_sync_call,
 213                               PCMK_XE_NODES, node_uuid,
 214                               NULL, NULL, NULL, PCMK_NODE_ATTR_STANDBY,
 215                               PCMK_VALUE_FALSE, NULL,
 216                               (remote? PCMK_VALUE_REMOTE : NULL));
 217 
 218     } else if (pcmk__str_eq(start_state, PCMK_VALUE_DEFAULT, pcmk__str_casei)) {
 219         crm_debug("Not forcing a starting state on node %s", node_name);
 220 
 221     } else {
 222         crm_warn("Unrecognized start state '%s', using "
 223                  "'" PCMK_VALUE_DEFAULT "' (%s)",
 224                  start_state, node_name);
 225     }
 226 }
 227 
 228 static int
 229 update_conn_host_cache(xmlNode *node, void *userdata)
     /* [previous][next][first][last][top][bottom][index][help] */
 230 {
 231     const char *remote = crm_element_value(node, PCMK_XA_ID);
 232     const char *conn_host = crm_element_value(node, PCMK__XA_CONNECTION_HOST);
 233     const char *state = crm_element_value(node, PCMK__XA_NODE_STATE);
 234 
 235     pcmk__node_status_t *remote_peer =
 236         pcmk__cluster_lookup_remote_node(remote);
 237 
 238     if (remote_peer == NULL) {
 239         return pcmk_rc_ok;
 240     }
 241 
 242     if (conn_host != NULL) {
 243         pcmk__str_update(&remote_peer->conn_host, conn_host);
 244     }
 245 
 246     if (state != NULL) {
 247         pcmk__update_peer_state(__func__, remote_peer, state, 0);
 248     }
 249 
 250     return pcmk_rc_ok;
 251 }
 252 
 253 /*      A_CL_JOIN_RESULT        */
 254 /* aka. this is notification that we have (or have not) been accepted */
 255 void
 256 do_cl_join_finalize_respond(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
 257                             enum crmd_fsa_cause cause,
 258                             enum crmd_fsa_state cur_state,
 259                             enum crmd_fsa_input current_input, fsa_data_t * msg_data)
 260 {
 261     xmlNode *tmp1 = NULL;
 262     gboolean was_nack = TRUE;
 263     static gboolean first_join = TRUE;
 264     ha_msg_input_t *input = fsa_typed_data(fsa_dt_ha_msg);
 265     const char *start_state = pcmk__env_option(PCMK__ENV_NODE_START_STATE);
 266 
 267     int join_id = -1;
 268     const char *op = crm_element_value(input->msg, PCMK__XA_CRM_TASK);
 269     const char *welcome_from = crm_element_value(input->msg, PCMK__XA_SRC);
 270 
 271     if (!pcmk__str_eq(op, CRM_OP_JOIN_ACKNAK, pcmk__str_casei)) {
 272         crm_trace("Ignoring op=%s message", op);
 273         return;
 274     }
 275 
 276     /* calculate if it was an ack or a nack */
 277     if (pcmk__xe_attr_is_true(input->msg, CRM_OP_JOIN_ACKNAK)) {
 278         was_nack = FALSE;
 279     }
 280 
 281     crm_element_value_int(input->msg, PCMK__XA_JOIN_ID, &join_id);
 282 
 283     if (was_nack) {
 284         crm_err("Shutting down because cluster join with leader %s failed "
 285                 QB_XS " join-%d NACK'd", welcome_from, join_id);
 286         register_fsa_error(C_FSA_INTERNAL, I_ERROR, NULL);
 287         controld_set_fsa_input_flags(R_STAYDOWN);
 288         return;
 289     }
 290 
 291     if (!AM_I_DC && controld_is_local_node(welcome_from)) {
 292         crm_warn("Discarding our own welcome - we're no longer the DC");
 293         return;
 294     }
 295 
 296     if (update_dc(input->msg) == FALSE) {
 297         crm_warn("Discarding %s from node %s (expected from %s)",
 298                  op, welcome_from, controld_globals.dc_name);
 299         return;
 300     }
 301 
 302     update_dc_expected(input->msg);
 303 
 304     /* record the node's feature set as a transient attribute */
 305     update_attrd(controld_globals.cluster->priv->node_name,
 306                  CRM_ATTR_FEATURE_SET, CRM_FEATURE_SET, NULL, FALSE);
 307 
 308     /* send our status section to the DC */
 309     tmp1 = controld_query_executor_state();
 310     if (tmp1 != NULL) {
 311         xmlNode *remotes = NULL;
 312         xmlNode *join_confirm = pcmk__new_request(pcmk_ipc_controld,
 313                                                   CRM_SYSTEM_CRMD,
 314                                                   controld_globals.dc_name,
 315                                                   CRM_SYSTEM_DC,
 316                                                   CRM_OP_JOIN_CONFIRM, tmp1);
 317         const pcmk__node_status_t *dc_node =
 318             pcmk__get_node(0, controld_globals.dc_name, NULL,
 319                            pcmk__node_search_cluster_member);
 320 
 321         crm_xml_add_int(join_confirm, PCMK__XA_JOIN_ID, join_id);
 322 
 323         crm_debug("Confirming join-%d: sending local operation history to %s",
 324                   join_id, controld_globals.dc_name);
 325 
 326         /*
 327          * If this is the node's first join since the controller started on it,
 328          * set its initial state (standby or member) according to the user's
 329          * preference.
 330          *
 331          * We do not clear the LRM history here. Even if the DC failed to do it
 332          * when we last left, removing them here creates a race condition if the
 333          * controller is being recovered. Instead of a list of active resources
 334          * from the executor, we may end up with a blank status section. If we
 335          * are _NOT_ lucky, we will probe for the "wrong" instance of anonymous
 336          * clones and end up with multiple active instances on the machine.
 337          */
 338         if (first_join
 339             && !pcmk_is_set(controld_globals.fsa_input_register, R_SHUTDOWN)) {
 340 
 341             first_join = FALSE;
 342             if (start_state) {
 343                 set_join_state(start_state,
 344                                controld_globals.cluster->priv->node_name,
 345                                controld_globals.our_uuid, false);
 346             }
 347         }
 348 
 349         pcmk__cluster_send_message(dc_node, pcmk_ipc_controld, join_confirm);
 350         pcmk__xml_free(join_confirm);
 351 
 352         if (AM_I_DC == FALSE) {
 353             register_fsa_input_adv(cause, I_NOT_DC, NULL, A_NOTHING, TRUE,
 354                                    __func__);
 355         }
 356 
 357         pcmk__xml_free(tmp1);
 358 
 359         /* Update the remote node cache with information about which node
 360          * is hosting the connection.
 361          */
 362         remotes = pcmk__xe_first_child(input->msg, PCMK_XE_NODES, NULL, NULL);
 363         if (remotes != NULL) {
 364             pcmk__xe_foreach_child(remotes, PCMK_XE_NODE,
 365                                    update_conn_host_cache, NULL);
 366         }
 367 
 368     } else {
 369         crm_err("Could not confirm join-%d with %s: Local operation history "
 370                 "failed", join_id, controld_globals.dc_name);
 371         register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
 372     }
 373 }

/* [previous][next][first][last][top][bottom][index][help] */