root/daemons/controld/controld_callbacks.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. crmd_ha_msg_filter
  2. node_alive
  3. peer_update_callback
  4. crmd_cib_connection_destroy
  5. crm_fsa_trigger

   1 /*
   2  * Copyright 2004-2020 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sys/param.h>
  13 #include <string.h>
  14 
  15 #include <crm/crm.h>
  16 #include <crm/msg_xml.h>
  17 #include <crm/common/xml.h>
  18 #include <crm/cluster.h>
  19 #include <crm/cib.h>
  20 
  21 #include <pacemaker-controld.h>
  22 
  23 /* From join_dc... */
  24 extern gboolean check_join_state(enum crmd_fsa_state cur_state, const char *source);
  25 
  26 void
  27 crmd_ha_msg_filter(xmlNode * msg)
     /* [previous][next][first][last][top][bottom][index][help] */
  28 {
  29     if (AM_I_DC) {
  30         const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM);
  31 
  32         if (pcmk__str_eq(sys_from, CRM_SYSTEM_DC, pcmk__str_casei)) {
  33             const char *from = crm_element_value(msg, F_ORIG);
  34 
  35             if (!pcmk__str_eq(from, fsa_our_uname, pcmk__str_casei)) {
  36                 int level = LOG_INFO;
  37                 const char *op = crm_element_value(msg, F_CRM_TASK);
  38 
  39                 /* make sure the election happens NOW */
  40                 if (fsa_state != S_ELECTION) {
  41                     ha_msg_input_t new_input;
  42 
  43                     level = LOG_WARNING;
  44                     new_input.msg = msg;
  45                     register_fsa_error_adv(C_FSA_INTERNAL, I_ELECTION, NULL, &new_input,
  46                                            __func__);
  47                 }
  48 
  49                 do_crm_log(level, "Another DC detected: %s (op=%s)", from, op);
  50                 goto done;
  51             }
  52         }
  53 
  54     } else {
  55         const char *sys_to = crm_element_value(msg, F_CRM_SYS_TO);
  56 
  57         if (pcmk__str_eq(sys_to, CRM_SYSTEM_DC, pcmk__str_casei)) {
  58             return;
  59         }
  60     }
  61 
  62     /* crm_log_xml_trace("HA[inbound]", msg); */
  63     route_message(C_HA_MESSAGE, msg);
  64 
  65   done:
  66     trigger_fsa();
  67 }
  68 
  69 /*!
  70  * \internal
  71  * \brief Check whether a node is online
  72  *
  73  * \param[in] node  Node to check
  74  *
  75  * \retval -1 if completely dead
  76  * \retval  0 if partially alive
  77  * \retval  1 if completely alive
  78  */
  79 static int
  80 node_alive(const crm_node_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
  81 {
  82     if (pcmk_is_set(node->flags, crm_remote_node)) {
  83         // Pacemaker Remote nodes can't be partially alive
  84         return pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei) ? 1: -1;
  85 
  86     } else if (crm_is_peer_active(node)) {
  87         // Completely up cluster node: both cluster member and peer
  88         return 1;
  89 
  90     } else if (!pcmk_is_set(node->processes, crm_get_cluster_proc())
  91                && !pcmk__str_eq(node->state, CRM_NODE_MEMBER, pcmk__str_casei)) {
  92         // Completely down cluster node: neither cluster member nor peer
  93         return -1;
  94     }
  95 
  96     // Partially up cluster node: only cluster member or only peer
  97     return 0;
  98 }
  99 
 100 #define state_text(state) ((state)? (const char *)(state) : "in unknown state")
 101 
 102 void
 103 peer_update_callback(enum crm_status_type type, crm_node_t * node, const void *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 104 {
 105     uint32_t old = 0;
 106     bool appeared = FALSE;
 107     bool is_remote = pcmk_is_set(node->flags, crm_remote_node);
 108 
 109     /* The controller waits to receive some information from the membership
 110      * layer before declaring itself operational. If this is being called for a
 111      * cluster node, indicate that we have it.
 112      */
 113     if (!is_remote) {
 114         controld_set_fsa_input_flags(R_PEER_DATA);
 115     }
 116 
 117     if (node->uname == NULL) {
 118         return;
 119     }
 120 
 121     switch (type) {
 122         case crm_status_uname:
 123             /* If we've never seen the node, then it also won't be in the status section */
 124             crm_info("%s node %s is now %s",
 125                      (is_remote? "Remote" : "Cluster"),
 126                      node->uname, state_text(node->state));
 127             return;
 128 
 129         case crm_status_nstate:
 130             /* This callback should not be called unless the state actually
 131              * changed, but here's a failsafe just in case.
 132              */
 133             CRM_CHECK(!pcmk__str_eq(data, node->state, pcmk__str_casei),
 134                       return);
 135 
 136             crm_info("%s node %s is now %s (was %s)",
 137                      (is_remote? "Remote" : "Cluster"),
 138                      node->uname, state_text(node->state), state_text(data));
 139 
 140             if (pcmk__str_eq(CRM_NODE_MEMBER, node->state, pcmk__str_casei)) {
 141                 appeared = TRUE;
 142                 if (!is_remote) {
 143                     remove_stonith_cleanup(node->uname);
 144                 }
 145             } else {
 146                 controld_remove_voter(node->uname);
 147             }
 148 
 149             crmd_alert_node_event(node);
 150             break;
 151 
 152         case crm_status_processes:
 153             CRM_CHECK(data != NULL, return);
 154             old = *(const uint32_t *)data;
 155             appeared = pcmk_is_set(node->processes, crm_get_cluster_proc());
 156 
 157             crm_info("Node %s is %s a peer " CRM_XS " DC=%s old=0x%07x new=0x%07x",
 158                      node->uname, (appeared? "now" : "no longer"),
 159                      (AM_I_DC? "true" : (fsa_our_dc? fsa_our_dc : "<none>")),
 160                      old, node->processes);
 161 
 162             if (!pcmk_is_set((node->processes ^ old), crm_get_cluster_proc())) {
 163                 /* Peer status did not change. This should not be possible,
 164                  * since we don't track process flags other than peer status.
 165                  */
 166                 crm_trace("Process flag 0x%7x did not change from 0x%7x to 0x%7x",
 167                           crm_get_cluster_proc(), old, node->processes);
 168                 return;
 169 
 170             }
 171 
 172             if (!appeared) {
 173                 controld_remove_voter(node->uname);
 174             } else if (!AM_I_DC && !is_remote) {
 175                 /*
 176                  * This is a hack until we can send to a nodeid and/or we fix node name lookups
 177                  * These messages are ignored in crmd_ha_msg_filter()
 178                  */
 179                 xmlNode *query = create_request(CRM_OP_HELLO, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
 180 
 181                 crm_debug("Broadcasting our uname because of node %u", node->id);
 182                 send_cluster_message(node, crm_msg_crmd, query, FALSE);
 183 
 184                 free_xml(query);
 185             }
 186 
 187             if (!pcmk_is_set(fsa_input_register, R_CIB_CONNECTED)) {
 188                 crm_trace("Ignoring peer status change because not connected to CIB");
 189                 return;
 190 
 191             } else if (fsa_state == S_STOPPING) {
 192                 crm_trace("Ignoring peer status change because stopping");
 193                 return;
 194             }
 195 
 196             if (pcmk__str_eq(node->uname, fsa_our_uname, pcmk__str_casei) && !appeared) {
 197                 /* Did we get evicted? */
 198                 crm_notice("Our peer connection failed");
 199                 register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ERROR, NULL);
 200 
 201             } else if (pcmk__str_eq(node->uname, fsa_our_dc, pcmk__str_casei) && crm_is_peer_active(node) == FALSE) {
 202                 /* Did the DC leave us? */
 203                 crm_notice("Our peer on the DC (%s) is dead", fsa_our_dc);
 204                 register_fsa_input(C_CRMD_STATUS_CALLBACK, I_ELECTION, NULL);
 205 
 206                 /* @COMPAT DC < 1.1.13: If a DC shuts down normally, we don't
 207                  * want to fence it. Newer DCs will send their shutdown request
 208                  * to all peers, who will update the DC's expected state to
 209                  * down, thus avoiding fencing. We can safely erase the DC's
 210                  * transient attributes when it leaves in that case. However,
 211                  * the only way to avoid fencing older DCs is to leave the
 212                  * transient attributes intact until it rejoins.
 213                  */
 214                 if (compare_version(fsa_our_dc_version, "3.0.9") > 0) {
 215                     controld_delete_node_state(node->uname,
 216                                                controld_section_attrs,
 217                                                cib_scope_local);
 218                 }
 219 
 220             } else if (AM_I_DC || (fsa_our_dc == NULL)) {
 221                 /* This only needs to be done once, so normally the DC should do
 222                  * it. However if there is no DC, every node must do it, since
 223                  * there is no other way to ensure some one node does it.
 224                  */
 225                 if (appeared) {
 226                     te_trigger_stonith_history_sync(FALSE);
 227                 } else {
 228                     controld_delete_node_state(node->uname,
 229                                                controld_section_attrs,
 230                                                cib_scope_local);
 231                 }
 232             }
 233             break;
 234     }
 235 
 236     if (AM_I_DC) {
 237         xmlNode *update = NULL;
 238         int flags = node_update_peer;
 239         int alive = node_alive(node);
 240         crm_action_t *down = match_down_event(node->uuid);
 241 
 242         crm_trace("Alive=%d, appeared=%d, down=%d",
 243                   alive, appeared, (down? down->id : -1));
 244 
 245         if (appeared && (alive > 0) && !is_remote) {
 246             register_fsa_input_before(C_FSA_INTERNAL, I_NODE_JOIN, NULL);
 247         }
 248 
 249         if (down) {
 250             const char *task = crm_element_value(down->xml, XML_LRM_ATTR_TASK);
 251 
 252             if (pcmk__str_eq(task, CRM_OP_FENCE, pcmk__str_casei)) {
 253 
 254                 /* tengine_stonith_callback() confirms fence actions */
 255                 crm_trace("Updating CIB %s fencer reported fencing of %s complete",
 256                           (down->confirmed? "after" : "before"), node->uname);
 257 
 258             } else if (!appeared && pcmk__str_eq(task, CRM_OP_SHUTDOWN, pcmk__str_casei)) {
 259 
 260                 // Shutdown actions are immediately confirmed (i.e. no_wait)
 261                 if (!is_remote) {
 262                     flags |= node_update_join | node_update_expected;
 263                     crmd_peer_down(node, FALSE);
 264                     check_join_state(fsa_state, __func__);
 265                 }
 266                 if (alive >= 0) {
 267                     crm_info("%s of peer %s is in progress " CRM_XS " action=%d",
 268                              task, node->uname, down->id);
 269                 } else {
 270                     crm_notice("%s of peer %s is complete " CRM_XS " action=%d",
 271                                task, node->uname, down->id);
 272                     update_graph(transition_graph, down);
 273                     trigger_graph();
 274                 }
 275 
 276             } else {
 277                 crm_trace("Node %s is %s, was expected to %s (op %d)",
 278                           node->uname,
 279                           ((alive > 0)? "alive" :
 280                            ((alive < 0)? "dead" : "partially alive")),
 281                           task, down->id);
 282             }
 283 
 284         } else if (appeared == FALSE) {
 285             crm_warn("Stonith/shutdown of node %s was not expected",
 286                      node->uname);
 287             if (!is_remote) {
 288                 crm_update_peer_join(__func__, node, crm_join_none);
 289                 check_join_state(fsa_state, __func__);
 290             }
 291             abort_transition(INFINITY, tg_restart, "Node failure", NULL);
 292             fail_incompletable_actions(transition_graph, node->uuid);
 293 
 294         } else {
 295             crm_trace("Node %s came up, was not expected to be down",
 296                       node->uname);
 297         }
 298 
 299         if (is_remote) {
 300             /* A pacemaker_remote node won't have its cluster status updated
 301              * in the CIB by membership-layer callbacks, so do it here.
 302              */
 303             flags |= node_update_cluster;
 304 
 305             /* Trigger resource placement on newly integrated nodes */
 306             if (appeared) {
 307                 abort_transition(INFINITY, tg_restart,
 308                                  "pacemaker_remote node integrated", NULL);
 309             }
 310         }
 311 
 312         /* Update the CIB node state */
 313         update = create_node_state_update(node, flags, NULL, __func__);
 314         if (update == NULL) {
 315             crm_debug("Node state update not yet possible for %s", node->uname);
 316         } else {
 317             fsa_cib_anon_update(XML_CIB_TAG_STATUS, update);
 318         }
 319         free_xml(update);
 320     }
 321 
 322     trigger_fsa();
 323 }
 324 
 325 void
 326 crmd_cib_connection_destroy(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 327 {
 328     CRM_CHECK(user_data == fsa_cib_conn,;);
 329 
 330     crm_trace("Invoked");
 331     trigger_fsa();
 332     fsa_cib_conn->state = cib_disconnected;
 333 
 334     if (!pcmk_is_set(fsa_input_register, R_CIB_CONNECTED)) {
 335         crm_info("Connection to the CIB manager terminated");
 336         return;
 337     }
 338 
 339     // @TODO This should trigger a reconnect, not a shutdown
 340     crm_crit("Lost connection to the CIB manager, shutting down");
 341     register_fsa_input(C_FSA_INTERNAL, I_ERROR, NULL);
 342     controld_clear_fsa_input_flags(R_CIB_CONNECTED);
 343 
 344     return;
 345 }
 346 
 347 gboolean
 348 crm_fsa_trigger(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 349 {
 350     crm_trace("Invoked (queue len: %d)", g_list_length(fsa_message_queue));
 351     s_crmd_fsa(C_FSA_INTERNAL);
 352     crm_trace("Exited  (queue len: %d)", g_list_length(fsa_message_queue));
 353     return TRUE;
 354 }

/* [previous][next][first][last][top][bottom][index][help] */