root/crmd/heartbeat.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. ccm_connection_destroy
  2. do_ccm_control
  3. ccm_event_detail
  4. do_ccm_update_cache
  5. ccm_dispatch
  6. crmd_ccm_msg_callback
  7. crmd_ha_status_callback
  8. crmd_client_status_callback
  9. crmd_ha_msg_callback
  10. crmd_ha_msg_dispatch

   1 /* 
   2  * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
   3  * 
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  * 
   9  * This software is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  * 
  14  * You should have received a copy of the GNU General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 
  19 /* put these first so that uuid_t is defined without conflicts */
  20 #include <crm_internal.h>
  21 
  22 #include <string.h>
  23 
  24 #include <crm/crm.h>
  25 #include <crm/cib.h>
  26 #include <crm/msg_xml.h>
  27 #include <crm/common/xml.h>
  28 #include <crm/cluster.h>
  29 #include <crmd_messages.h>
  30 #include <crmd_fsa.h>
  31 #include <fsa_proto.h>
  32 #include <crmd_callbacks.h>
  33 #include <tengine.h>
  34 #include <membership.h>
  35 
  36 #include <ocf/oc_event.h>
  37 #include <ocf/oc_membership.h>
  38 
     /*
      * Forward declarations.  oc_ev_special is never called directly in this
      * file; do_ccm_control() looks it up by name with find_library_function().
      * The remaining prototypes are for functions defined further down.
      */
  39 void oc_ev_special(const oc_ev_t *, oc_ev_class_t, int);
  40 void ccm_event_detail(const oc_ev_membership_t * oc, oc_ed_t event);
  41 gboolean crmd_ha_msg_dispatch(ll_cluster_t * cluster_conn, gpointer user_data);
  42 void crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data);
  43 int ccm_dispatch(gpointer user_data);
  44 
     /*
      * CCM_EVENT_DETAIL is not referenced by any code visible in this listing.
      * CCM_EVENT_DETAIL_PARTIAL, when non-zero, compiles out the per-member
      * "CURRENT" lines and the "are we a member" check in ccm_event_detail().
      */
  45 #define CCM_EVENT_DETAIL 0
  46 #define CCM_EVENT_DETAIL_PARTIAL 0
  47 
     /*
      * CCM library entry points that are bound lazily: each starts out NULL and
      * is resolved with find_library_function() the first time it is needed
      * (crmd_ccm_msg_callback() and ccm_dispatch() respectively).
      */
  48 int (*ccm_api_callback_done) (void *cookie) = NULL;
  49 int (*ccm_api_handle_event) (const oc_ev_t * token) = NULL;
  50 
     /* Token filled in by the CCM register call in do_ccm_control() and passed
      * to every later CCM call made from there. */
  51 static oc_ev_t *fsa_ev_token;
     /* Library handle handed (by address) to find_library_function(). */
  52 static void *ccm_library = NULL;
     /* Failed connection attempts so far, and the count at which
      * do_ccm_control() stops retrying and raises I_FAIL instead. */
  53 static int num_ccm_register_fails = 0;
  54 static int max_ccm_register_fails = 30;
  55 
  /*
   * Destroy hook for the CCM file descriptor; do_ccm_control() installs it
   * through its ccm_fd_callbacks table.  The handler performs no work and
   * ignores its argument.
   */
  static void
  ccm_connection_destroy(void *userdata)
  {
      return;
  }
  60 
  61 /*       A_CCM_CONNECT  */
  62 void
  63 do_ccm_control(long long action,
     /* [previous][next][first][last][top][bottom][index][help] */
  64                enum crmd_fsa_cause cause,
  65                enum crmd_fsa_state cur_state,
  66                enum crmd_fsa_input current_input, fsa_data_t * msg_data)
  67 {
  68     static struct mainloop_fd_callbacks ccm_fd_callbacks = {
  69         .dispatch = ccm_dispatch,
  70         .destroy = ccm_connection_destroy,
  71     };
  72 
  73     if (is_heartbeat_cluster()) {
  74         int (*ccm_api_register) (oc_ev_t ** token) =
  75             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_register", 1);
  76 
  77         int (*ccm_api_set_callback) (const oc_ev_t * token,
  78                                      oc_ev_class_t class,
  79                                      oc_ev_callback_t * fn,
  80                                      oc_ev_callback_t ** prev_fn) =
  81             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_set_callback", 1);
  82 
  83         void (*ccm_api_special) (const oc_ev_t *, oc_ev_class_t, int) =
  84             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_special", 1);
  85         int (*ccm_api_activate) (const oc_ev_t * token, int *fd) =
  86             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_activate", 1);
  87         int (*ccm_api_unregister) (oc_ev_t * token) =
  88             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_unregister", 1);
  89 
  90         if (action & A_CCM_DISCONNECT) {
  91             set_bit(fsa_input_register, R_CCM_DISCONNECTED);
  92             (*ccm_api_unregister) (fsa_ev_token);
  93         }
  94 
  95         if (action & A_CCM_CONNECT) {
  96             int ret;
  97             int fsa_ev_fd;
  98             gboolean did_fail = FALSE;
  99 
 100             crm_trace("Registering with CCM");
 101             clear_bit(fsa_input_register, R_CCM_DISCONNECTED);
 102             ret = (*ccm_api_register) (&fsa_ev_token);
 103             if (ret != 0) {
 104                 crm_warn("CCM registration failed");
 105                 did_fail = TRUE;
 106             }
 107 
 108             if (did_fail == FALSE) {
 109                 crm_trace("Setting up CCM callbacks");
 110                 ret = (*ccm_api_set_callback) (fsa_ev_token, OC_EV_MEMB_CLASS,
 111                                                crmd_ccm_msg_callback, NULL);
 112                 if (ret != 0) {
 113                     crm_warn("CCM callback not set");
 114                     did_fail = TRUE;
 115                 }
 116             }
 117             if (did_fail == FALSE) {
 118                 (*ccm_api_special) (fsa_ev_token, OC_EV_MEMB_CLASS, 0 /*don't care */ );
 119 
 120                 crm_trace("Activating CCM token");
 121                 ret = (*ccm_api_activate) (fsa_ev_token, &fsa_ev_fd);
 122                 if (ret != 0) {
 123                     crm_warn("CCM Activation failed");
 124                     did_fail = TRUE;
 125                 }
 126             }
 127 
 128             if (did_fail) {
 129                 num_ccm_register_fails++;
 130                 (*ccm_api_unregister) (fsa_ev_token);
 131 
 132                 if (num_ccm_register_fails < max_ccm_register_fails) {
 133                     crm_warn("CCM Connection failed"
 134                              " %d times (%d max)", num_ccm_register_fails, max_ccm_register_fails);
 135 
 136                     crm_timer_start(wait_timer);
 137                     crmd_fsa_stall(FALSE);
 138                     return;
 139 
 140                 } else {
 141                     crm_err("CCM Activation failed %d (max) times", num_ccm_register_fails);
 142                     register_fsa_error(C_FSA_INTERNAL, I_FAIL, NULL);
 143                     return;
 144                 }
 145             }
 146 
 147             crm_info("CCM connection established... waiting for first callback");
 148             mainloop_add_fd("heartbeat-ccm", G_PRIORITY_HIGH, fsa_ev_fd, fsa_ev_token,
 149                             &ccm_fd_callbacks);
 150 
 151         }
 152     }
 153 
 154     if (action & ~(A_CCM_CONNECT | A_CCM_DISCONNECT)) {
 155         crm_err("Unexpected action %s in %s", fsa_action2string(action), __FUNCTION__);
 156     }
 157 }
 158 
 159 void
 160 ccm_event_detail(const oc_ev_membership_t * oc, oc_ed_t event)
     /* [previous][next][first][last][top][bottom][index][help] */
 161 {
 162     int lpc;
 163     gboolean member = FALSE;
 164 
 165     member = FALSE;
 166 
 167     crm_trace("-----------------------");
 168     crm_info("%s: trans=%d, nodes=%d, new=%d, lost=%d n_idx=%d, "
 169              "new_idx=%d, old_idx=%d",
 170              ccm_event_name(event),
 171              oc->m_instance,
 172              oc->m_n_member, oc->m_n_in, oc->m_n_out, oc->m_memb_idx, oc->m_in_idx, oc->m_out_idx);
 173 
 174 #if !CCM_EVENT_DETAIL_PARTIAL
 175     for (lpc = 0; lpc < oc->m_n_member; lpc++) {
 176         crm_info("\tCURRENT: %s [nodeid=%d, born=%d]",
 177                  oc->m_array[oc->m_memb_idx + lpc].node_uname,
 178                  oc->m_array[oc->m_memb_idx + lpc].node_id,
 179                  oc->m_array[oc->m_memb_idx + lpc].node_born_on);
 180 
 181         if (safe_str_eq(fsa_our_uname, oc->m_array[oc->m_memb_idx + lpc].node_uname)) {
 182             member = TRUE;
 183         }
 184     }
 185     if (member == FALSE) {
 186         crm_warn("MY NODE IS NOT IN CCM THE MEMBERSHIP LIST");
 187     }
 188 #endif
 189     for (lpc = 0; lpc < (int)oc->m_n_in; lpc++) {
 190         crm_info("\tNEW:     %s [nodeid=%d, born=%d]",
 191                  oc->m_array[oc->m_in_idx + lpc].node_uname,
 192                  oc->m_array[oc->m_in_idx + lpc].node_id,
 193                  oc->m_array[oc->m_in_idx + lpc].node_born_on);
 194     }
 195 
 196     for (lpc = 0; lpc < (int)oc->m_n_out; lpc++) {
 197         crm_info("\tLOST:    %s [nodeid=%d, born=%d]",
 198                  oc->m_array[oc->m_out_idx + lpc].node_uname,
 199                  oc->m_array[oc->m_out_idx + lpc].node_id,
 200                  oc->m_array[oc->m_out_idx + lpc].node_born_on);
 201     }
 202 
 203     crm_trace("-----------------------");
 204 
 205 }
 206 
 207 /*       A_CCM_UPDATE_CACHE     */
 208 /*
 209  * Take the opportunity to update the node status in the CIB as well
 210  */
 211 void
 212 do_ccm_update_cache(enum crmd_fsa_cause cause, enum crmd_fsa_state cur_state,
     /* [previous][next][first][last][top][bottom][index][help] */
 213                     oc_ed_t event, const oc_ev_membership_t * oc, xmlNode * xml)
 214 {
 215     unsigned long long instance = 0;
 216     unsigned int lpc = 0;
 217 
 218     if (is_heartbeat_cluster()) {
 219         CRM_ASSERT(oc != NULL);
 220         instance = oc->m_instance;
 221     }
 222 
 223     CRM_ASSERT(crm_peer_seq <= instance);
 224 
 225     switch (cur_state) {
 226         case S_STOPPING:
 227         case S_TERMINATE:
 228         case S_HALT:
 229             crm_debug("Ignoring %s CCM event %llu, we're in state %s",
 230                       ccm_event_name(event), instance, fsa_state2string(cur_state));
 231             return;
 232         case S_ELECTION:
 233             register_fsa_action(A_ELECTION_CHECK);
 234             break;
 235         default:
 236             break;
 237     }
 238 
 239     if (is_heartbeat_cluster()) {
 240         ccm_event_detail(oc, event);
 241 
 242         /*--*-- Recently Dead Member Nodes --*--*/
 243         for (lpc = 0; lpc < oc->m_n_out; lpc++) {
 244             crm_update_ccm_node(oc, lpc + oc->m_out_idx, CRM_NODE_LOST, instance);
 245         }
 246 
 247             /*--*-- All Member Nodes --*--*/
 248         for (lpc = 0; lpc < oc->m_n_member; lpc++) {
 249             crm_update_ccm_node(oc, lpc + oc->m_memb_idx, CRM_NODE_MEMBER, instance);
 250         }
 251         heartbeat_cluster->llc_ops->client_status(heartbeat_cluster, NULL, crm_system_name, 0);
 252     }
 253 
 254     if (event == OC_EV_MS_EVICTED) {
 255         crm_node_t *peer = crm_get_peer(0, fsa_our_uname);
 256 
 257         crm_update_peer_state(__FUNCTION__, peer, CRM_NODE_EVICTED, 0);
 258 
 259         /* todo: drop back to S_PENDING instead */
 260         /* get out... NOW!
 261          *
 262          * go via the error recovery process so that HA will
 263          *    restart us if required
 264          */
 265         register_fsa_error_adv(cause, I_ERROR, NULL, NULL, __FUNCTION__);
 266     }
 267 
 268     post_cache_update(instance);
 269     return;
 270 }
 271 
 272 int
 273 ccm_dispatch(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 274 {
 275     int rc = 0;
 276     oc_ev_t *ccm_token = (oc_ev_t *) user_data;
 277     gboolean was_error = FALSE;
 278 
 279     crm_trace("Invoked");
 280     if (ccm_api_handle_event == NULL) {
 281         ccm_api_handle_event =
 282             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_handle_event", 1);
 283     }
 284     rc = (*ccm_api_handle_event) (ccm_token);
 285 
 286     if (rc != 0) {
 287         if (is_set(fsa_input_register, R_CCM_DISCONNECTED) == FALSE) {
 288             /* we signed out, so this is expected */
 289             register_fsa_input(C_CCM_CALLBACK, I_ERROR, NULL);
 290             crm_err("CCM connection appears to have failed: rc=%d.", rc);
 291         }
 292         was_error = TRUE;
 293     }
 294 
 295     trigger_fsa(fsa_source);
 296     if (was_error) {
 297         return -1;
 298     }
 299 
 300     return 0;
 301 }
 302 
 303 void
 304 crmd_ccm_msg_callback(oc_ed_t event, void *cookie, size_t size, const void *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 305 {
 306     gboolean update_cache = FALSE;
 307     const oc_ev_membership_t *membership = data;
 308 
 309     gboolean update_quorum = FALSE;
 310 
 311     crm_trace("Invoked");
 312     CRM_ASSERT(data != NULL);
 313 
 314     crm_info("Quorum %s after event=%s (id=%d)",
 315              ccm_have_quorum(event) ? "(re)attained" : "lost",
 316              ccm_event_name(event), membership->m_instance);
 317 
 318     if (crm_peer_seq > membership->m_instance) {
 319         crm_err("Membership instance ID went backwards! %llu->%d",
 320                 crm_peer_seq, membership->m_instance);
 321         CRM_ASSERT(crm_peer_seq <= membership->m_instance);
 322         return;
 323     }
 324 
 325     /*
 326      * OC_EV_MS_NEW_MEMBERSHIP:   membership with quorum
 327      * OC_EV_MS_MS_INVALID:       membership without quorum
 328      * OC_EV_MS_NOT_PRIMARY:      previous membership no longer valid
 329      * OC_EV_MS_PRIMARY_RESTORED: previous membership restored
 330      * OC_EV_MS_EVICTED:          the client is evicted from ccm.
 331      */
 332 
 333     switch (event) {
 334         case OC_EV_MS_NEW_MEMBERSHIP:
 335         case OC_EV_MS_INVALID:
 336             update_cache = TRUE;
 337             update_quorum = TRUE;
 338             break;
 339         case OC_EV_MS_NOT_PRIMARY:
 340             break;
 341         case OC_EV_MS_PRIMARY_RESTORED:
 342             update_cache = TRUE;
 343             crm_peer_seq = membership->m_instance;
 344             break;
 345         case OC_EV_MS_EVICTED:
 346             update_quorum = TRUE;
 347             register_fsa_input(C_FSA_INTERNAL, I_STOP, NULL);
 348             crm_err("Shutting down after CCM event: %s", ccm_event_name(event));
 349             break;
 350         default:
 351             crm_err("Unknown CCM event: %d", event);
 352     }
 353 
 354     if (update_quorum) {
 355         crm_have_quorum = ccm_have_quorum(event);
 356         if (crm_have_quorum == FALSE) {
 357             /* did we just lose quorum? */
 358             if (fsa_has_quorum) {
 359                 crm_info("Quorum lost: %s", ccm_event_name(event));
 360             }
 361         }
 362         crm_update_quorum(crm_have_quorum, FALSE);
 363     }
 364 
 365     if (update_cache) {
 366         crm_trace("Updating cache after event %s", ccm_event_name(event));
 367         do_ccm_update_cache(C_CCM_CALLBACK, fsa_state, event, data, NULL);
 368 
 369     } else if (event != OC_EV_MS_NOT_PRIMARY) {
 370         crm_peer_seq = membership->m_instance;
 371         register_fsa_action(A_TE_CANCEL);
 372     }
 373 
 374     if (ccm_api_callback_done == NULL) {
 375         ccm_api_callback_done =
 376             find_library_function(&ccm_library, CCM_LIBRARY, "oc_ev_callback_done", 1);
 377     }
 378     (*ccm_api_callback_done) (cookie);
 379     return;
 380 }
 381 
 382 void
 383 crmd_ha_status_callback(const char *node, const char *status, void *private)
     /* [previous][next][first][last][top][bottom][index][help] */
 384 {
 385     xmlNode *update = NULL;
 386     crm_node_t *peer = NULL;
 387 
 388     crm_notice("Status update: Node %s now has status [%s]", node, status);
 389 
 390     peer = crm_get_peer(0, node);
 391     if (safe_str_eq(status, PINGSTATUS)) {
 392         return;
 393     }
 394 
 395     if (safe_str_eq(status, DEADSTATUS)) {
 396         /* this node is toast */
 397         crm_update_peer_proc(__FUNCTION__, peer, crm_proc_crmd|crm_proc_heartbeat, OFFLINESTATUS);
 398 
 399     } else {
 400         crm_update_peer_proc(__FUNCTION__, peer, crm_proc_heartbeat, ONLINESTATUS);
 401     }
 402 
 403     trigger_fsa(fsa_source);
 404 
 405     if (AM_I_DC) {
 406         update = create_node_state_update(peer, node_update_cluster, NULL,
 407                                           __FUNCTION__);
 408         fsa_cib_anon_update(XML_CIB_TAG_STATUS, update,
 409                             cib_scope_local | cib_quorum_override | cib_can_create);
 410         free_xml(update);
 411     }
 412 }
 413 
 414 void
 415 crmd_client_status_callback(const char *node, const char *client, const char *status, void *private)
     /* [previous][next][first][last][top][bottom][index][help] */
 416 {
 417     crm_node_t *peer = NULL;
 418 
 419     crm_trace("Invoked");
 420     if (safe_str_neq(client, CRM_SYSTEM_CRMD)) {
 421         return;
 422     }
 423 
 424     peer = crm_get_peer(0, node);
 425 
 426     if (safe_str_neq(peer->state, CRM_NODE_MEMBER)) {
 427         crm_warn("This peer is not a ccm member (yet). "
 428             "Status ignored: Client %s/%s announced status [%s] (DC=%s)",
 429             node, client, status, AM_I_DC ? "true" : "false");
 430         return;
 431     }
 432 
 433     set_bit(fsa_input_register, R_PEER_DATA);
 434 
 435     crm_notice("Status update: Client %s/%s now has status [%s] (DC=%s)",
 436                node, client, status, AM_I_DC ? "true" : "false");
 437 
 438     /* rest of the code, especially crm_update_peer_proc,
 439      * does not know about JOINSTATUS, but expects ONLINESTATUS.
 440      * See also cib/callbacks.c */
 441     if (safe_str_eq(status, JOINSTATUS)) {
 442         status = ONLINESTATUS;
 443     }  else if (safe_str_eq(status, LEAVESTATUS)) {
 444         status = OFFLINESTATUS;
 445     }
 446 
 447     if (safe_str_eq(status, ONLINESTATUS)) {
 448         /* remove the cached value in case it changed */
 449         crm_trace("Uncaching UUID for %s", node);
 450         free(peer->uuid);
 451         peer->uuid = NULL;
 452     }
 453 
 454     crm_update_peer_proc(__FUNCTION__, peer, crm_proc_crmd, status);
 455 
 456     if (AM_I_DC) {
 457         xmlNode *update = NULL;
 458 
 459         crm_trace("Got client status callback");
 460         update = create_node_state_update(peer, node_update_peer, NULL,
 461                                           __FUNCTION__);
 462         fsa_cib_anon_update(XML_CIB_TAG_STATUS, update,
 463                             cib_scope_local | cib_quorum_override | cib_can_create);
 464         free_xml(update);
 465     }
 466 }
 467 
 468 void
 469 crmd_ha_msg_callback(HA_Message * hamsg, void *private_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 470 {
 471     int level = LOG_DEBUG;
 472     crm_node_t *from_node = NULL;
 473 
 474     xmlNode *msg = convert_ha_message(NULL, hamsg, __FUNCTION__);
 475     const char *from = crm_element_value(msg, F_ORIG);
 476     const char *op = crm_element_value(msg, F_CRM_TASK);
 477     const char *sys_from = crm_element_value(msg, F_CRM_SYS_FROM);
 478 
 479     CRM_CHECK(from != NULL, crm_log_xml_err(msg, "anon"); goto bail);
 480 
 481     crm_trace("HA[inbound]: %s from %s", op, from);
 482 
 483     if (crm_peer_cache == NULL || crm_active_peers() == 0) {
 484         crm_debug("Ignoring HA messages until we are"
 485                   " connected to the CCM (%s op from %s)", op, from);
 486         crm_log_xml_trace(msg, "HA[inbound]: Ignore (No CCM)");
 487         goto bail;
 488     }
 489 
 490     from_node = crm_get_peer(0, from);
 491     if (crm_is_peer_active(from_node) == FALSE) {
 492         if (safe_str_eq(op, CRM_OP_VOTE)) {
 493             level = LOG_WARNING;
 494 
 495         } else if (AM_I_DC && safe_str_eq(op, CRM_OP_JOIN_ANNOUNCE)) {
 496             level = LOG_WARNING;
 497 
 498         } else if (safe_str_eq(sys_from, CRM_SYSTEM_DC)) {
 499             level = LOG_WARNING;
 500         }
 501         do_crm_log(level,
 502                    "Ignoring HA message (op=%s) from %s: not in our"
 503                    " membership list (size=%d)", op, from, crm_active_peers());
 504 
 505         crm_log_xml_trace(msg, "HA[inbound]: CCM Discard");
 506 
 507     } else {
 508         crmd_ha_msg_filter(msg);
 509     }
 510 
 511   bail:
 512     free_xml(msg);
 513     return;
 514 }
 515 
 516 gboolean
 517 crmd_ha_msg_dispatch(ll_cluster_t * cluster_conn, gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 518 {
 519     IPC_Channel *channel = NULL;
 520     gboolean stay_connected = TRUE;
 521 
 522     crm_trace("Invoked");
 523 
 524     if (cluster_conn != NULL) {
 525         channel = cluster_conn->llc_ops->ipcchan(cluster_conn);
 526     }
 527 
 528     CRM_CHECK(cluster_conn != NULL,;);
 529     CRM_CHECK(channel != NULL,;);
 530 
 531     if (channel != NULL && IPC_ISRCONN(channel)) {
 532         if (cluster_conn->llc_ops->msgready(cluster_conn) == 0) {
 533             crm_trace("no message ready yet");
 534         }
 535         /* invoke the callbacks but don't block */
 536         cluster_conn->llc_ops->rcvmsg(cluster_conn, 0);
 537     }
 538 
 539     if (channel == NULL || channel->ch_status != IPC_CONNECT) {
 540         if (is_set(fsa_input_register, R_HA_DISCONNECTED) == FALSE) {
 541             crm_crit("Lost connection to heartbeat service.");
 542         } else {
 543             crm_info("Lost connection to heartbeat service.");
 544         }
 545         trigger_fsa(fsa_source);
 546         stay_connected = FALSE;
 547     }
 548 
 549     return stay_connected;
 550 }

/* [previous][next][first][last][top][bottom][index][help] */