root/lib/cluster/corosync.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. corosync_node_name
  2. terminate_cs_connection
  3. pcmk_quorum_dispatch
  4. pcmk_quorum_notification
  5. cluster_connect_quorum
  6. init_cs_connection
  7. init_cs_connection_once
  8. check_message_sanity
  9. find_corosync_variant
  10. crm_is_corosync_peer_active
  11. corosync_initialize_nodelist
  12. corosync_cluster_name
  13. corosync_cmap_has_config

   1 /*
   2  * Copyright (C) 2004 Andrew Beekhof <andrew@beekhof.net>
   3  *
   4  * This library is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU Lesser General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2.1 of the License, or (at your option) any later version.
   8  *
   9  * This library is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * Lesser General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU Lesser General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 
  19 #include <crm_internal.h>
  20 #include <bzlib.h>
  21 #include <sys/socket.h>
  22 #include <netinet/in.h>
  23 #include <arpa/inet.h>
  24 #include <netdb.h>
  25 
  26 #include <crm/common/ipc.h>
  27 #include <crm/cluster/internal.h>
  28 #include <crm/common/mainloop.h>
  29 #include <sys/utsname.h>
  30 
  31 #include <qb/qbipcc.h>
  32 #include <qb/qbutil.h>
  33 
  34 #include <corosync/corodefs.h>
  35 #include <corosync/corotypes.h>
  36 #include <corosync/hdb.h>
  37 #include <corosync/cfg.h>
  38 #include <corosync/cmap.h>
  39 #include <corosync/quorum.h>
  40 
  41 #include <crm/msg_xml.h>
  42 
  /* Handle for the Corosync quorum API connection.  It is filled in by
   * quorum_initialize() in cluster_connect_quorum(), and reset to 0 by
   * terminate_cs_connection() and by pcmk_quorum_dispatch() on failure.
   * terminate_cs_connection() treats 0 as "no quorum connection". */
  43 quorum_handle_t pcmk_quorum_handle = 0;
  44 
  /* Optional application callback, installed by cluster_connect_quorum() and
   * invoked at the end of pcmk_quorum_notification() with the ring id and the
   * quorate flag.  NULL means no callback is called. */
  45 gboolean(*quorum_app_callback) (unsigned long long seq, gboolean quorate) = NULL;
  46 
  47 /*
  48  * CFG functionality stolen from node_name() in corosync-quorumtool.c
  49  * This resolves the first address assigned to a node and returns the name or IP address.
  50  */
  51 char *
  52 corosync_node_name(uint64_t /*cmap_handle_t */ cmap_handle, uint32_t nodeid)
     /* [previous][next][first][last][top][bottom][index][help] */
  53 {
  54     int lpc = 0;
  55     int rc = CS_OK;
  56     int retries = 0;
  57     char *name = NULL;
  58     cmap_handle_t local_handle = 0;
  59 
  60     /* nodeid == 0 == CMAN_NODEID_US */
  61     if (nodeid == 0) {
  62         nodeid = get_local_nodeid(0);
  63     }
  64 
  65     if (cmap_handle == 0 && local_handle == 0) {
  66         retries = 0;
  67         crm_trace("Initializing CMAP connection");
  68         do {
  69             rc = cmap_initialize(&local_handle);
  70             if (rc != CS_OK) {
  71                 retries++;
  72                 crm_debug("API connection setup failed: %s.  Retrying in %ds", cs_strerror(rc),
  73                           retries);
  74                 sleep(retries);
  75             }
  76 
  77         } while (retries < 5 && rc != CS_OK);
  78 
  79         if (rc != CS_OK) {
  80             crm_warn("Could not connect to Cluster Configuration Database API, error %s",
  81                      cs_strerror(rc));
  82             local_handle = 0;
  83         }
  84     }
  85 
  86     if (cmap_handle == 0) {
  87         cmap_handle = local_handle;
  88     }
  89 
  90     while (name == NULL && cmap_handle != 0) {
  91         uint32_t id = 0;
  92         char *key = NULL;
  93 
  94         key = crm_strdup_printf("nodelist.node.%d.nodeid", lpc);
  95         rc = cmap_get_uint32(cmap_handle, key, &id);
  96         crm_trace("Checking %u vs %u from %s", nodeid, id, key);
  97         free(key);
  98 
  99         if (rc != CS_OK) {
 100             break;
 101         }
 102 
 103         if (nodeid == id) {
 104             crm_trace("Searching for node name for %u in nodelist.node.%d %s", nodeid, lpc, name);
 105             if (name == NULL) {
 106                 key = crm_strdup_printf("nodelist.node.%d.ring0_addr", lpc);
 107                 cmap_get_string(cmap_handle, key, &name);
 108                 crm_trace("%s = %s", key, name);
 109 
 110                 if (node_name_is_valid(key, name) == FALSE) {
 111                     free(name);
 112                     name = NULL;
 113                 }
 114                 free(key);
 115             }
 116 
 117             if (name == NULL) {
 118                 key = crm_strdup_printf("nodelist.node.%d.name", lpc);
 119                 cmap_get_string(cmap_handle, key, &name);
 120                 crm_trace("%s = %s %d", key, name, rc);
 121                 free(key);
 122             }
 123             break;
 124         }
 125 
 126         lpc++;
 127     }
 128 
 129     if(local_handle) {
 130         cmap_finalize(local_handle);
 131     }
 132 
 133     if (name == NULL) {
 134         crm_info("Unable to get node name for nodeid %u", nodeid);
 135     }
 136     return name;
 137 }
 138 
 139 void
 140 terminate_cs_connection(crm_cluster_t *cluster)
     /* [previous][next][first][last][top][bottom][index][help] */
 141 {
 142     crm_info("Disconnecting from Corosync");
 143 
 144     cluster_disconnect_cpg(cluster);
 145 
 146     if (pcmk_quorum_handle) {
 147         crm_trace("Disconnecting quorum");
 148         quorum_finalize(pcmk_quorum_handle);
 149         pcmk_quorum_handle = 0;
 150 
 151     } else {
 152         crm_info("No Quorum connection");
 153     }
 154 
 155     crm_notice("Disconnected from Corosync");
 156 }
 157 
 /* NOTE(review): neither variable is referenced in the visible part of this
  * file; presumably they are kept for code elsewhere - confirm before
  * changing or removing them. */
 158 int ais_membership_timer = 0;
 159 gboolean ais_membership_force = FALSE;
 160 
 161 
 162 static int
 163 pcmk_quorum_dispatch(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 164 {
 165     int rc = 0;
 166 
 167     rc = quorum_dispatch(pcmk_quorum_handle, CS_DISPATCH_ALL);
 168     if (rc < 0) {
 169         crm_err("Connection to the Quorum API failed: %d", rc);
 170         pcmk_quorum_handle = 0;
 171         return -1;
 172     }
 173     return 0;
 174 }
 175 
 176 static void
 177 pcmk_quorum_notification(quorum_handle_t handle,
     /* [previous][next][first][last][top][bottom][index][help] */
 178                          uint32_t quorate,
 179                          uint64_t ring_id, uint32_t view_list_entries, uint32_t * view_list)
 180 {
 181     int i;
 182     GHashTableIter iter;
 183     crm_node_t *node = NULL;
 184     static gboolean init_phase = TRUE;
 185 
 186     if (quorate != crm_have_quorum) {
 187         if (quorate) {
 188             crm_notice("Quorum acquired " CRM_XS " membership=" U64T " members=%lu",
 189                        ring_id, (long unsigned int)view_list_entries);
 190         } else {
 191             crm_warn("Quorum lost " CRM_XS " membership=" U64T " members=%lu",
 192                      ring_id, (long unsigned int)view_list_entries);
 193         }
 194         crm_have_quorum = quorate;
 195 
 196     } else {
 197         crm_info("Quorum %s " CRM_XS " membership=" U64T " members=%lu",
 198                  (quorate? "retained" : "still lost"), ring_id,
 199                  (long unsigned int)view_list_entries);
 200     }
 201 
 202     if (view_list_entries == 0 && init_phase) {
 203         crm_info("Corosync membership is still forming, ignoring");
 204         return;
 205     }
 206 
 207     init_phase = FALSE;
 208 
 209     /* Reset last_seen for all cached nodes so we can tell which ones aren't
 210      * in the view list */
 211     g_hash_table_iter_init(&iter, crm_peer_cache);
 212     while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 213         node->last_seen = 0;
 214     }
 215 
 216     /* Update the peer cache for each node in view list */
 217     for (i = 0; i < view_list_entries; i++) {
 218         uint32_t id = view_list[i];
 219 
 220         crm_debug("Member[%d] %u ", i, id);
 221 
 222         /* Get this node's peer cache entry (adding one if not already there) */
 223         node = crm_get_peer(id, NULL);
 224         if (node->uname == NULL) {
 225             char *name = corosync_node_name(0, id);
 226 
 227             crm_info("Obtaining name for new node %u", id);
 228             node = crm_get_peer(id, name);
 229             free(name);
 230         }
 231 
 232         /* Update the node state (including updating last_seen to ring_id) */
 233         crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, ring_id);
 234     }
 235 
 236     /* Remove any peer cache entries we didn't update */
 237     crm_reap_unseen_nodes(ring_id);
 238 
 239     if (quorum_app_callback) {
 240         quorum_app_callback(ring_id, quorate);
 241     }
 242 }
 243 
 /* Callback table passed to quorum_initialize() in cluster_connect_quorum();
  * only the quorum notification hook is set. */
 244 quorum_callbacks_t quorum_callbacks = {
 245     .quorum_notify_fn = pcmk_quorum_notification,
 246 };
 247 
 248 gboolean
 249 cluster_connect_quorum(gboolean(*dispatch) (unsigned long long, gboolean),
     /* [previous][next][first][last][top][bottom][index][help] */
 250                        void (*destroy) (gpointer))
 251 {
 252     int rc = -1;
 253     int fd = 0;
 254     int quorate = 0;
 255     uint32_t quorum_type = 0;
 256     struct mainloop_fd_callbacks quorum_fd_callbacks;
 257 
 258     quorum_fd_callbacks.dispatch = pcmk_quorum_dispatch;
 259     quorum_fd_callbacks.destroy = destroy;
 260 
 261     crm_debug("Configuring Pacemaker to obtain quorum from Corosync");
 262 
 263     rc = quorum_initialize(&pcmk_quorum_handle, &quorum_callbacks, &quorum_type);
 264     if (rc != CS_OK) {
 265         crm_err("Could not connect to the Quorum API: %d", rc);
 266         goto bail;
 267 
 268     } else if (quorum_type != QUORUM_SET) {
 269         crm_err("Corosync quorum is not configured");
 270         goto bail;
 271     }
 272 
 273     rc = quorum_getquorate(pcmk_quorum_handle, &quorate);
 274     if (rc != CS_OK) {
 275         crm_err("Could not obtain the current Quorum API state: %d", rc);
 276         goto bail;
 277     }
 278 
 279     if (quorate) {
 280         crm_notice("Quorum acquired");
 281     } else {
 282         crm_warn("Quorum lost");
 283     }
 284     quorum_app_callback = dispatch;
 285     crm_have_quorum = quorate;
 286 
 287     rc = quorum_trackstart(pcmk_quorum_handle, CS_TRACK_CHANGES | CS_TRACK_CURRENT);
 288     if (rc != CS_OK) {
 289         crm_err("Could not setup Quorum API notifications: %d", rc);
 290         goto bail;
 291     }
 292 
 293     rc = quorum_fd_get(pcmk_quorum_handle, &fd);
 294     if (rc != CS_OK) {
 295         crm_err("Could not obtain the Quorum API connection: %d", rc);
 296         goto bail;
 297     }
 298 
 299     mainloop_add_fd("quorum", G_PRIORITY_HIGH, fd, dispatch, &quorum_fd_callbacks);
 300 
 301     corosync_initialize_nodelist(NULL, FALSE, NULL);
 302 
 303   bail:
 304     if (rc != CS_OK) {
 305         quorum_finalize(pcmk_quorum_handle);
 306         return FALSE;
 307     }
 308     return TRUE;
 309 }
 310 
 311 gboolean
 312 init_cs_connection(crm_cluster_t * cluster)
     /* [previous][next][first][last][top][bottom][index][help] */
 313 {
 314     int retries = 0;
 315 
 316     while (retries < 5) {
 317         int rc = init_cs_connection_once(cluster);
 318 
 319         retries++;
 320 
 321         switch (rc) {
 322             case CS_OK:
 323                 return TRUE;
 324                 break;
 325             case CS_ERR_TRY_AGAIN:
 326             case CS_ERR_QUEUE_FULL:
 327                 sleep(retries);
 328                 break;
 329             default:
 330                 return FALSE;
 331         }
 332     }
 333 
 334     crm_err("Could not connect to corosync after %d retries", retries);
 335     return FALSE;
 336 }
 337 
 338 gboolean
 339 init_cs_connection_once(crm_cluster_t * cluster)
     /* [previous][next][first][last][top][bottom][index][help] */
 340 {
 341     crm_node_t *peer = NULL;
 342     enum cluster_type_e stack = get_cluster_type();
 343 
 344     crm_peer_init();
 345 
 346     /* Here we just initialize comms */
 347     if (stack != pcmk_cluster_corosync) {
 348         crm_err("Invalid cluster type: %s (%d)", name_for_cluster_type(stack), stack);
 349         return FALSE;
 350     }
 351 
 352     if (cluster_connect_cpg(cluster) == FALSE) {
 353         return FALSE;
 354     }
 355     crm_info("Connection to '%s': established", name_for_cluster_type(stack));
 356 
 357     cluster->nodeid = get_local_nodeid(0);
 358     if(cluster->nodeid == 0) {
 359         crm_err("Could not establish local nodeid");
 360         return FALSE;
 361     }
 362 
 363     cluster->uname = get_node_name(0);
 364     if(cluster->uname == NULL) {
 365         crm_err("Could not establish local node name");
 366         return FALSE;
 367     }
 368 
 369     /* Ensure the local node always exists */
 370     peer = crm_get_peer(cluster->nodeid, cluster->uname);
 371     cluster->uuid = get_corosync_uuid(peer);
 372 
 373     return TRUE;
 374 }
 375 
 376 gboolean
 377 check_message_sanity(const AIS_Message * msg, const char *data)
     /* [previous][next][first][last][top][bottom][index][help] */
 378 {
 379     gboolean sane = TRUE;
 380     int dest = msg->host.type;
 381     int tmp_size = msg->header.size - sizeof(AIS_Message);
 382 
 383     if (sane && msg->header.size == 0) {
 384         crm_warn("Message with no size");
 385         sane = FALSE;
 386     }
 387 
 388     if (sane && msg->header.error != CS_OK) {
 389         crm_warn("Message header contains an error: %d", msg->header.error);
 390         sane = FALSE;
 391     }
 392 
 393     if (sane && ais_data_len(msg) != tmp_size) {
 394         crm_warn("Message payload size is incorrect: expected %d, got %d", ais_data_len(msg),
 395                  tmp_size);
 396         sane = TRUE;
 397     }
 398 
 399     if (sane && ais_data_len(msg) == 0) {
 400         crm_warn("Message with no payload");
 401         sane = FALSE;
 402     }
 403 
 404     if (sane && data && msg->is_compressed == FALSE) {
 405         int str_size = strlen(data) + 1;
 406 
 407         if (ais_data_len(msg) != str_size) {
 408             int lpc = 0;
 409 
 410             crm_warn("Message payload is corrupted: expected %d bytes, got %d",
 411                      ais_data_len(msg), str_size);
 412             sane = FALSE;
 413             for (lpc = (str_size - 10); lpc < msg->size; lpc++) {
 414                 if (lpc < 0) {
 415                     lpc = 0;
 416                 }
 417                 crm_debug("bad_data[%d]: %d / '%c'", lpc, data[lpc], data[lpc]);
 418             }
 419         }
 420     }
 421 
 422     if (sane == FALSE) {
 423         crm_err("Invalid message %d: (dest=%s:%s, from=%s:%s.%u, compressed=%d, size=%d, total=%d)",
 424                 msg->id, ais_dest(&(msg->host)), msg_type2text(dest),
 425                 ais_dest(&(msg->sender)), msg_type2text(msg->sender.type),
 426                 msg->sender.pid, msg->is_compressed, ais_data_len(msg), msg->header.size);
 427 
 428     } else {
 429         crm_trace
 430             ("Verified message %d: (dest=%s:%s, from=%s:%s.%u, compressed=%d, size=%d, total=%d)",
 431              msg->id, ais_dest(&(msg->host)), msg_type2text(dest), ais_dest(&(msg->sender)),
 432              msg_type2text(msg->sender.type), msg->sender.pid, msg->is_compressed,
 433              ais_data_len(msg), msg->header.size);
 434     }
 435 
 436     return sane;
 437 }
 438 
 439 enum cluster_type_e
 440 find_corosync_variant(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 441 {
 442     int rc = CS_OK;
 443     cmap_handle_t handle;
 444 
 445     rc = cmap_initialize(&handle);
 446 
 447     switch(rc) {
 448         case CS_OK:
 449             break;
 450         case CS_ERR_SECURITY:
 451             crm_debug("Failed to initialize the cmap API: Permission denied (%d)", rc);
 452             /* It's there, we just can't talk to it.
 453              * Good enough for us to identify as 'corosync'
 454              */
 455             return pcmk_cluster_corosync;
 456 
 457         default:
 458             crm_info("Failed to initialize the cmap API: %s (%d)",
 459                      ais_error2text(rc), rc);
 460             return pcmk_cluster_unknown;
 461     }
 462 
 463     cmap_finalize(handle);
 464     return pcmk_cluster_corosync;
 465 }
 466 
 467 gboolean
 468 crm_is_corosync_peer_active(const crm_node_t * node)
     /* [previous][next][first][last][top][bottom][index][help] */
 469 {
 470     if (node == NULL) {
 471         crm_trace("NULL");
 472         return FALSE;
 473 
 474     } else if (safe_str_neq(node->state, CRM_NODE_MEMBER)) {
 475         crm_trace("%s: state=%s", node->uname, node->state);
 476         return FALSE;
 477 
 478     } else if ((node->processes & crm_proc_cpg) == 0) {
 479         crm_trace("%s: processes=%.16x", node->uname, node->processes);
 480         return FALSE;
 481     }
 482     return TRUE;
 483 }
 484 
 485 gboolean
 486 corosync_initialize_nodelist(void *cluster, gboolean force_member, xmlNode * xml_parent)
     /* [previous][next][first][last][top][bottom][index][help] */
 487 {
 488     int lpc = 0;
 489     int rc = CS_OK;
 490     int retries = 0;
 491     gboolean any = FALSE;
 492     cmap_handle_t cmap_handle;
 493 
 494     do {
 495         rc = cmap_initialize(&cmap_handle);
 496         if (rc != CS_OK) {
 497             retries++;
 498             crm_debug("API connection setup failed: %s.  Retrying in %ds", cs_strerror(rc),
 499                       retries);
 500             sleep(retries);
 501         }
 502 
 503     } while (retries < 5 && rc != CS_OK);
 504 
 505     if (rc != CS_OK) {
 506         crm_warn("Could not connect to Cluster Configuration Database API, error %d", rc);
 507         return FALSE;
 508     }
 509 
 510     crm_peer_init();
 511     crm_trace("Initializing corosync nodelist");
 512     for (lpc = 0; TRUE; lpc++) {
 513         uint32_t nodeid = 0;
 514         char *name = NULL;
 515         char *key = NULL;
 516 
 517         key = crm_strdup_printf("nodelist.node.%d.nodeid", lpc);
 518         rc = cmap_get_uint32(cmap_handle, key, &nodeid);
 519         free(key);
 520 
 521         if (rc != CS_OK) {
 522             break;
 523         }
 524 
 525         name = corosync_node_name(cmap_handle, nodeid);
 526         if (name != NULL) {
 527             GHashTableIter iter;
 528             crm_node_t *node = NULL;
 529 
 530             g_hash_table_iter_init(&iter, crm_peer_cache);
 531             while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
 532                 if(node && node->uname && strcasecmp(node->uname, name) == 0) {
 533                     if (node->id && node->id != nodeid) {
 534                         crm_crit("Nodes %u and %u share the same name '%s': shutting down", node->id,
 535                                  nodeid, name);
 536                         crm_exit(DAEMON_RESPAWN_STOP);
 537                     }
 538                 }
 539             }
 540         }
 541 
 542         if (nodeid > 0 || name != NULL) {
 543             crm_trace("Initializing node[%d] %u = %s", lpc, nodeid, name);
 544             crm_get_peer(nodeid, name);
 545         }
 546 
 547         if (nodeid > 0 && name != NULL) {
 548             any = TRUE;
 549 
 550             if (xml_parent) {
 551                 xmlNode *node = create_xml_node(xml_parent, XML_CIB_TAG_NODE);
 552 
 553                 crm_xml_set_id(node, "%u", nodeid);
 554                 crm_xml_add(node, XML_ATTR_UNAME, name);
 555                 if (force_member) {
 556                     crm_xml_add(node, XML_ATTR_TYPE, CRM_NODE_MEMBER);
 557                 }
 558             }
 559         }
 560 
 561         free(name);
 562     }
 563     cmap_finalize(cmap_handle);
 564     return any;
 565 }
 566 
 567 char *
 568 corosync_cluster_name(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 569 {
 570     cmap_handle_t handle;
 571     char *cluster_name = NULL;
 572     int rc = CS_OK;
 573 
 574     rc = cmap_initialize(&handle);
 575     if (rc != CS_OK) {
 576         crm_info("Failed to initialize the cmap API: %s (%d)", ais_error2text(rc), rc);
 577         return NULL;
 578     }
 579 
 580     rc = cmap_get_string(handle, "totem.cluster_name", &cluster_name);
 581     if (rc != CS_OK) {
 582         crm_info("Cannot get totem.cluster_name: %s (%d)", ais_error2text(rc), rc);
 583 
 584     } else {
 585         crm_debug("cmap totem.cluster_name = '%s'", cluster_name);
 586     }
 587 
 588     cmap_finalize(handle);
 589 
 590     return cluster_name;
 591 }
 592 
 593 int
 594 corosync_cmap_has_config(const char *prefix)
     /* [previous][next][first][last][top][bottom][index][help] */
 595 {
 596     int rc = CS_OK;
 597     int retries = 0;
 598     static int found = -1;
 599     cmap_handle_t cmap_handle;
 600     cmap_iter_handle_t iter_handle;
 601     char key_name[CMAP_KEYNAME_MAXLEN + 1];
 602 
 603     if(found != -1) {
 604         return found;
 605     }
 606 
 607     do {
 608         rc = cmap_initialize(&cmap_handle);
 609         if (rc != CS_OK) {
 610             retries++;
 611             crm_debug("API connection setup failed: %s.  Retrying in %ds", cs_strerror(rc),
 612                       retries);
 613             sleep(retries);
 614         }
 615 
 616     } while (retries < 5 && rc != CS_OK);
 617 
 618     if (rc != CS_OK) {
 619         crm_warn("Could not connect to Cluster Configuration Database API: %s (rc=%d)",
 620                  cs_strerror(rc), rc);
 621         return -1;
 622     }
 623 
 624     rc = cmap_iter_init(cmap_handle, prefix, &iter_handle);
 625     if (rc != CS_OK) {
 626         crm_warn("Failed to initialize iteration for corosync cmap '%s': %s (rc=%d)",
 627                  prefix, cs_strerror(rc), rc);
 628         goto bail;
 629     }
 630 
 631     found = 0;
 632     while ((rc = cmap_iter_next(cmap_handle, iter_handle, key_name, NULL, NULL)) == CS_OK) {
 633         crm_trace("'%s' is configured in corosync cmap: %s", prefix, key_name);
 634         found++;
 635         break;
 636     }
 637     cmap_iter_finalize(cmap_handle, iter_handle);
 638 
 639 bail:
 640     cmap_finalize(cmap_handle);
 641 
 642     return found;
 643 }

/* [previous][next][first][last][top][bottom][index][help] */