root/lib/cluster/election.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. election_complete
  2. election_timer_cb
  3. election_state
  4. election_init
  5. election_remove
  6. election_reset
  7. election_fini
  8. election_timeout_start
  9. election_timeout_stop
  10. election_timeout_set_period
  11. crm_uptime
  12. crm_compare_age
  13. election_vote
  14. election_check
  15. election_count_vote

   1 /*
   2  * Copyright (C) 2004-2016 Andrew Beekhof <andrew@beekhof.net>
   3  *
   4  * This source code is licensed under the GNU Lesser General Public License
   5  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   6  */
   7 
   8 #include <crm_internal.h>
   9 
  10 #include <sys/time.h>
  11 #include <sys/resource.h>
  12 
  13 #include <crm/msg_xml.h>
  14 #include <crm/common/xml.h>
  15 
  16 #include <crm/common/mainloop.h>
  17 #include <crm/cluster/internal.h>
  18 #include <crm/cluster/election.h>
  19 #include <crm/crm.h>
  20 
  21 #define STORM_INTERVAL   2      /* in seconds */
  22 
  23 struct election_s
  24 {
  25         enum election_result state;
  26         guint count;
  27         char *name;
  28         char *uname;
  29         GSourceFunc cb;
  30         GHashTable *voted;
  31         mainloop_timer_t *timeout; /* When to stop if not everyone casts a vote */
  32 };
  33 
  34 static void election_complete(election_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
  35 {
  36     crm_info("Election %s complete", e->name);
  37     e->state = election_won;
  38 
  39     if(e->cb) {
  40         e->cb(e);
  41     }
  42 
  43     election_reset(e);
  44 }
  45 
  46 static gboolean election_timer_cb(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  47 {
  48     election_t *e = user_data;
  49 
  50     crm_info("Election %s %p timed out", e->name, e);
  51     election_complete(e);
  52     return FALSE;
  53 }
  54 
  55 enum election_result
  56 election_state(election_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
  57 {
  58     if(e) {
  59         return e->state;
  60     }
  61     return election_error;
  62 }
  63 
  64 election_t *
  65 election_init(const char *name, const char *uname, guint period_ms, GSourceFunc cb)
     /* [previous][next][first][last][top][bottom][index][help] */
  66 {
  67     static guint count = 0;
  68     election_t *e = calloc(1, sizeof(election_t));
  69 
  70     if(e != NULL) {
  71         if(name) {
  72             e->name = crm_strdup_printf("election-%s", name);
  73         } else {
  74             e->name = crm_strdup_printf("election-%u", count++);
  75         }
  76 
  77         e->cb = cb;
  78         e->uname = strdup(uname);
  79         e->timeout = mainloop_timer_add(e->name, period_ms, FALSE, election_timer_cb, e);
  80         crm_trace("Created %s %p", e->name, e);
  81     }
  82     return e;
  83 }
  84 
  85 void
  86 election_remove(election_t *e, const char *uname)
     /* [previous][next][first][last][top][bottom][index][help] */
  87 {
  88     if(e && uname && e->voted) {
  89         g_hash_table_remove(e->voted, uname);
  90     }
  91 }
  92 
  93 void
  94 election_reset(election_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
  95 {
  96     crm_trace("Resetting election %s", e->name);
  97     if(e) {
  98         mainloop_timer_stop(e->timeout);
  99     }
 100     if (e && e->voted) {
 101         crm_trace("Destroying voted cache with %d members", g_hash_table_size(e->voted));
 102         g_hash_table_destroy(e->voted);
 103         e->voted = NULL;
 104     }
 105 }
 106 
 107 void
 108 election_fini(election_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
 109 {
 110     if(e) {
 111         election_reset(e);
 112         crm_trace("Destroying %s", e->name);
 113         mainloop_timer_del(e->timeout);
 114         free(e->uname);
 115         free(e->name);
 116         free(e);
 117     }
 118 }
 119 
 120 static void
 121 election_timeout_start(election_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
 122 {
 123     if(e) {
 124         mainloop_timer_start(e->timeout);
 125     }
 126 }
 127 
 128 void
 129 election_timeout_stop(election_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
 130 {
 131     if(e) {
 132         mainloop_timer_stop(e->timeout);
 133     }
 134 }
 135 
 136 void
 137 election_timeout_set_period(election_t *e, guint period)
     /* [previous][next][first][last][top][bottom][index][help] */
 138 {
 139     if(e) {
 140         mainloop_timer_set_period(e->timeout, period);
 141     } else {
 142         crm_err("No election defined");
 143     }
 144 }
 145 
 146 static int
 147 crm_uptime(struct timeval *output)
     /* [previous][next][first][last][top][bottom][index][help] */
 148 {
 149     static time_t expires = 0;
 150     static struct rusage info;
 151 
 152     time_t tm_now = time(NULL);
 153 
 154     if (expires < tm_now) {
 155         int rc = 0;
 156 
 157         info.ru_utime.tv_sec = 0;
 158         info.ru_utime.tv_usec = 0;
 159         rc = getrusage(RUSAGE_SELF, &info);
 160 
 161         output->tv_sec = 0;
 162         output->tv_usec = 0;
 163 
 164         if (rc < 0) {
 165             crm_perror(LOG_ERR, "Could not calculate the current uptime");
 166             expires = 0;
 167             return -1;
 168         }
 169 
 170         crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec,
 171                   (long)info.ru_utime.tv_usec);
 172     }
 173 
 174     expires = tm_now + STORM_INTERVAL;  /* N seconds after the last _access_ */
 175     output->tv_sec = info.ru_utime.tv_sec;
 176     output->tv_usec = info.ru_utime.tv_usec;
 177 
 178     return 1;
 179 }
 180 
 181 static int
 182 crm_compare_age(struct timeval your_age)
     /* [previous][next][first][last][top][bottom][index][help] */
 183 {
 184     struct timeval our_age;
 185 
 186     crm_uptime(&our_age); /* If an error occurred, our_age will be compared as {0,0} */
 187 
 188     if (our_age.tv_sec > your_age.tv_sec) {
 189         crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
 190         return 1;
 191     } else if (our_age.tv_sec < your_age.tv_sec) {
 192         crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
 193         return -1;
 194     } else if (our_age.tv_usec > your_age.tv_usec) {
 195         crm_debug("Win: %ld.%ld vs %ld.%ld (usec)",
 196                   (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
 197         return 1;
 198     } else if (our_age.tv_usec < your_age.tv_usec) {
 199         crm_debug("Lose: %ld.%ld vs %ld.%ld (usec)",
 200                   (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
 201         return -1;
 202     }
 203 
 204     return 0;
 205 }
 206 
 207 void
 208 election_vote(election_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
 209 {
 210     struct timeval age;
 211     xmlNode *vote = NULL;
 212     crm_node_t *our_node;
 213 
 214     if(e == NULL) {
 215         crm_trace("Not voting in election: not initialized");
 216         return;
 217     }
 218 
 219     our_node = crm_get_peer(0, e->uname);
 220     if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
 221         crm_trace("Cannot vote yet: %p", our_node);
 222         return;
 223     }
 224 
 225     e->state = election_in_progress;
 226     vote = create_request(CRM_OP_VOTE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
 227 
 228     e->count++;
 229     crm_xml_add(vote, F_CRM_ELECTION_OWNER, our_node->uuid);
 230     crm_xml_add_int(vote, F_CRM_ELECTION_ID, e->count);
 231 
 232     crm_uptime(&age);
 233     crm_xml_add_int(vote, F_CRM_ELECTION_AGE_S, age.tv_sec);
 234     crm_xml_add_int(vote, F_CRM_ELECTION_AGE_US, age.tv_usec);
 235 
 236     send_cluster_message(NULL, crm_msg_crmd, vote, TRUE);
 237     free_xml(vote);
 238 
 239     crm_debug("Started election %d", e->count);
 240     if (e->voted) {
 241         g_hash_table_destroy(e->voted);
 242         e->voted = NULL;
 243     }
 244 
 245     election_timeout_start(e);
 246     return;
 247 }
 248 
 249 bool
 250 election_check(election_t *e)
     /* [previous][next][first][last][top][bottom][index][help] */
 251 {
 252     int voted_size = 0;
 253     int num_members = crm_active_peers();
 254 
 255     if(e == NULL) {
 256         crm_trace("not initialized");
 257         return FALSE;
 258     }
 259 
 260     if (e->voted) {
 261         voted_size = g_hash_table_size(e->voted);
 262     }
 263     /* in the case of #voted > #members, it is better to
 264      *   wait for the timeout and give the cluster time to
 265      *   stabilize
 266      */
 267     if (voted_size >= num_members) {
 268         /* we won and everyone has voted */
 269         election_timeout_stop(e);
 270         if (voted_size > num_members) {
 271             GHashTableIter gIter;
 272             const crm_node_t *node;
 273             char *key = NULL;
 274 
 275             g_hash_table_iter_init(&gIter, crm_peer_cache);
 276             while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
 277                 if (crm_is_peer_active(node)) {
 278                     crm_err("member: %s proc=%.32x", node->uname, node->processes);
 279                 }
 280             }
 281 
 282             g_hash_table_iter_init(&gIter, e->voted);
 283             while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
 284                 crm_err("voted: %s", key);
 285             }
 286 
 287         }
 288 
 289         election_complete(e);
 290         return TRUE;
 291 
 292     } else {
 293         crm_debug("Still waiting on %d non-votes (%d total)",
 294                   num_members - voted_size, num_members);
 295     }
 296 
 297     return FALSE;
 298 }
 299 
  300 #define loss_dampen 2           /* seconds after a loss during which we will not start a new election */
  301 
  302 /*      A_ELECTION_COUNT        */
      /*
       * Process a vote or no-vote message received for election e.
       *
       * e        Election the message belongs to.
       * vote     The received message; its task, sender, election owner,
       *          election ID and (for votes) age and version are read from it.
       * can_win  When false, we concede immediately ("Not eligible").
       *
       * Returns:
       *   election_error  if vote is NULL
       *   election_lost   if e is NULL, there is no peer cache, or we conceded
       *                   (a CRM_OP_NOVOTE reply is sent to the sender first)
       *   election_start  if we beat the sender and should start a new election
       *   e->state        unchanged, if the message was only recorded or ignored
       *
       * For a vote from another node the tie-breakers are applied in this order:
       * feature-set version (lower wins), age as reported by crm_uptime()
       * (older wins), "born" value (lower wins; heartbeat and classic AIS
       * clusters only), and finally the host name compared case-insensitively
       * (the name that sorts first wins).
       *
       * The storm-detection counters and the time of the last loss are static
       * locals and are therefore shared by every election in the process.
       */
  303 enum election_result
  304 election_count_vote(election_t *e, xmlNode *vote, bool can_win)
     /* [previous][next][first][last][top][bottom][index][help] */
  305 {
  306     int age = 0;
  307     int election_id = -1;
  308     int log_level = LOG_INFO;
  309     gboolean use_born_on = FALSE;
  310     gboolean done = FALSE;
  311     gboolean we_lose = FALSE;
  312     const char *op = NULL;
  313     const char *from = NULL;
  314     const char *reason = "unknown";
  315     const char *election_owner = NULL;
  316     crm_node_t *our_node = NULL, *your_node = NULL;
  317 
  318     static int election_wins = 0;
  319 
  320     xmlNode *novote = NULL;
  321     time_t tm_now = time(NULL);
  322     static time_t expires = 0;
  323     static time_t last_election_loss = 0;
  324 
  325     /* if the membership copy is NULL we REALLY shouldn't be voting
  326      * the question is how we managed to get here.
  327      */
  328 
  329     CRM_CHECK(vote != NULL, return election_error);
  330 
  331     if(e == NULL) {
  332         crm_info("Not voting in election: not initialized");
  333         return election_lost;
  334 
  335     } else if(crm_peer_cache == NULL) {
  336         crm_info("Not voting in election: no peer cache");
  337         return election_lost;
  338     }
  339 
  340     op = crm_element_value(vote, F_CRM_TASK);
  341     from = crm_element_value(vote, F_CRM_HOST_FROM);
  342     election_owner = crm_element_value(vote, F_CRM_ELECTION_OWNER);
  343     crm_element_value_int(vote, F_CRM_ELECTION_ID, &election_id);
  344 
  345     your_node = crm_get_peer(0, from);
  346     our_node = crm_get_peer(0, e->uname);
  347 
          /* The table of recorded replies is created lazily, on the first message */
  348     if (e->voted == NULL) {
  349         crm_debug("Created voted hash");
  350         e->voted = crm_str_table_new();
  351     }
  352 
          /* Only these two cluster types use the "born" value as a tie-breaker */
  353     if (is_heartbeat_cluster()) {
  354         use_born_on = TRUE;
  355     } else if (is_classic_ais_cluster()) {
  356         use_born_on = TRUE;
  357     }
  358 
          /*
           * Classify the message. Each branch sets one of:
           *   we_lose - concede and send a no-vote to the sender
           *   done    - nothing more to do; return the current state
           *   neither - we beat the sender
           */
  359     if(can_win == FALSE) {
  360         reason = "Not eligible";
  361         we_lose = TRUE;
  362 
  363     } else if (our_node == NULL || crm_is_peer_active(our_node) == FALSE) {
  364         reason = "We are not part of the cluster";
  365         log_level = LOG_ERR;
  366         we_lose = TRUE;
  367 
  368     } else if (election_id != e->count && crm_str_eq(our_node->uuid, election_owner, TRUE)) {
              /* A message for one of our own earlier election rounds */
  369         log_level = LOG_TRACE;
  370         reason = "Superseded";
  371         done = TRUE;
  372 
  373     } else if (your_node == NULL || crm_is_peer_active(your_node) == FALSE) {
  374         /* Possibly we cached the message in the FSA queue at a point when the
           * sender wasn't (yet) an active peer.
           * NOTE(review): the original sentence was truncated; confirm the intent.
           */
  375         reason = "Peer is not part of our cluster";
  376         log_level = LOG_WARNING;
  377         done = TRUE;
  378 
  379     } else if (crm_str_eq(op, CRM_OP_NOVOTE, TRUE)) {
  380         char *op_copy = strdup(op);
  381         char *uname_copy = strdup(from);
  382 
  383         CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE));
  384 
  385         /* update the list of nodes that have voted */
  386         g_hash_table_replace(e->voted, uname_copy, op_copy);
  387         reason = "Recorded";
  388         done = TRUE;
  389 
  390     } else {
              /* Any other operation is treated as a vote that must be compared with ours */
  391         struct timeval your_age;
  392         const char *your_version = crm_element_value(vote, F_CRM_VERSION);
  393         int tv_sec = 0;
  394         int tv_usec = 0;
  395 
  396         crm_element_value_int(vote, F_CRM_ELECTION_AGE_S, &tv_sec);
  397         crm_element_value_int(vote, F_CRM_ELECTION_AGE_US, &tv_usec);
  398 
  399         your_age.tv_sec = tv_sec;
  400         your_age.tv_usec = tv_usec;
  401 
  402         age = crm_compare_age(your_age);
  403         if (crm_str_eq(from, e->uname, TRUE)) {
  404             char *op_copy = strdup(op);
  405             char *uname_copy = strdup(from);
  406 
  407             CRM_ASSERT(crm_str_eq(our_node->uuid, election_owner, TRUE));
  408 
  409             /* update ourselves in the list of nodes that have voted */
  410             g_hash_table_replace(e->voted, uname_copy, op_copy);
  411             reason = "Recorded";
  412             done = TRUE;
  413 
  414         } else if (compare_version(your_version, CRM_FEATURE_SET) < 0) {
  415             reason = "Version";
  416             we_lose = TRUE;
  417 
  418         } else if (compare_version(your_version, CRM_FEATURE_SET) > 0) {
  419             reason = "Version";
  420 
  421         } else if (age < 0) {
  422             reason = "Uptime";
  423             we_lose = TRUE;
  424 
  425         } else if (age > 0) {
  426             reason = "Uptime";
  427 
  428             /* TODO: Check for y(our) born < 0 */
  429         } else if (use_born_on && your_node->born < our_node->born) {
  430             reason = "Born";
  431             we_lose = TRUE;
  432 
  433         } else if (use_born_on && your_node->born > our_node->born) {
  434             reason = "Born";
  435 
  436         } else if (e->uname == NULL) {
  437             reason = "Unknown host name";
  438             we_lose = TRUE;
  439 
  440         } else if (strcasecmp(e->uname, from) > 0) {
  441             reason = "Host name";
  442             we_lose = TRUE;
  443 
  444         } else {
  445             reason = "Host name";
                  /* NOTE(review): the earlier self-vote test is crm_str_eq(..., TRUE)
                   * while this one uses strcasecmp(); names differing only by case
                   * would appear to reach this assertion with a result of 0 -- confirm.
                   */
  446             CRM_ASSERT(strcasecmp(e->uname, from) < 0);
  447 /* can't happen...
  448  *      } else if(strcasecmp(e->uname, from) == 0) {
  449  *
  450  */
  451         }
  452     }
  453 
          /* Election storm detection: count wins within a STORM_INTERVAL window */
  454     if (expires < tm_now) {
  455         election_wins = 0;
  456         expires = tm_now + STORM_INTERVAL;
  457 
  458     } else if (done == FALSE && we_lose == FALSE) {
  459         int peers = 1 + g_hash_table_size(crm_peer_cache);
  460 
  461         /* If every node has to vote down every other node, that's N*(N-1) total elections
  462          * Allow some leeway before _really_ complaining
  463          */
  464         election_wins++;
  465         if (election_wins > (peers * peers)) {
  466             crm_warn("Election storm detected: %d elections in %d seconds", election_wins,
  467                      STORM_INTERVAL);
  468             election_wins = 0;
  469             expires = tm_now + STORM_INTERVAL;
  470             crm_write_blackbox(0, NULL);
  471         }
  472     }
  473 
  474     if (done) {
  475         do_crm_log(log_level + 1, "Election %d (current: %d, owner: %s): Processed %s from %s (%s)",
  476                    election_id, e->count, election_owner, op, from, reason);
  477         return e->state;
  478 
  479     } else if (we_lose == FALSE) {
  480         do_crm_log(log_level, "Election %d (owner: %s) pass: %s from %s (%s)",
  481                    election_id, election_owner, op, from, reason);
  482 
              /* Only start a new election if we have not lost one very recently */
  483         if (last_election_loss == 0
  484             || tm_now - last_election_loss > (time_t) loss_dampen) {
  485 
  486             last_election_loss = 0;
  487             election_timeout_stop(e);
  488 
  489             /* Start a new election by voting down this, and other, peers */
  490             e->state = election_start;
  491             return e->state;
  492         }
  493 
              /* Dampened: fall through and concede as if we had lost */
  494         crm_info("Election %d ignore: We already lost an election less than %ds ago (%s)",
  495                  election_id, loss_dampen, ctime(&last_election_loss));
  496     }
  497 
          /* We lost (or are dampened): tell the sender that we are not contesting */
  498     novote = create_request(CRM_OP_NOVOTE, NULL, from,
  499                             CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
  500 
  501     do_crm_log(log_level, "Election %d (owner: %s) lost: %s from %s (%s)",
  502                election_id, election_owner, op, from, reason);
  503 
  504     election_timeout_stop(e);
  505 
  506     crm_xml_add(novote, F_CRM_ELECTION_OWNER, election_owner);
  507     crm_xml_add_int(novote, F_CRM_ELECTION_ID, election_id);
  508 
  509     send_cluster_message(your_node, crm_msg_crmd, novote, TRUE);
  510     free_xml(novote);
  511 
  512     last_election_loss = tm_now;
  513     e->state = election_lost;
  514     return e->state;
  515 }

/* [previous][next][first][last][top][bottom][index][help] */