This source file includes the following definitions.
- election_complete
- election_timer_cb
- election_state
- election_init
- election_remove
- election_reset
- election_fini
- election_timeout_start
- election_timeout_stop
- election_timeout_set_period
- get_uptime
- compare_age
- election_vote
- election_check
- parse_election_message
- record_vote
- send_no_vote
- election_count_vote
- election_clear_dampening
1
2
3
4
5
6
7
8
9
10 #include <crm_internal.h>
11
12 #include <sys/time.h>
13 #include <sys/resource.h>
14
15 #include <crm/crm.h>
16 #include <crm/common/mainloop.h>
17 #include <crm/common/xml.h>
18
19 #include <crm/cluster/internal.h>
20 #include <crm/cluster/election_internal.h>
21 #include "crmcluster_private.h"
22
/* Length, in seconds, of the window used both to cache the locally measured
 * "uptime" (see get_uptime()) and to count wins when detecting election storms
 * (see election_count_vote())
 */
#define STORM_INTERVAL 2

/* Per-cluster election bookkeeping (stored in cluster->priv->election) */
struct pcmk__election {
    // Most recent election state (returned by election_state())
    enum election_result state;

    // Incremented each time election_vote() sends a vote; sent as the round ID
    guint count;

    // If not NULL, called by election_complete() after state is set to won
    void (*cb)(pcmk_cluster_t *);

    // Sender name -> op of each recorded (no-)vote; NULL until one is recorded
    GHashTable *voted;

    // Timer whose expiration declares the local node the winner
    mainloop_timer_t *timeout;

    // Number of winning comparisons counted in the current storm window
    int election_wins;

    // Whether crm_write_blackbox() was already called for a detected storm
    bool wrote_blackbox;

    // Time at which the current storm-detection window ends
    time_t expires;

    // Time of the most recent loss (0 if none or if dampening was cleared)
    time_t last_election_loss;
};
36
37 static void
38 election_complete(pcmk_cluster_t *cluster)
39 {
40 pcmk__assert((cluster != NULL) && (cluster->priv->election != NULL));
41 cluster->priv->election->state = election_won;
42 if (cluster->priv->election->cb != NULL) {
43 cluster->priv->election->cb(cluster);
44 }
45 election_reset(cluster);
46 }
47
48 static gboolean
49 election_timer_cb(gpointer user_data)
50 {
51 pcmk_cluster_t *cluster = user_data;
52
53 crm_info("Declaring local node as winner after election timed out");
54 election_complete(cluster);
55 return FALSE;
56 }
57
58
59
60
61
62
63
64
65
66 enum election_result
67 election_state(const pcmk_cluster_t *cluster)
68 {
69 if ((cluster == NULL) || (cluster->priv->election == NULL)) {
70 return election_error;
71 }
72 return cluster->priv->election->state;
73 }
74
75
76
77
78
79 #define ELECTION_TIMEOUT_MS 120000
80
81
82
83
84
85
86
87
88
89
90
91 void
92 election_init(pcmk_cluster_t *cluster, void (*cb)(pcmk_cluster_t *))
93 {
94 const char *name = pcmk__s(crm_system_name, "election");
95
96 CRM_CHECK(cluster->priv->election == NULL, return);
97
98 cluster->priv->election = pcmk__assert_alloc(1, sizeof(pcmk__election_t));
99 cluster->priv->election->cb = cb;
100 cluster->priv->election->timeout = mainloop_timer_add(name,
101 ELECTION_TIMEOUT_MS,
102 FALSE,
103 election_timer_cb,
104 cluster);
105 }
106
107
108
109
110
111
112
113
114
115
116
117 void
118 election_remove(pcmk_cluster_t *cluster, const char *uname)
119 {
120 if ((cluster != NULL) && (cluster->priv->election != NULL)
121 && (uname != NULL) && (cluster->priv->election->voted != NULL)) {
122 crm_trace("Discarding (no-)vote from lost peer %s", uname);
123 g_hash_table_remove(cluster->priv->election->voted, uname);
124 }
125 }
126
127
128
129
130
131
132
133 void
134 election_reset(pcmk_cluster_t *cluster)
135 {
136 if ((cluster != NULL) && (cluster->priv->election != NULL)) {
137 crm_trace("Resetting election");
138 mainloop_timer_stop(cluster->priv->election->timeout);
139 if (cluster->priv->election->voted != NULL) {
140 g_hash_table_destroy(cluster->priv->election->voted);
141 cluster->priv->election->voted = NULL;
142 }
143 }
144 }
145
146
147
148
149
150
151
152
153
154
155 void
156 election_fini(pcmk_cluster_t *cluster)
157 {
158 if ((cluster != NULL) && (cluster->priv->election != NULL)) {
159 election_reset(cluster);
160 crm_trace("Destroying election");
161 mainloop_timer_del(cluster->priv->election->timeout);
162 free(cluster->priv->election);
163 cluster->priv->election = NULL;
164 }
165 }
166
167 static void
168 election_timeout_start(pcmk_cluster_t *cluster)
169 {
170 mainloop_timer_start(cluster->priv->election->timeout);
171 }
172
173
174
175
176
177
178
179 void
180 election_timeout_stop(pcmk_cluster_t *cluster)
181 {
182 if ((cluster != NULL) && (cluster->priv->election != NULL)) {
183 mainloop_timer_stop(cluster->priv->election->timeout);
184 }
185 }
186
187
188
189
190
191
192
193
194 void
195 election_timeout_set_period(pcmk_cluster_t *cluster, guint period)
196 {
197 CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL), return);
198 mainloop_timer_set_period(cluster->priv->election->timeout, period);
199 }
200
201 static int
202 get_uptime(struct timeval *output)
203 {
204 static time_t expires = 0;
205 static struct rusage info;
206
207 time_t tm_now = time(NULL);
208
209 if (expires < tm_now) {
210 int rc = 0;
211
212 info.ru_utime.tv_sec = 0;
213 info.ru_utime.tv_usec = 0;
214 rc = getrusage(RUSAGE_SELF, &info);
215
216 output->tv_sec = 0;
217 output->tv_usec = 0;
218
219 if (rc < 0) {
220 crm_perror(LOG_ERR, "Could not calculate the current uptime");
221 expires = 0;
222 return -1;
223 }
224
225 crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec,
226 (long)info.ru_utime.tv_usec);
227 }
228
229 expires = tm_now + STORM_INTERVAL;
230 output->tv_sec = info.ru_utime.tv_sec;
231 output->tv_usec = info.ru_utime.tv_usec;
232
233 return 1;
234 }
235
/*!
 * \internal
 * \brief Compare the local node's age (from get_uptime()) with a peer's
 *
 * Seconds are compared first; microseconds are consulted only when the seconds
 * are equal.
 *
 * \param[in] your_age  Age reported by the peer
 *
 * \return 1 if the local value is greater, -1 if it is smaller, 0 if equal
 */
static int
compare_age(struct timeval your_age)
{
    struct timeval our_age;

    get_uptime(&our_age);

    if (our_age.tv_sec != your_age.tv_sec) {
        if (our_age.tv_sec > your_age.tv_sec) {
            crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
            return 1;
        }
        crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
        return -1;
    }

    if (our_age.tv_usec == your_age.tv_usec) {
        return 0;
    }

    if (our_age.tv_usec > your_age.tv_usec) {
        crm_debug("Win: %ld.%06ld vs %ld.%06ld (usec)",
                  (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
        return 1;
    }

    crm_debug("Lose: %ld.%06ld vs %ld.%06ld (usec)",
              (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
    return -1;
}
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277 void
278 election_vote(pcmk_cluster_t *cluster)
279 {
280 struct timeval age;
281 xmlNode *vote = NULL;
282 pcmk__node_status_t *our_node = NULL;
283 const char *message_type = NULL;
284
285 CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL), return);
286
287 if (cluster->priv->node_name == NULL) {
288 crm_err("Cannot start an election: Local node name unknown");
289 return;
290 }
291
292 our_node = pcmk__get_node(0, cluster->priv->node_name, NULL,
293 pcmk__node_search_cluster_member);
294 if (!pcmk__cluster_is_node_active(our_node)) {
295 crm_trace("Cannot vote yet: local node not connected to cluster");
296 return;
297 }
298
299 election_reset(cluster);
300 cluster->priv->election->state = election_in_progress;
301 message_type = pcmk__server_message_type(cluster->priv->server);
302
303
304
305
306 vote = pcmk__new_request(cluster->priv->server, message_type,
307 NULL, message_type, CRM_OP_VOTE, NULL);
308
309 cluster->priv->election->count++;
310 crm_xml_add(vote, PCMK__XA_ELECTION_OWNER,
311 pcmk__cluster_get_xml_id(our_node));
312 crm_xml_add_int(vote, PCMK__XA_ELECTION_ID, cluster->priv->election->count);
313
314
315 get_uptime(&age);
316 crm_xml_add_timeval(vote, PCMK__XA_ELECTION_AGE_SEC,
317 PCMK__XA_ELECTION_AGE_NANO_SEC, &age);
318
319 pcmk__cluster_send_message(NULL, cluster->priv->server, vote);
320 pcmk__xml_free(vote);
321
322 crm_debug("Started election round %u", cluster->priv->election->count);
323 election_timeout_start(cluster);
324 return;
325 }
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343 bool
344 election_check(pcmk_cluster_t *cluster)
345 {
346 int voted_size = 0;
347 int num_members = 0;
348
349 CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL),
350 return false);
351
352 if (cluster->priv->election->voted == NULL) {
353 crm_trace("Election check requested, but no votes received yet");
354 return FALSE;
355 }
356
357 voted_size = g_hash_table_size(cluster->priv->election->voted);
358 num_members = pcmk__cluster_num_active_nodes();
359
360
361
362
363
364 if (voted_size >= num_members) {
365
366 election_timeout_stop(cluster);
367 if (voted_size > num_members) {
368 GHashTableIter gIter;
369 const pcmk__node_status_t *node = NULL;
370 char *key = NULL;
371
372 crm_warn("Received too many votes in election");
373 g_hash_table_iter_init(&gIter, pcmk__peer_cache);
374 while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
375 if (pcmk__cluster_is_node_active(node)) {
376 crm_warn("* expected vote: %s", node->name);
377 }
378 }
379
380 g_hash_table_iter_init(&gIter, cluster->priv->election->voted);
381 while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
382 crm_warn("* actual vote: %s", key);
383 }
384
385 }
386
387 crm_info("Election won by local node");
388 election_complete(cluster);
389 return TRUE;
390
391 } else {
392 crm_debug("Election still waiting on %d of %d vote%s",
393 num_members - voted_size, num_members,
394 pcmk__plural_s(num_members));
395 }
396
397 return FALSE;
398 }
399
/* After a loss, election_count_vote() ignores a winning comparison unless more
 * than this many seconds have passed since that loss
 */
#define LOSS_DAMPEN 2

/* Fields extracted from an election message by parse_election_message().
 * NOTE(review): the strings are whatever crm_element_value() returned for the
 * message, so they presumably remain valid only as long as the message does.
 */
struct vote {
    const char *op;             // Value of PCMK__XA_CRM_TASK
    const char *from;           // Value of PCMK__XA_SRC (the sender)
    const char *version;        // Value of PCMK_XA_VERSION
    const char *election_owner; // Value of PCMK__XA_ELECTION_OWNER
    int election_id;            // Value of PCMK__XA_ELECTION_ID (-1 if unset)
    struct timeval age;         // Sender's age (parsed only for CRM_OP_VOTE)
};
410
411
412
413
414
415
416
417
418
419
420
421
422 static bool
423 parse_election_message(const xmlNode *message, struct vote *vote)
424 {
425 CRM_CHECK(message && vote, return FALSE);
426
427 vote->election_id = -1;
428 vote->age.tv_sec = -1;
429 vote->age.tv_usec = -1;
430
431 vote->op = crm_element_value(message, PCMK__XA_CRM_TASK);
432 vote->from = crm_element_value(message, PCMK__XA_SRC);
433 vote->version = crm_element_value(message, PCMK_XA_VERSION);
434 vote->election_owner = crm_element_value(message, PCMK__XA_ELECTION_OWNER);
435
436 crm_element_value_int(message, PCMK__XA_ELECTION_ID, &(vote->election_id));
437
438 if ((vote->op == NULL) || (vote->from == NULL) || (vote->version == NULL)
439 || (vote->election_owner == NULL) || (vote->election_id < 0)) {
440
441 crm_warn("Invalid %s message from %s",
442 pcmk__s(vote->op, "election"),
443 pcmk__s(vote->from, "unspecified node"));
444 crm_log_xml_trace(message, "bad-vote");
445 return FALSE;
446 }
447
448
449
450 if (pcmk__str_eq(vote->op, CRM_OP_VOTE, pcmk__str_none)) {
451
452
453
454 crm_element_value_timeval(message, PCMK__XA_ELECTION_AGE_SEC,
455 PCMK__XA_ELECTION_AGE_NANO_SEC, &(vote->age));
456 if ((vote->age.tv_sec < 0) || (vote->age.tv_usec < 0)) {
457 crm_warn("Cannot count election %s from %s "
458 "because it is missing uptime", vote->op, vote->from);
459 return FALSE;
460 }
461
462 } else if (!pcmk__str_eq(vote->op, CRM_OP_NOVOTE, pcmk__str_none)) {
463 crm_info("Cannot process election message from %s "
464 "because %s is not a known election op", vote->from, vote->op);
465 return FALSE;
466 }
467
468
469
470
471 if (pcmk__peer_cache == NULL) {
472 crm_info("Cannot count election %s from %s "
473 "because no peer information available", vote->op, vote->from);
474 return FALSE;
475 }
476 return TRUE;
477 }
478
479 static void
480 record_vote(pcmk_cluster_t *cluster, struct vote *vote)
481 {
482 pcmk__assert((vote->from != NULL) && (vote->op != NULL));
483
484 if (cluster->priv->election->voted == NULL) {
485 cluster->priv->election->voted = pcmk__strkey_table(free, free);
486 }
487 pcmk__insert_dup(cluster->priv->election->voted, vote->from, vote->op);
488 }
489
490 static void
491 send_no_vote(pcmk_cluster_t *cluster, pcmk__node_status_t *peer,
492 struct vote *vote)
493 {
494 const char *message_type = NULL;
495 xmlNode *novote = NULL;
496
497 message_type = pcmk__server_message_type(cluster->priv->server);
498 novote = pcmk__new_request(cluster->priv->server, message_type,
499 vote->from, message_type, CRM_OP_NOVOTE, NULL);
500 crm_xml_add(novote, PCMK__XA_ELECTION_OWNER, vote->election_owner);
501 crm_xml_add_int(novote, PCMK__XA_ELECTION_ID, vote->election_id);
502
503 pcmk__cluster_send_message(peer, cluster->priv->server, novote);
504 pcmk__xml_free(novote);
505 }
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523 enum election_result
524 election_count_vote(pcmk_cluster_t *cluster, const xmlNode *message,
525 bool can_win)
526 {
527 int log_level = LOG_INFO;
528 gboolean done = FALSE;
529 gboolean we_lose = FALSE;
530 const char *reason = NULL;
531 bool we_are_owner = FALSE;
532 pcmk__node_status_t *our_node = NULL;
533 pcmk__node_status_t *your_node = NULL;
534 time_t tm_now = time(NULL);
535 struct vote vote;
536
537 CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL)
538 && (message != NULL) && (cluster->priv->node_name != NULL),
539 return election_error);
540
541 if (!parse_election_message(message, &vote)) {
542 return election_error;
543 }
544
545 your_node = pcmk__get_node(0, vote.from, NULL,
546 pcmk__node_search_cluster_member);
547 our_node = pcmk__get_node(0, cluster->priv->node_name, NULL,
548 pcmk__node_search_cluster_member);
549 we_are_owner = (our_node != NULL)
550 && pcmk__str_eq(pcmk__cluster_get_xml_id(our_node),
551 vote.election_owner, pcmk__str_none);
552
553 if (!can_win) {
554 reason = "Not eligible";
555 we_lose = TRUE;
556
557 } else if (!pcmk__cluster_is_node_active(our_node)) {
558 reason = "We are not part of the cluster";
559 log_level = LOG_ERR;
560 we_lose = TRUE;
561
562 } else if (we_are_owner
563 && (vote.election_id != cluster->priv->election->count)) {
564 log_level = LOG_TRACE;
565 reason = "Superseded";
566 done = TRUE;
567
568 } else if (!pcmk__cluster_is_node_active(your_node)) {
569
570 reason = "Peer is not part of our cluster";
571 log_level = LOG_WARNING;
572 done = TRUE;
573
574 } else if (pcmk__str_eq(vote.op, CRM_OP_NOVOTE, pcmk__str_none)
575 || pcmk__str_eq(vote.from, cluster->priv->node_name,
576 pcmk__str_casei)) {
577
578
579
580 if (!we_are_owner) {
581 crm_warn("Cannot count election round %d %s from %s "
582 "because we did not start election (node ID %s did)",
583 vote.election_id, vote.op, vote.from,
584 vote.election_owner);
585 return election_error;
586 }
587 if (cluster->priv->election->state != election_in_progress) {
588
589 crm_debug("Not counting election round %d %s from %s "
590 "because no election in progress",
591 vote.election_id, vote.op, vote.from);
592 return cluster->priv->election->state;
593 }
594 record_vote(cluster, &vote);
595 reason = "Recorded";
596 done = TRUE;
597
598 } else {
599
600 int age_result = compare_age(vote.age);
601 int version_result = compare_version(vote.version, CRM_FEATURE_SET);
602
603 if (version_result < 0) {
604 reason = "Version";
605 we_lose = TRUE;
606
607 } else if (version_result > 0) {
608 reason = "Version";
609
610 } else if (age_result < 0) {
611 reason = "Uptime";
612 we_lose = TRUE;
613
614 } else if (age_result > 0) {
615 reason = "Uptime";
616
617 } else if (strcasecmp(cluster->priv->node_name, vote.from) > 0) {
618 reason = "Host name";
619 we_lose = TRUE;
620
621 } else {
622 reason = "Host name";
623 }
624 }
625
626 if (cluster->priv->election->expires < tm_now) {
627 cluster->priv->election->election_wins = 0;
628 cluster->priv->election->expires = tm_now + STORM_INTERVAL;
629
630 } else if (done == FALSE && we_lose == FALSE) {
631 int peers = 1 + g_hash_table_size(pcmk__peer_cache);
632
633
634
635
636 cluster->priv->election->election_wins++;
637 if (cluster->priv->election->election_wins > (peers * peers)) {
638 crm_warn("Election storm detected: %d wins in %d seconds",
639 cluster->priv->election->election_wins, STORM_INTERVAL);
640 cluster->priv->election->election_wins = 0;
641 cluster->priv->election->expires = tm_now + STORM_INTERVAL;
642 if (!(cluster->priv->election->wrote_blackbox)) {
643
644
645
646
647
648
649
650
651
652
653 crm_write_blackbox(0, NULL);
654 cluster->priv->election->wrote_blackbox = true;
655 }
656 }
657 }
658
659 if (done) {
660 do_crm_log(log_level + 1,
661 "Processed election round %u %s (current round %d) "
662 "from %s (%s)",
663 vote.election_id, vote.op, cluster->priv->election->count,
664 vote.from, reason);
665 return cluster->priv->election->state;
666
667 } else if (we_lose == FALSE) {
668
669
670
671
672
673
674
675
676
677
678
679
680
681 if ((cluster->priv->election->last_election_loss == 0)
682 || ((tm_now - cluster->priv->election->last_election_loss)
683 > (time_t) LOSS_DAMPEN)) {
684
685 do_crm_log(log_level,
686 "Election round %d (started by node ID %s) pass: "
687 "%s from %s (%s)",
688 vote.election_id, vote.election_owner, vote.op,
689 vote.from, reason);
690
691 cluster->priv->election->last_election_loss = 0;
692 election_timeout_stop(cluster);
693
694
695 cluster->priv->election->state = election_start;
696 return cluster->priv->election->state;
697 } else {
698 char *loss_time = NULL;
699
700 loss_time = ctime(&(cluster->priv->election->last_election_loss));
701 if (loss_time) {
702
703 loss_time += 11;
704 loss_time[8] = '\0';
705 }
706 crm_info("Ignoring election round %d (started by node ID %s) pass "
707 "vs %s because we lost less than %ds ago at %s",
708 vote.election_id, vote.election_owner, vote.from,
709 LOSS_DAMPEN, (loss_time? loss_time : "unknown"));
710 }
711 }
712
713 cluster->priv->election->last_election_loss = tm_now;
714
715 do_crm_log(log_level,
716 "Election round %d (started by node ID %s) lost: "
717 "%s from %s (%s)",
718 vote.election_id, vote.election_owner, vote.op,
719 vote.from, reason);
720
721 election_reset(cluster);
722 send_no_vote(cluster, your_node, &vote);
723 cluster->priv->election->state = election_lost;
724 return cluster->priv->election->state;
725 }
726
727
728
729
730
731
732
733 void
734 election_clear_dampening(pcmk_cluster_t *cluster)
735 {
736 if ((cluster != NULL) && (cluster->priv->election != NULL)) {
737 cluster->priv->election->last_election_loss = 0;
738 }
739 }