This source file includes the following definitions.
- election_complete
- election_timer_cb
- election_state
- election_init
- election_remove
- election_reset
- election_fini
- election_timeout_start
- election_timeout_stop
- election_timeout_set_period
- get_uptime
- compare_age
- election_vote
- election_check
- parse_election_message
- record_vote
- send_no_vote
- election_count_vote
- election_clear_dampening
1
2
3
4
5
6
7
8
9
10 #include <crm_internal.h>
11
12 #include <sys/time.h>
13 #include <sys/resource.h>
14
15 #include <crm/crm.h>
16 #include <crm/common/mainloop.h>
17 #include <crm/common/xml.h>
18
19 #include <crm/cluster/internal.h>
20 #include <crm/cluster/election_internal.h>
21 #include "crmcluster_private.h"
22
/* Length in seconds of the window used both for caching the local "uptime"
 * reading and for counting wins when detecting an election storm
 */
#define STORM_INTERVAL 2

/* Per-cluster election bookkeeping (stored in cluster->priv->election) */
struct pcmk__election {
    enum election_result state;     // Current state of the election
    guint count;                    // ID of the latest election round started locally
    void (*cb)(pcmk_cluster_t *);   // Called (if not NULL) when the local node wins
    GHashTable *voted;              // Sender name -> op of each recorded (no-)vote; NULL until first vote
    mainloop_timer_t *timeout;      // When it fires, the local node is declared the winner
    int election_wins;              // Wins counted in the current storm-detection window
    bool wrote_blackbox;            // Whether a blackbox was already written for a detected storm
    time_t expires;                 // End of the current storm-detection window
    time_t last_election_loss;      // When the local node last lost (0 if none or cleared)
};
36
37 static void
38 election_complete(pcmk_cluster_t *cluster)
39 {
40 pcmk__assert((cluster != NULL) && (cluster->priv->election != NULL));
41 cluster->priv->election->state = election_won;
42 if (cluster->priv->election->cb != NULL) {
43 cluster->priv->election->cb(cluster);
44 }
45 election_reset(cluster);
46 }
47
48 static gboolean
49 election_timer_cb(gpointer user_data)
50 {
51 pcmk_cluster_t *cluster = user_data;
52
53 crm_info("Declaring local node as winner after election timed out");
54 election_complete(cluster);
55 return FALSE;
56 }
57
58
59
60
61
62
63
64
65
66 enum election_result
67 election_state(const pcmk_cluster_t *cluster)
68 {
69 if ((cluster == NULL) || (cluster->priv->election == NULL)) {
70 return election_error;
71 }
72 return cluster->priv->election->state;
73 }
74
75
76
77
78
79 #define ELECTION_TIMEOUT_MS 120000
80
81
82
83
84
85
86
87
88
89
90
91 void
92 election_init(pcmk_cluster_t *cluster, void (*cb)(pcmk_cluster_t *))
93 {
94 const char *name = pcmk__s(crm_system_name, "election");
95
96 CRM_CHECK(cluster->priv->election == NULL, return);
97
98 cluster->priv->election = pcmk__assert_alloc(1, sizeof(pcmk__election_t));
99 cluster->priv->election->cb = cb;
100 cluster->priv->election->timeout = mainloop_timer_add(name,
101 ELECTION_TIMEOUT_MS,
102 FALSE,
103 election_timer_cb,
104 cluster);
105 }
106
107
108
109
110
111
112
113
114
115
116
117 void
118 election_remove(pcmk_cluster_t *cluster, const char *uname)
119 {
120 if ((cluster != NULL) && (cluster->priv->election != NULL)
121 && (uname != NULL) && (cluster->priv->election->voted != NULL)) {
122 crm_trace("Discarding (no-)vote from lost peer %s", uname);
123 g_hash_table_remove(cluster->priv->election->voted, uname);
124 }
125 }
126
127
128
129
130
131
132
133 void
134 election_reset(pcmk_cluster_t *cluster)
135 {
136 if ((cluster != NULL) && (cluster->priv->election != NULL)) {
137 crm_trace("Resetting election");
138 mainloop_timer_stop(cluster->priv->election->timeout);
139 if (cluster->priv->election->voted != NULL) {
140 g_hash_table_destroy(cluster->priv->election->voted);
141 cluster->priv->election->voted = NULL;
142 }
143 }
144 }
145
146
147
148
149
150
151
152
153
154
155 void
156 election_fini(pcmk_cluster_t *cluster)
157 {
158 if ((cluster != NULL) && (cluster->priv->election != NULL)) {
159 election_reset(cluster);
160 crm_trace("Destroying election");
161 mainloop_timer_del(cluster->priv->election->timeout);
162 free(cluster->priv->election);
163 cluster->priv->election = NULL;
164 }
165 }
166
167 static void
168 election_timeout_start(pcmk_cluster_t *cluster)
169 {
170 mainloop_timer_start(cluster->priv->election->timeout);
171 }
172
173
174
175
176
177
178
179 void
180 election_timeout_stop(pcmk_cluster_t *cluster)
181 {
182 if ((cluster != NULL) && (cluster->priv->election != NULL)) {
183 mainloop_timer_stop(cluster->priv->election->timeout);
184 }
185 }
186
187
188
189
190
191
192
193
194 void
195 election_timeout_set_period(pcmk_cluster_t *cluster, guint period)
196 {
197 CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL), return);
198 mainloop_timer_set_period(cluster->priv->election->timeout, period);
199 }
200
201 static int
202 get_uptime(struct timeval *output)
203 {
204 static time_t expires = 0;
205 static struct rusage info;
206
207 time_t tm_now = time(NULL);
208
209 if (expires < tm_now) {
210 int rc = 0;
211
212 info.ru_utime.tv_sec = 0;
213 info.ru_utime.tv_usec = 0;
214 rc = getrusage(RUSAGE_SELF, &info);
215
216 output->tv_sec = 0;
217 output->tv_usec = 0;
218
219 if (rc < 0) {
220 crm_perror(LOG_ERR, "Could not calculate the current uptime");
221 expires = 0;
222 return -1;
223 }
224
225 crm_debug("Current CPU usage is: %lds, %ldus", (long)info.ru_utime.tv_sec,
226 (long)info.ru_utime.tv_usec);
227 }
228
229 expires = tm_now + STORM_INTERVAL;
230 output->tv_sec = info.ru_utime.tv_sec;
231 output->tv_usec = info.ru_utime.tv_usec;
232
233 return 1;
234 }
235
/*!
 * \internal
 * \brief Compare the local node's "uptime" against a peer's
 *
 * Seconds are compared first; microseconds only break a tie in seconds.
 *
 * \param[in] your_age  Peer's reported uptime
 *
 * \return 1 if the local value is larger (win), -1 if it is smaller (lose),
 *         or 0 if both are equal
 */
static int
compare_age(struct timeval your_age)
{
    struct timeval our_age;

    get_uptime(&our_age);

    if (our_age.tv_sec != your_age.tv_sec) {
        if (our_age.tv_sec > your_age.tv_sec) {
            crm_debug("Win: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
            return 1;
        }
        crm_debug("Lose: %ld vs %ld (seconds)", (long)our_age.tv_sec, (long)your_age.tv_sec);
        return -1;
    }

    if (our_age.tv_usec != your_age.tv_usec) {
        if (our_age.tv_usec > your_age.tv_usec) {
            crm_debug("Win: %ld.%06ld vs %ld.%06ld (usec)",
                      (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
            return 1;
        }
        crm_debug("Lose: %ld.%06ld vs %ld.%06ld (usec)",
                  (long)our_age.tv_sec, (long)our_age.tv_usec, (long)your_age.tv_sec, (long)your_age.tv_usec);
        return -1;
    }

    return 0;
}
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277 void
278 election_vote(pcmk_cluster_t *cluster)
279 {
280 struct timeval age;
281 xmlNode *vote = NULL;
282 pcmk__node_status_t *our_node = NULL;
283 const char *message_type = NULL;
284
285 CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL), return);
286
287 if (cluster->priv->node_name == NULL) {
288 crm_err("Cannot start an election: Local node name unknown");
289 return;
290 }
291
292 our_node = pcmk__get_node(0, cluster->priv->node_name, NULL,
293 pcmk__node_search_cluster_member);
294 if (!pcmk__cluster_is_node_active(our_node)) {
295 crm_trace("Cannot vote yet: local node not connected to cluster");
296 return;
297 }
298
299 election_reset(cluster);
300 cluster->priv->election->state = election_in_progress;
301 message_type = pcmk__server_message_type(cluster->priv->server);
302
303
304
305
306 vote = pcmk__new_request(cluster->priv->server, message_type,
307 NULL, message_type, CRM_OP_VOTE, NULL);
308
309 cluster->priv->election->count++;
310 crm_xml_add(vote, PCMK__XA_ELECTION_OWNER, our_node->xml_id);
311 crm_xml_add_int(vote, PCMK__XA_ELECTION_ID, cluster->priv->election->count);
312
313
314 get_uptime(&age);
315 crm_xml_add_timeval(vote, PCMK__XA_ELECTION_AGE_SEC,
316 PCMK__XA_ELECTION_AGE_NANO_SEC, &age);
317
318 pcmk__cluster_send_message(NULL, cluster->priv->server, vote);
319 pcmk__xml_free(vote);
320
321 crm_debug("Started election round %u", cluster->priv->election->count);
322 election_timeout_start(cluster);
323 return;
324 }
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342 bool
343 election_check(pcmk_cluster_t *cluster)
344 {
345 int voted_size = 0;
346 int num_members = 0;
347
348 CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL),
349 return false);
350
351 if (cluster->priv->election->voted == NULL) {
352 crm_trace("Election check requested, but no votes received yet");
353 return FALSE;
354 }
355
356 voted_size = g_hash_table_size(cluster->priv->election->voted);
357 num_members = pcmk__cluster_num_active_nodes();
358
359
360
361
362
363 if (voted_size >= num_members) {
364
365 election_timeout_stop(cluster);
366 if (voted_size > num_members) {
367 GHashTableIter gIter;
368 const pcmk__node_status_t *node = NULL;
369 char *key = NULL;
370
371 crm_warn("Received too many votes in election");
372 g_hash_table_iter_init(&gIter, pcmk__peer_cache);
373 while (g_hash_table_iter_next(&gIter, NULL, (gpointer *) & node)) {
374 if (pcmk__cluster_is_node_active(node)) {
375 crm_warn("* expected vote: %s", node->name);
376 }
377 }
378
379 g_hash_table_iter_init(&gIter, cluster->priv->election->voted);
380 while (g_hash_table_iter_next(&gIter, (gpointer *) & key, NULL)) {
381 crm_warn("* actual vote: %s", key);
382 }
383
384 }
385
386 crm_info("Election won by local node");
387 election_complete(cluster);
388 return TRUE;
389
390 } else {
391 crm_debug("Election still waiting on %d of %d vote%s",
392 num_members - voted_size, num_members,
393 pcmk__plural_s(num_members));
394 }
395
396 return FALSE;
397 }
398
/* Number of seconds after losing an election during which a would-be "pass"
 * against another vote is ignored (see election_count_vote())
 */
#define LOSS_DAMPEN 2

/* Fields extracted from an election message by parse_election_message() */
struct vote {
    const char *op;             // Value of PCMK__XA_CRM_TASK (vote or no-vote)
    const char *from;           // Value of PCMK__XA_SRC (sender of the message)
    const char *version;        // Value of PCMK_XA_VERSION
    const char *election_owner; // Value of PCMK__XA_ELECTION_OWNER
    int election_id;            // Value of PCMK__XA_ELECTION_ID (-1 if unset)
    struct timeval age;         // Sender's uptime; only parsed for votes (fields -1 if unset)
};
409
410
411
412
413
414
415
416
417
418
419
420
421 static bool
422 parse_election_message(const xmlNode *message, struct vote *vote)
423 {
424 CRM_CHECK(message && vote, return FALSE);
425
426 vote->election_id = -1;
427 vote->age.tv_sec = -1;
428 vote->age.tv_usec = -1;
429
430 vote->op = crm_element_value(message, PCMK__XA_CRM_TASK);
431 vote->from = crm_element_value(message, PCMK__XA_SRC);
432 vote->version = crm_element_value(message, PCMK_XA_VERSION);
433 vote->election_owner = crm_element_value(message, PCMK__XA_ELECTION_OWNER);
434
435 crm_element_value_int(message, PCMK__XA_ELECTION_ID, &(vote->election_id));
436
437 if ((vote->op == NULL) || (vote->from == NULL) || (vote->version == NULL)
438 || (vote->election_owner == NULL) || (vote->election_id < 0)) {
439
440 crm_warn("Invalid %s message from %s",
441 pcmk__s(vote->op, "election"),
442 pcmk__s(vote->from, "unspecified node"));
443 crm_log_xml_trace(message, "bad-vote");
444 return FALSE;
445 }
446
447
448
449 if (pcmk__str_eq(vote->op, CRM_OP_VOTE, pcmk__str_none)) {
450
451
452
453 crm_element_value_timeval(message, PCMK__XA_ELECTION_AGE_SEC,
454 PCMK__XA_ELECTION_AGE_NANO_SEC, &(vote->age));
455 if ((vote->age.tv_sec < 0) || (vote->age.tv_usec < 0)) {
456 crm_warn("Cannot count election %s from %s "
457 "because it is missing uptime", vote->op, vote->from);
458 return FALSE;
459 }
460
461 } else if (!pcmk__str_eq(vote->op, CRM_OP_NOVOTE, pcmk__str_none)) {
462 crm_info("Cannot process election message from %s "
463 "because %s is not a known election op", vote->from, vote->op);
464 return FALSE;
465 }
466
467
468
469
470 if (pcmk__peer_cache == NULL) {
471 crm_info("Cannot count election %s from %s "
472 "because no peer information available", vote->op, vote->from);
473 return FALSE;
474 }
475 return TRUE;
476 }
477
478 static void
479 record_vote(pcmk_cluster_t *cluster, struct vote *vote)
480 {
481 pcmk__assert((vote->from != NULL) && (vote->op != NULL));
482
483 if (cluster->priv->election->voted == NULL) {
484 cluster->priv->election->voted = pcmk__strkey_table(free, free);
485 }
486 pcmk__insert_dup(cluster->priv->election->voted, vote->from, vote->op);
487 }
488
489 static void
490 send_no_vote(pcmk_cluster_t *cluster, pcmk__node_status_t *peer,
491 struct vote *vote)
492 {
493 const char *message_type = NULL;
494 xmlNode *novote = NULL;
495
496 message_type = pcmk__server_message_type(cluster->priv->server);
497 novote = pcmk__new_request(cluster->priv->server, message_type,
498 vote->from, message_type, CRM_OP_NOVOTE, NULL);
499 crm_xml_add(novote, PCMK__XA_ELECTION_OWNER, vote->election_owner);
500 crm_xml_add_int(novote, PCMK__XA_ELECTION_ID, vote->election_id);
501
502 pcmk__cluster_send_message(peer, cluster->priv->server, novote);
503 pcmk__xml_free(novote);
504 }
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522 enum election_result
523 election_count_vote(pcmk_cluster_t *cluster, const xmlNode *message,
524 bool can_win)
525 {
526 int log_level = LOG_INFO;
527 gboolean done = FALSE;
528 gboolean we_lose = FALSE;
529 const char *reason = "unknown";
530 bool we_are_owner = FALSE;
531 pcmk__node_status_t *our_node = NULL;
532 pcmk__node_status_t *your_node = NULL;
533 time_t tm_now = time(NULL);
534 struct vote vote;
535
536 CRM_CHECK((cluster != NULL) && (cluster->priv->election != NULL)
537 && (message != NULL) && (cluster->priv->node_name != NULL),
538 return election_error);
539
540 if (!parse_election_message(message, &vote)) {
541 return election_error;
542 }
543
544 your_node = pcmk__get_node(0, vote.from, NULL,
545 pcmk__node_search_cluster_member);
546 our_node = pcmk__get_node(0, cluster->priv->node_name, NULL,
547 pcmk__node_search_cluster_member);
548 we_are_owner = (our_node != NULL)
549 && pcmk__str_eq(our_node->xml_id, vote.election_owner,
550 pcmk__str_none);
551
552 if (!can_win) {
553 reason = "Not eligible";
554 we_lose = TRUE;
555
556 } else if (!pcmk__cluster_is_node_active(our_node)) {
557 reason = "We are not part of the cluster";
558 log_level = LOG_ERR;
559 we_lose = TRUE;
560
561 } else if (we_are_owner
562 && (vote.election_id != cluster->priv->election->count)) {
563 log_level = LOG_TRACE;
564 reason = "Superseded";
565 done = TRUE;
566
567 } else if (!pcmk__cluster_is_node_active(your_node)) {
568
569 reason = "Peer is not part of our cluster";
570 log_level = LOG_WARNING;
571 done = TRUE;
572
573 } else if (pcmk__str_eq(vote.op, CRM_OP_NOVOTE, pcmk__str_none)
574 || pcmk__str_eq(vote.from, cluster->priv->node_name,
575 pcmk__str_casei)) {
576
577
578
579 if (!we_are_owner) {
580 crm_warn("Cannot count election round %d %s from %s "
581 "because we did not start election (node ID %s did)",
582 vote.election_id, vote.op, vote.from,
583 vote.election_owner);
584 return election_error;
585 }
586 if (cluster->priv->election->state != election_in_progress) {
587
588 crm_debug("Not counting election round %d %s from %s "
589 "because no election in progress",
590 vote.election_id, vote.op, vote.from);
591 return cluster->priv->election->state;
592 }
593 record_vote(cluster, &vote);
594 reason = "Recorded";
595 done = TRUE;
596
597 } else {
598
599 int age_result = compare_age(vote.age);
600 int version_result = compare_version(vote.version, CRM_FEATURE_SET);
601
602 if (version_result < 0) {
603 reason = "Version";
604 we_lose = TRUE;
605
606 } else if (version_result > 0) {
607 reason = "Version";
608
609 } else if (age_result < 0) {
610 reason = "Uptime";
611 we_lose = TRUE;
612
613 } else if (age_result > 0) {
614 reason = "Uptime";
615
616 } else if (strcasecmp(cluster->priv->node_name, vote.from) > 0) {
617 reason = "Host name";
618 we_lose = TRUE;
619
620 } else {
621 reason = "Host name";
622 }
623 }
624
625 if (cluster->priv->election->expires < tm_now) {
626 cluster->priv->election->election_wins = 0;
627 cluster->priv->election->expires = tm_now + STORM_INTERVAL;
628
629 } else if (done == FALSE && we_lose == FALSE) {
630 int peers = 1 + g_hash_table_size(pcmk__peer_cache);
631
632
633
634
635 cluster->priv->election->election_wins++;
636 if (cluster->priv->election->election_wins > (peers * peers)) {
637 crm_warn("Election storm detected: %d wins in %d seconds",
638 cluster->priv->election->election_wins, STORM_INTERVAL);
639 cluster->priv->election->election_wins = 0;
640 cluster->priv->election->expires = tm_now + STORM_INTERVAL;
641 if (!(cluster->priv->election->wrote_blackbox)) {
642
643
644
645
646
647
648
649
650
651
652 crm_write_blackbox(0, NULL);
653 cluster->priv->election->wrote_blackbox = true;
654 }
655 }
656 }
657
658 if (done) {
659 do_crm_log(log_level + 1,
660 "Processed election round %u %s (current round %d) "
661 "from %s (%s)",
662 vote.election_id, vote.op, cluster->priv->election->count,
663 vote.from, reason);
664 return cluster->priv->election->state;
665
666 } else if (we_lose == FALSE) {
667
668
669
670
671
672
673
674
675
676
677
678
679
680 if ((cluster->priv->election->last_election_loss == 0)
681 || ((tm_now - cluster->priv->election->last_election_loss)
682 > (time_t) LOSS_DAMPEN)) {
683
684 do_crm_log(log_level,
685 "Election round %d (started by node ID %s) pass: "
686 "%s from %s (%s)",
687 vote.election_id, vote.election_owner, vote.op,
688 vote.from, reason);
689
690 cluster->priv->election->last_election_loss = 0;
691 election_timeout_stop(cluster);
692
693
694 cluster->priv->election->state = election_start;
695 return cluster->priv->election->state;
696 } else {
697 char *loss_time = NULL;
698
699 loss_time = ctime(&(cluster->priv->election->last_election_loss));
700 if (loss_time) {
701
702 loss_time += 11;
703 loss_time[8] = '\0';
704 }
705 crm_info("Ignoring election round %d (started by node ID %s) pass "
706 "vs %s because we lost less than %ds ago at %s",
707 vote.election_id, vote.election_owner, vote.from,
708 LOSS_DAMPEN, (loss_time? loss_time : "unknown"));
709 }
710 }
711
712 cluster->priv->election->last_election_loss = tm_now;
713
714 do_crm_log(log_level,
715 "Election round %d (started by node ID %s) lost: "
716 "%s from %s (%s)",
717 vote.election_id, vote.election_owner, vote.op,
718 vote.from, reason);
719
720 election_reset(cluster);
721 send_no_vote(cluster, your_node, &vote);
722 cluster->priv->election->state = election_lost;
723 return cluster->priv->election->state;
724 }
725
726
727
728
729
730
731
732 void
733 election_clear_dampening(pcmk_cluster_t *cluster)
734 {
735 if ((cluster != NULL) && (cluster->priv->election != NULL)) {
736 cluster->priv->election->last_election_loss = 0;
737 }
738 }