This source file includes the following definitions.
- corosync_node_name
- terminate_cs_connection
- pcmk_quorum_dispatch
- pcmk_quorum_notification
- cluster_connect_quorum
- init_cs_connection
- init_cs_connection_once
- check_message_sanity
- find_corosync_variant
- crm_is_corosync_peer_active
- corosync_initialize_nodelist
- corosync_cluster_name
- corosync_cmap_has_config
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 #include <crm_internal.h>
20 #include <bzlib.h>
21 #include <sys/socket.h>
22 #include <netinet/in.h>
23 #include <arpa/inet.h>
24 #include <netdb.h>
25
26 #include <crm/common/ipc.h>
27 #include <crm/cluster/internal.h>
28 #include <crm/common/mainloop.h>
29 #include <sys/utsname.h>
30
31 #include <qb/qbipcc.h>
32 #include <qb/qbutil.h>
33
34 #include <corosync/corodefs.h>
35 #include <corosync/corotypes.h>
36 #include <corosync/hdb.h>
37 #include <corosync/cfg.h>
38 #include <corosync/cmap.h>
39 #include <corosync/quorum.h>
40
41 #include <crm/msg_xml.h>
42
/* Handle for the Quorum API connection. Filled in by quorum_initialize() in
 * cluster_connect_quorum(); a value of 0 is treated as "no connection" by
 * terminate_cs_connection().
 */
43 quorum_handle_t pcmk_quorum_handle = 0;
44
/* Application callback installed by cluster_connect_quorum() and invoked at
 * the end of pcmk_quorum_notification() with the ring ID and quorum state.
 * NULL until a callback has been installed.
 */
45 gboolean(*quorum_app_callback) (unsigned long long seq, gboolean quorate) = NULL;
46
47
48
49
50
51 char *
52 corosync_node_name(uint64_t cmap_handle, uint32_t nodeid)
53 {
54 int lpc = 0;
55 int rc = CS_OK;
56 int retries = 0;
57 char *name = NULL;
58 cmap_handle_t local_handle = 0;
59
60
61 if (nodeid == 0) {
62 nodeid = get_local_nodeid(0);
63 }
64
65 if (cmap_handle == 0 && local_handle == 0) {
66 retries = 0;
67 crm_trace("Initializing CMAP connection");
68 do {
69 rc = cmap_initialize(&local_handle);
70 if (rc != CS_OK) {
71 retries++;
72 crm_debug("API connection setup failed: %s. Retrying in %ds", cs_strerror(rc),
73 retries);
74 sleep(retries);
75 }
76
77 } while (retries < 5 && rc != CS_OK);
78
79 if (rc != CS_OK) {
80 crm_warn("Could not connect to Cluster Configuration Database API, error %s",
81 cs_strerror(rc));
82 local_handle = 0;
83 }
84 }
85
86 if (cmap_handle == 0) {
87 cmap_handle = local_handle;
88 }
89
90 while (name == NULL && cmap_handle != 0) {
91 uint32_t id = 0;
92 char *key = NULL;
93
94 key = crm_strdup_printf("nodelist.node.%d.nodeid", lpc);
95 rc = cmap_get_uint32(cmap_handle, key, &id);
96 crm_trace("Checking %u vs %u from %s", nodeid, id, key);
97 free(key);
98
99 if (rc != CS_OK) {
100 break;
101 }
102
103 if (nodeid == id) {
104 crm_trace("Searching for node name for %u in nodelist.node.%d %s", nodeid, lpc, name);
105 if (name == NULL) {
106 key = crm_strdup_printf("nodelist.node.%d.ring0_addr", lpc);
107 cmap_get_string(cmap_handle, key, &name);
108 crm_trace("%s = %s", key, name);
109
110 if (node_name_is_valid(key, name) == FALSE) {
111 free(name);
112 name = NULL;
113 }
114 free(key);
115 }
116
117 if (name == NULL) {
118 key = crm_strdup_printf("nodelist.node.%d.name", lpc);
119 cmap_get_string(cmap_handle, key, &name);
120 crm_trace("%s = %s %d", key, name, rc);
121 free(key);
122 }
123 break;
124 }
125
126 lpc++;
127 }
128
129 if(local_handle) {
130 cmap_finalize(local_handle);
131 }
132
133 if (name == NULL) {
134 crm_info("Unable to get node name for nodeid %u", nodeid);
135 }
136 return name;
137 }
138
139 void
140 terminate_cs_connection(crm_cluster_t *cluster)
141 {
142 crm_info("Disconnecting from Corosync");
143
144 cluster_disconnect_cpg(cluster);
145
146 if (pcmk_quorum_handle) {
147 crm_trace("Disconnecting quorum");
148 quorum_finalize(pcmk_quorum_handle);
149 pcmk_quorum_handle = 0;
150
151 } else {
152 crm_info("No Quorum connection");
153 }
154
155 crm_notice("Disconnected from Corosync");
156 }
157
/* NOTE(review): neither of these two globals is referenced by any function
 * visible in this file; they are presumably used (or were used) by other
 * translation units -- confirm before removing or changing them.
 */
158 int ais_membership_timer = 0;
159 gboolean ais_membership_force = FALSE;
160
161
162 static int
163 pcmk_quorum_dispatch(gpointer user_data)
164 {
165 int rc = 0;
166
167 rc = quorum_dispatch(pcmk_quorum_handle, CS_DISPATCH_ALL);
168 if (rc < 0) {
169 crm_err("Connection to the Quorum API failed: %d", rc);
170 pcmk_quorum_handle = 0;
171 return -1;
172 }
173 return 0;
174 }
175
176 static void
177 pcmk_quorum_notification(quorum_handle_t handle,
178 uint32_t quorate,
179 uint64_t ring_id, uint32_t view_list_entries, uint32_t * view_list)
180 {
181 int i;
182 GHashTableIter iter;
183 crm_node_t *node = NULL;
184 static gboolean init_phase = TRUE;
185
186 if (quorate != crm_have_quorum) {
187 if (quorate) {
188 crm_notice("Quorum acquired " CRM_XS " membership=" U64T " members=%lu",
189 ring_id, (long unsigned int)view_list_entries);
190 } else {
191 crm_warn("Quorum lost " CRM_XS " membership=" U64T " members=%lu",
192 ring_id, (long unsigned int)view_list_entries);
193 }
194 crm_have_quorum = quorate;
195
196 } else {
197 crm_info("Quorum %s " CRM_XS " membership=" U64T " members=%lu",
198 (quorate? "retained" : "still lost"), ring_id,
199 (long unsigned int)view_list_entries);
200 }
201
202 if (view_list_entries == 0 && init_phase) {
203 crm_info("Corosync membership is still forming, ignoring");
204 return;
205 }
206
207 init_phase = FALSE;
208
209
210
211 g_hash_table_iter_init(&iter, crm_peer_cache);
212 while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
213 node->last_seen = 0;
214 }
215
216
217 for (i = 0; i < view_list_entries; i++) {
218 uint32_t id = view_list[i];
219
220 crm_debug("Member[%d] %u ", i, id);
221
222
223 node = crm_get_peer(id, NULL);
224 if (node->uname == NULL) {
225 char *name = corosync_node_name(0, id);
226
227 crm_info("Obtaining name for new node %u", id);
228 node = crm_get_peer(id, name);
229 free(name);
230 }
231
232
233 crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, ring_id);
234 }
235
236
237 crm_reap_unseen_nodes(ring_id);
238
239 if (quorum_app_callback) {
240 quorum_app_callback(ring_id, quorate);
241 }
242 }
243
/* Callback table passed to quorum_initialize() in cluster_connect_quorum();
 * quorum notifications are delivered to pcmk_quorum_notification().
 */
244 quorum_callbacks_t quorum_callbacks = {
245 .quorum_notify_fn = pcmk_quorum_notification,
246 };
247
248 gboolean
249 cluster_connect_quorum(gboolean(*dispatch) (unsigned long long, gboolean),
250 void (*destroy) (gpointer))
251 {
252 int rc = -1;
253 int fd = 0;
254 int quorate = 0;
255 uint32_t quorum_type = 0;
256 struct mainloop_fd_callbacks quorum_fd_callbacks;
257
258 quorum_fd_callbacks.dispatch = pcmk_quorum_dispatch;
259 quorum_fd_callbacks.destroy = destroy;
260
261 crm_debug("Configuring Pacemaker to obtain quorum from Corosync");
262
263 rc = quorum_initialize(&pcmk_quorum_handle, &quorum_callbacks, &quorum_type);
264 if (rc != CS_OK) {
265 crm_err("Could not connect to the Quorum API: %d", rc);
266 goto bail;
267
268 } else if (quorum_type != QUORUM_SET) {
269 crm_err("Corosync quorum is not configured");
270 goto bail;
271 }
272
273 rc = quorum_getquorate(pcmk_quorum_handle, &quorate);
274 if (rc != CS_OK) {
275 crm_err("Could not obtain the current Quorum API state: %d", rc);
276 goto bail;
277 }
278
279 if (quorate) {
280 crm_notice("Quorum acquired");
281 } else {
282 crm_warn("Quorum lost");
283 }
284 quorum_app_callback = dispatch;
285 crm_have_quorum = quorate;
286
287 rc = quorum_trackstart(pcmk_quorum_handle, CS_TRACK_CHANGES | CS_TRACK_CURRENT);
288 if (rc != CS_OK) {
289 crm_err("Could not setup Quorum API notifications: %d", rc);
290 goto bail;
291 }
292
293 rc = quorum_fd_get(pcmk_quorum_handle, &fd);
294 if (rc != CS_OK) {
295 crm_err("Could not obtain the Quorum API connection: %d", rc);
296 goto bail;
297 }
298
299 mainloop_add_fd("quorum", G_PRIORITY_HIGH, fd, dispatch, &quorum_fd_callbacks);
300
301 corosync_initialize_nodelist(NULL, FALSE, NULL);
302
303 bail:
304 if (rc != CS_OK) {
305 quorum_finalize(pcmk_quorum_handle);
306 return FALSE;
307 }
308 return TRUE;
309 }
310
311 gboolean
312 init_cs_connection(crm_cluster_t * cluster)
313 {
314 int retries = 0;
315
316 while (retries < 5) {
317 int rc = init_cs_connection_once(cluster);
318
319 retries++;
320
321 switch (rc) {
322 case CS_OK:
323 return TRUE;
324 break;
325 case CS_ERR_TRY_AGAIN:
326 case CS_ERR_QUEUE_FULL:
327 sleep(retries);
328 break;
329 default:
330 return FALSE;
331 }
332 }
333
334 crm_err("Could not connect to corosync after %d retries", retries);
335 return FALSE;
336 }
337
338 gboolean
339 init_cs_connection_once(crm_cluster_t * cluster)
340 {
341 crm_node_t *peer = NULL;
342 enum cluster_type_e stack = get_cluster_type();
343
344 crm_peer_init();
345
346
347 if (stack != pcmk_cluster_corosync) {
348 crm_err("Invalid cluster type: %s (%d)", name_for_cluster_type(stack), stack);
349 return FALSE;
350 }
351
352 if (cluster_connect_cpg(cluster) == FALSE) {
353 return FALSE;
354 }
355 crm_info("Connection to '%s': established", name_for_cluster_type(stack));
356
357 cluster->nodeid = get_local_nodeid(0);
358 if(cluster->nodeid == 0) {
359 crm_err("Could not establish local nodeid");
360 return FALSE;
361 }
362
363 cluster->uname = get_node_name(0);
364 if(cluster->uname == NULL) {
365 crm_err("Could not establish local node name");
366 return FALSE;
367 }
368
369
370 peer = crm_get_peer(cluster->nodeid, cluster->uname);
371 cluster->uuid = get_corosync_uuid(peer);
372
373 return TRUE;
374 }
375
376 gboolean
377 check_message_sanity(const AIS_Message * msg, const char *data)
378 {
379 gboolean sane = TRUE;
380 int dest = msg->host.type;
381 int tmp_size = msg->header.size - sizeof(AIS_Message);
382
383 if (sane && msg->header.size == 0) {
384 crm_warn("Message with no size");
385 sane = FALSE;
386 }
387
388 if (sane && msg->header.error != CS_OK) {
389 crm_warn("Message header contains an error: %d", msg->header.error);
390 sane = FALSE;
391 }
392
393 if (sane && ais_data_len(msg) != tmp_size) {
394 crm_warn("Message payload size is incorrect: expected %d, got %d", ais_data_len(msg),
395 tmp_size);
396 sane = TRUE;
397 }
398
399 if (sane && ais_data_len(msg) == 0) {
400 crm_warn("Message with no payload");
401 sane = FALSE;
402 }
403
404 if (sane && data && msg->is_compressed == FALSE) {
405 int str_size = strlen(data) + 1;
406
407 if (ais_data_len(msg) != str_size) {
408 int lpc = 0;
409
410 crm_warn("Message payload is corrupted: expected %d bytes, got %d",
411 ais_data_len(msg), str_size);
412 sane = FALSE;
413 for (lpc = (str_size - 10); lpc < msg->size; lpc++) {
414 if (lpc < 0) {
415 lpc = 0;
416 }
417 crm_debug("bad_data[%d]: %d / '%c'", lpc, data[lpc], data[lpc]);
418 }
419 }
420 }
421
422 if (sane == FALSE) {
423 crm_err("Invalid message %d: (dest=%s:%s, from=%s:%s.%u, compressed=%d, size=%d, total=%d)",
424 msg->id, ais_dest(&(msg->host)), msg_type2text(dest),
425 ais_dest(&(msg->sender)), msg_type2text(msg->sender.type),
426 msg->sender.pid, msg->is_compressed, ais_data_len(msg), msg->header.size);
427
428 } else {
429 crm_trace
430 ("Verified message %d: (dest=%s:%s, from=%s:%s.%u, compressed=%d, size=%d, total=%d)",
431 msg->id, ais_dest(&(msg->host)), msg_type2text(dest), ais_dest(&(msg->sender)),
432 msg_type2text(msg->sender.type), msg->sender.pid, msg->is_compressed,
433 ais_data_len(msg), msg->header.size);
434 }
435
436 return sane;
437 }
438
/*
 * Detect whether a corosync cluster is available by probing the CMAP API.
 *
 * Returns pcmk_cluster_corosync if cmap_initialize() succeeds or is refused
 * with CS_ERR_SECURITY, and pcmk_cluster_unknown for any other failure.
 */
enum cluster_type_e
find_corosync_variant(void)
{
    cmap_handle_t handle;
    int rc = cmap_initialize(&handle);

    if (rc == CS_OK) {
        /* The probe connection is not needed any further */
        cmap_finalize(handle);
        return pcmk_cluster_corosync;
    }

    if (rc == CS_ERR_SECURITY) {
        /* Permission denied is still reported as corosync; presumably a
         * refusal means the daemon is present but this user may not connect.
         */
        crm_debug("Failed to initialize the cmap API: Permission denied (%d)", rc);
        return pcmk_cluster_corosync;
    }

    crm_info("Failed to initialize the cmap API: %s (%d)",
             ais_error2text(rc), rc);
    return pcmk_cluster_unknown;
}
466
467 gboolean
468 crm_is_corosync_peer_active(const crm_node_t * node)
469 {
470 if (node == NULL) {
471 crm_trace("NULL");
472 return FALSE;
473
474 } else if (safe_str_neq(node->state, CRM_NODE_MEMBER)) {
475 crm_trace("%s: state=%s", node->uname, node->state);
476 return FALSE;
477
478 } else if ((node->processes & crm_proc_cpg) == 0) {
479 crm_trace("%s: processes=%.16x", node->uname, node->processes);
480 return FALSE;
481 }
482 return TRUE;
483 }
484
485 gboolean
486 corosync_initialize_nodelist(void *cluster, gboolean force_member, xmlNode * xml_parent)
487 {
488 int lpc = 0;
489 int rc = CS_OK;
490 int retries = 0;
491 gboolean any = FALSE;
492 cmap_handle_t cmap_handle;
493
494 do {
495 rc = cmap_initialize(&cmap_handle);
496 if (rc != CS_OK) {
497 retries++;
498 crm_debug("API connection setup failed: %s. Retrying in %ds", cs_strerror(rc),
499 retries);
500 sleep(retries);
501 }
502
503 } while (retries < 5 && rc != CS_OK);
504
505 if (rc != CS_OK) {
506 crm_warn("Could not connect to Cluster Configuration Database API, error %d", rc);
507 return FALSE;
508 }
509
510 crm_peer_init();
511 crm_trace("Initializing corosync nodelist");
512 for (lpc = 0; TRUE; lpc++) {
513 uint32_t nodeid = 0;
514 char *name = NULL;
515 char *key = NULL;
516
517 key = crm_strdup_printf("nodelist.node.%d.nodeid", lpc);
518 rc = cmap_get_uint32(cmap_handle, key, &nodeid);
519 free(key);
520
521 if (rc != CS_OK) {
522 break;
523 }
524
525 name = corosync_node_name(cmap_handle, nodeid);
526 if (name != NULL) {
527 GHashTableIter iter;
528 crm_node_t *node = NULL;
529
530 g_hash_table_iter_init(&iter, crm_peer_cache);
531 while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &node)) {
532 if(node && node->uname && strcasecmp(node->uname, name) == 0) {
533 if (node->id && node->id != nodeid) {
534 crm_crit("Nodes %u and %u share the same name '%s': shutting down", node->id,
535 nodeid, name);
536 crm_exit(DAEMON_RESPAWN_STOP);
537 }
538 }
539 }
540 }
541
542 if (nodeid > 0 || name != NULL) {
543 crm_trace("Initializing node[%d] %u = %s", lpc, nodeid, name);
544 crm_get_peer(nodeid, name);
545 }
546
547 if (nodeid > 0 && name != NULL) {
548 any = TRUE;
549
550 if (xml_parent) {
551 xmlNode *node = create_xml_node(xml_parent, XML_CIB_TAG_NODE);
552
553 crm_xml_set_id(node, "%u", nodeid);
554 crm_xml_add(node, XML_ATTR_UNAME, name);
555 if (force_member) {
556 crm_xml_add(node, XML_ATTR_TYPE, CRM_NODE_MEMBER);
557 }
558 }
559 }
560
561 free(name);
562 }
563 cmap_finalize(cmap_handle);
564 return any;
565 }
566
567 char *
568 corosync_cluster_name(void)
569 {
570 cmap_handle_t handle;
571 char *cluster_name = NULL;
572 int rc = CS_OK;
573
574 rc = cmap_initialize(&handle);
575 if (rc != CS_OK) {
576 crm_info("Failed to initialize the cmap API: %s (%d)", ais_error2text(rc), rc);
577 return NULL;
578 }
579
580 rc = cmap_get_string(handle, "totem.cluster_name", &cluster_name);
581 if (rc != CS_OK) {
582 crm_info("Cannot get totem.cluster_name: %s (%d)", ais_error2text(rc), rc);
583
584 } else {
585 crm_debug("cmap totem.cluster_name = '%s'", cluster_name);
586 }
587
588 cmap_finalize(handle);
589
590 return cluster_name;
591 }
592
593 int
594 corosync_cmap_has_config(const char *prefix)
595 {
596 int rc = CS_OK;
597 int retries = 0;
598 static int found = -1;
599 cmap_handle_t cmap_handle;
600 cmap_iter_handle_t iter_handle;
601 char key_name[CMAP_KEYNAME_MAXLEN + 1];
602
603 if(found != -1) {
604 return found;
605 }
606
607 do {
608 rc = cmap_initialize(&cmap_handle);
609 if (rc != CS_OK) {
610 retries++;
611 crm_debug("API connection setup failed: %s. Retrying in %ds", cs_strerror(rc),
612 retries);
613 sleep(retries);
614 }
615
616 } while (retries < 5 && rc != CS_OK);
617
618 if (rc != CS_OK) {
619 crm_warn("Could not connect to Cluster Configuration Database API: %s (rc=%d)",
620 cs_strerror(rc), rc);
621 return -1;
622 }
623
624 rc = cmap_iter_init(cmap_handle, prefix, &iter_handle);
625 if (rc != CS_OK) {
626 crm_warn("Failed to initialize iteration for corosync cmap '%s': %s (rc=%d)",
627 prefix, cs_strerror(rc), rc);
628 goto bail;
629 }
630
631 found = 0;
632 while ((rc = cmap_iter_next(cmap_handle, iter_handle, key_name, NULL, NULL)) == CS_OK) {
633 crm_trace("'%s' is configured in corosync cmap: %s", prefix, key_name);
634 found++;
635 break;
636 }
637 cmap_iter_finalize(cmap_handle, iter_handle);
638
639 bail:
640 cmap_finalize(cmap_handle);
641
642 return found;
643 }