This source file includes the following definitions.
- free_cmd
- generate_callid
- recurring_helper
- start_delay_helper
- remote_node_up
- remote_node_down
- check_remote_node_state
- report_remote_ra_result
- update_remaining_timeout
- retry_start_cmd_cb
- connection_takeover_timeout_cb
- monitor_timeout_cb
- synthesize_lrmd_success
- remote_lrm_op_callback
- handle_remote_ra_stop
- handle_remote_ra_start
- handle_remote_ra_exec
- remote_ra_data_init
- remote_ra_cleanup
- is_remote_lrmd_ra
- remote_ra_get_rsc_info
- is_remote_ra_supported_action
- fail_all_monitor_cmds
- remove_cmd
- remote_ra_cancel
- handle_dup_monitor
- remote_ra_exec
- remote_ra_fail
- remote_ra_process_pseudo
- remote_ra_maintenance
- remote_ra_process_maintenance_nodes
- remote_ra_is_in_maintenance
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
#include <crm_internal.h>

#include <limits.h>

#include <crm/crm.h>
#include <crm/msg_xml.h>

#include <crmd.h>
#include <crmd_fsa.h>
#include <crmd_messages.h>
#include <crmd_callbacks.h>
#include <crmd_lrm.h>
#include <crm/lrmd.h>
#include <crm/services.h>
30
/* Agent type name used for pacemaker_remote connection resources */
#define REMOTE_LRMD_RA "remote"

/* Upper bound, in milliseconds, on the timeout handed to a single
 * connection attempt (see handle_remote_ra_start())
 */
#define MAX_START_TIMEOUT_MS 10000
35
36 typedef struct remote_ra_cmd_s {
37
38 char *owner;
39
40 char *rsc_id;
41
42 char *action;
43
44 char *userdata;
45 char *exit_reason;
46
47 int start_delay;
48
49 int delay_id;
50
51 int timeout;
52 int remaining_timeout;
53
54 int interval;
55
56 int interval_id;
57 int reported_success;
58 int monitor_timeout_id;
59 int takeover_timeout_id;
60
61 lrmd_key_value_t *params;
62
63 int rc;
64 int op_status;
65 int call_id;
66 time_t start_time;
67 gboolean cancel;
68 } remote_ra_cmd_t;
69
/*!
 * \internal
 * \brief Progress of handing a remote connection over to another node
 *
 * A value of 0 (neither enumerator) means no migration is in progress.
 */
enum remote_migration_status {
    /* migrate_to succeeded; another client is expected to connect */
    expect_takeover = 1,

    /* the expected new client has connected */
    takeover_complete,
};
74
75 typedef struct remote_ra_data_s {
76 crm_trigger_t *work;
77 remote_ra_cmd_t *cur_cmd;
78 GList *cmds;
79 GList *recurring_cmds;
80
81 enum remote_migration_status migrate_status;
82
83 gboolean active;
84 gboolean is_maintenance;
85
86
87
88 } remote_ra_data_t;
89
90 static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
91 static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
92 static GList *fail_all_monitor_cmds(GList * list);
93
94 static void
95 free_cmd(gpointer user_data)
96 {
97 remote_ra_cmd_t *cmd = user_data;
98
99 if (!cmd) {
100 return;
101 }
102 if (cmd->delay_id) {
103 g_source_remove(cmd->delay_id);
104 }
105 if (cmd->interval_id) {
106 g_source_remove(cmd->interval_id);
107 }
108 if (cmd->monitor_timeout_id) {
109 g_source_remove(cmd->monitor_timeout_id);
110 }
111 if (cmd->takeover_timeout_id) {
112 g_source_remove(cmd->takeover_timeout_id);
113 }
114 free(cmd->owner);
115 free(cmd->rsc_id);
116 free(cmd->action);
117 free(cmd->userdata);
118 free(cmd->exit_reason);
119 lrmd_key_value_freeall(cmd->params);
120 free(cmd);
121 }
122
/*!
 * \internal
 * \brief Produce the next call ID for a remote RA command
 *
 * IDs start at 1 and increase by one per call. After INT_MAX has been handed
 * out the sequence restarts at 1, so the result is always positive.
 *
 * \return Next call ID (always > 0)
 */
static int
generate_callid(void)
{
    static int remote_ra_callid = 0;

    /* Wrap explicitly: incrementing a signed int past INT_MAX is undefined
     * behaviour, so the overflow cannot be detected after the fact.
     */
    if (remote_ra_callid >= INT_MAX) {
        remote_ra_callid = 1;
    } else {
        remote_ra_callid++;
    }

    return remote_ra_callid;
}
135
136 static gboolean
137 recurring_helper(gpointer data)
138 {
139 remote_ra_cmd_t *cmd = data;
140 lrm_state_t *connection_rsc = NULL;
141
142 cmd->interval_id = 0;
143 connection_rsc = lrm_state_find(cmd->rsc_id);
144 if (connection_rsc && connection_rsc->remote_ra_data) {
145 remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
146
147 ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
148
149 ra_data->cmds = g_list_append(ra_data->cmds, cmd);
150 mainloop_set_trigger(ra_data->work);
151 }
152 return FALSE;
153 }
154
155 static gboolean
156 start_delay_helper(gpointer data)
157 {
158 remote_ra_cmd_t *cmd = data;
159 lrm_state_t *connection_rsc = NULL;
160
161 cmd->delay_id = 0;
162 connection_rsc = lrm_state_find(cmd->rsc_id);
163 if (connection_rsc && connection_rsc->remote_ra_data) {
164 remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
165
166 mainloop_set_trigger(ra_data->work);
167 }
168 return FALSE;
169 }
170
171
172
173
174
175
176
177 static void
178 remote_node_up(const char *node_name)
179 {
180 int call_opt, call_id = 0;
181 xmlNode *update, *state;
182 crm_node_t *node;
183
184 CRM_CHECK(node_name != NULL, return);
185 crm_info("Announcing pacemaker_remote node %s", node_name);
186
187
188
189
190
191 call_opt = crmd_cib_smart_opt();
192 erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
193 erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);
194
195
196 update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);
197
198
199 node = crm_remote_peer_get(node_name);
200 CRM_CHECK(node != NULL, return);
201 crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, 0);
202
203
204
205
206
207
208
209 send_remote_state_message(node_name, TRUE);
210
211 update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
212 state = create_node_state_update(node, node_update_cluster, update,
213 __FUNCTION__);
214
215
216
217
218
219 crm_xml_add(state, XML_NODE_IS_FENCED, "0");
220
221
222
223
224
225
226
227
228 fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
229 if (call_id < 0) {
230 crm_perror(LOG_WARNING, "%s CIB node state setup", node_name);
231 }
232 free_xml(update);
233 }
234
/* Options for remote_node_down(): whether the node's LRM status section
 * should be erased as part of taking the node down
 */
enum down_opts {
    DOWN_KEEP_LRM,      /* leave the LRM section alone */
    DOWN_ERASE_LRM      /* erase the LRM section as well */
};
239
240
241
242
243
244
245
246
247 static void
248 remote_node_down(const char *node_name, const enum down_opts opts)
249 {
250 xmlNode *update;
251 int call_id = 0;
252 int call_opt = crmd_cib_smart_opt();
253 crm_node_t *node;
254
255
256 update_attrd_remote_node_removed(node_name, NULL);
257
258
259 erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);
260
261
262
263
264
265 if (opts == DOWN_ERASE_LRM) {
266 erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
267 }
268
269
270 node = crm_remote_peer_get(node_name);
271 CRM_CHECK(node != NULL, return);
272 crm_update_peer_state(__FUNCTION__, node, CRM_NODE_LOST, 0);
273
274
275 send_remote_state_message(node_name, FALSE);
276
277
278 update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
279 create_node_state_update(node, node_update_cluster, update, __FUNCTION__);
280 fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
281 if (call_id < 0) {
282 crm_perror(LOG_ERR, "%s CIB node state update", node_name);
283 }
284 free_xml(update);
285 }
286
287
288
289
290
291
292
293 static void
294 check_remote_node_state(remote_ra_cmd_t *cmd)
295 {
296
297 if (cmd->rc != PCMK_OCF_OK) {
298 return;
299 }
300
301 if (safe_str_eq(cmd->action, "start")) {
302 remote_node_up(cmd->rsc_id);
303
304 } else if (safe_str_eq(cmd->action, "migrate_from")) {
305
306
307
308
309
310
311
312 crm_node_t *node = crm_remote_peer_get(cmd->rsc_id);
313
314 CRM_CHECK(node != NULL, return);
315 crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, 0);
316
317 } else if (safe_str_eq(cmd->action, "stop")) {
318 lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
319 remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;
320
321 if (ra_data) {
322 if (ra_data->migrate_status != takeover_complete) {
323
324 remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
325 } else if (AM_I_DC == FALSE) {
326
327
328
329
330 crm_remote_peer_cache_remove(cmd->rsc_id);
331 }
332 }
333 }
334
335
336
337
338
339
340
341
342
343
344 }
345
346 static void
347 report_remote_ra_result(remote_ra_cmd_t * cmd)
348 {
349 lrmd_event_data_t op = { 0, };
350
351 check_remote_node_state(cmd);
352
353 op.type = lrmd_event_exec_complete;
354 op.rsc_id = cmd->rsc_id;
355 op.op_type = cmd->action;
356 op.user_data = cmd->userdata;
357 op.exit_reason = cmd->exit_reason;
358 op.timeout = cmd->timeout;
359 op.interval = cmd->interval;
360 op.rc = cmd->rc;
361 op.op_status = cmd->op_status;
362 op.t_run = cmd->start_time;
363 op.t_rcchange = cmd->start_time;
364 if (cmd->reported_success && cmd->rc != PCMK_OCF_OK) {
365 op.t_rcchange = time(NULL);
366
367
368
369
370
371
372
373
374
375 if (op.t_rcchange == op.t_run) {
376 op.t_rcchange++;
377 }
378 }
379
380 if (cmd->params) {
381 lrmd_key_value_t *tmp;
382
383 op.params = crm_str_table_new();
384 for (tmp = cmd->params; tmp; tmp = tmp->next) {
385 g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value));
386 }
387
388 }
389 op.call_id = cmd->call_id;
390 op.remote_nodename = cmd->owner;
391
392 lrm_op_callback(&op);
393
394 if (op.params) {
395 g_hash_table_destroy(op.params);
396 }
397 }
398
399 static void
400 update_remaining_timeout(remote_ra_cmd_t * cmd)
401 {
402 cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
403 }
404
405 static gboolean
406 retry_start_cmd_cb(gpointer data)
407 {
408 lrm_state_t *lrm_state = data;
409 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
410 remote_ra_cmd_t *cmd = NULL;
411 int rc = -1;
412
413 if (!ra_data || !ra_data->cur_cmd) {
414 return FALSE;
415 }
416 cmd = ra_data->cur_cmd;
417 if (safe_str_neq(cmd->action, "start") && safe_str_neq(cmd->action, "migrate_from")) {
418 return FALSE;
419 }
420 update_remaining_timeout(cmd);
421
422 if (cmd->remaining_timeout > 0) {
423 rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
424 }
425
426 if (rc != 0) {
427 cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
428 cmd->op_status = PCMK_LRM_OP_ERROR;
429 report_remote_ra_result(cmd);
430
431 if (ra_data->cmds) {
432 mainloop_set_trigger(ra_data->work);
433 }
434 ra_data->cur_cmd = NULL;
435 free_cmd(cmd);
436 } else {
437
438 }
439
440 return FALSE;
441 }
442
443
444 static gboolean
445 connection_takeover_timeout_cb(gpointer data)
446 {
447 lrm_state_t *lrm_state = NULL;
448 remote_ra_cmd_t *cmd = data;
449
450 crm_info("takeover event timed out for node %s", cmd->rsc_id);
451 cmd->takeover_timeout_id = 0;
452
453 lrm_state = lrm_state_find(cmd->rsc_id);
454
455 handle_remote_ra_stop(lrm_state, cmd);
456 free_cmd(cmd);
457
458 return FALSE;
459 }
460
461 static gboolean
462 monitor_timeout_cb(gpointer data)
463 {
464 lrm_state_t *lrm_state = NULL;
465 remote_ra_cmd_t *cmd = data;
466
467 lrm_state = lrm_state_find(cmd->rsc_id);
468
469 crm_info("Poke async response timed out for node %s (%p)", cmd->rsc_id, lrm_state);
470 cmd->monitor_timeout_id = 0;
471 cmd->op_status = PCMK_LRM_OP_TIMEOUT;
472 cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
473
474 if (lrm_state && lrm_state->remote_ra_data) {
475 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
476
477 if (ra_data->cur_cmd == cmd) {
478 ra_data->cur_cmd = NULL;
479 }
480 if (ra_data->cmds) {
481 mainloop_set_trigger(ra_data->work);
482 }
483 }
484
485 report_remote_ra_result(cmd);
486 free_cmd(cmd);
487
488 if(lrm_state) {
489 lrm_state_disconnect(lrm_state);
490 }
491 return FALSE;
492 }
493
494 static void
495 synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
496 {
497 lrmd_event_data_t op = { 0, };
498
499 if (lrm_state == NULL) {
500
501 lrm_state = lrm_state_find(fsa_our_uname);
502 }
503 CRM_ASSERT(lrm_state != NULL);
504
505 op.type = lrmd_event_exec_complete;
506 op.rsc_id = rsc_id;
507 op.op_type = op_type;
508 op.rc = PCMK_OCF_OK;
509 op.op_status = PCMK_LRM_OP_DONE;
510 op.t_run = time(NULL);
511 op.t_rcchange = op.t_run;
512 op.call_id = generate_callid();
513 process_lrm_event(lrm_state, &op, NULL);
514 }
515
516 void
517 remote_lrm_op_callback(lrmd_event_data_t * op)
518 {
519 gboolean cmd_handled = FALSE;
520 lrm_state_t *lrm_state = NULL;
521 remote_ra_data_t *ra_data = NULL;
522 remote_ra_cmd_t *cmd = NULL;
523
524 crm_debug("remote connection event - event_type:%s node:%s action:%s rc:%s op_status:%s",
525 lrmd_event_type2str(op->type),
526 op->remote_nodename,
527 op->op_type ? op->op_type : "none",
528 services_ocf_exitcode_str(op->rc), services_lrm_status_str(op->op_status));
529
530 lrm_state = lrm_state_find(op->remote_nodename);
531 if (!lrm_state || !lrm_state->remote_ra_data) {
532 crm_debug("lrm_state info not found for remote lrmd connection event");
533 return;
534 }
535 ra_data = lrm_state->remote_ra_data;
536
537
538
539 if (op->type == lrmd_event_new_client) {
540
541 if (ra_data->migrate_status == expect_takeover) {
542 ra_data->migrate_status = takeover_complete;
543 } else {
544 crm_err("Unexpected pacemaker_remote client takeover for %s. Disconnecting", op->remote_nodename);
545
546
547
548 lrm_state_disconnect_only(lrm_state);
549 }
550 return;
551 }
552
553
554 if (op->type == lrmd_event_exec_complete) {
555 if (ra_data->migrate_status == takeover_complete) {
556 crm_debug("ignoring event, this connection is taken over by another node");
557 } else {
558 lrm_op_callback(op);
559 }
560 return;
561 }
562
563 if ((op->type == lrmd_event_disconnect) &&
564 (ra_data->cur_cmd == NULL) &&
565 (ra_data->active == TRUE)) {
566
567 if (!remote_ra_is_in_maintenance(lrm_state)) {
568 crm_err("Unexpected disconnect on remote-node %s", lrm_state->node_name);
569 ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
570 ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
571 } else {
572 crm_notice("Disconnect on unmanaged remote-node %s", lrm_state->node_name);
573
574 handle_remote_ra_stop(lrm_state, NULL);
575 remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
576
577 synthesize_lrmd_success(NULL, lrm_state->node_name, "stop");
578 }
579 return;
580 }
581
582 if (!ra_data->cur_cmd) {
583 crm_debug("no event to match");
584 return;
585 }
586
587 cmd = ra_data->cur_cmd;
588
589
590
591 if (op->type == lrmd_event_connect && (safe_str_eq(cmd->action, "start") ||
592 safe_str_eq(cmd->action, "migrate_from"))) {
593
594 if (op->connection_rc < 0) {
595 update_remaining_timeout(cmd);
596
597 if (op->connection_rc == -ENOKEY) {
598
599 cmd->op_status = PCMK_LRM_OP_ERROR;
600 cmd->rc = PCMK_OCF_INVALID_PARAM;
601 cmd->exit_reason = strdup("Authentication key not readable");
602
603 } else if (cmd->remaining_timeout > 3000) {
604 crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
605 g_timeout_add(1000, retry_start_cmd_cb, lrm_state);
606 return;
607
608 } else {
609 crm_trace("can't reschedule start, remaining timeout too small %d",
610 cmd->remaining_timeout);
611 cmd->op_status = PCMK_LRM_OP_TIMEOUT;
612 cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
613 }
614
615 } else {
616 lrm_state_reset_tables(lrm_state, TRUE);
617 cmd->rc = PCMK_OCF_OK;
618 cmd->op_status = PCMK_LRM_OP_DONE;
619 ra_data->active = TRUE;
620 }
621
622 crm_debug("remote lrmd connect event matched %s action. ", cmd->action);
623 report_remote_ra_result(cmd);
624 cmd_handled = TRUE;
625
626 } else if (op->type == lrmd_event_poke && safe_str_eq(cmd->action, "monitor")) {
627
628 if (cmd->monitor_timeout_id) {
629 g_source_remove(cmd->monitor_timeout_id);
630 cmd->monitor_timeout_id = 0;
631 }
632
633
634
635
636 if (!cmd->reported_success) {
637 cmd->rc = PCMK_OCF_OK;
638 cmd->op_status = PCMK_LRM_OP_DONE;
639 report_remote_ra_result(cmd);
640 cmd->reported_success = 1;
641 }
642
643 crm_debug("remote lrmd poke event matched %s action. ", cmd->action);
644
645
646 if (cmd->interval && (cmd->cancel == FALSE)) {
647 ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
648 cmd->interval_id = g_timeout_add(cmd->interval, recurring_helper, cmd);
649 cmd = NULL;
650 }
651 cmd_handled = TRUE;
652
653 } else if (op->type == lrmd_event_disconnect && safe_str_eq(cmd->action, "monitor")) {
654 if (ra_data->active == TRUE && (cmd->cancel == FALSE)) {
655 cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
656 cmd->op_status = PCMK_LRM_OP_ERROR;
657 report_remote_ra_result(cmd);
658 crm_err("remote-node %s unexpectedly disconneced during monitor operation", lrm_state->node_name);
659 }
660 cmd_handled = TRUE;
661
662 } else if (op->type == lrmd_event_new_client && safe_str_eq(cmd->action, "stop")) {
663
664 handle_remote_ra_stop(lrm_state, cmd);
665 cmd_handled = TRUE;
666
667 } else {
668 crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
669 }
670
671 if (cmd_handled) {
672 ra_data->cur_cmd = NULL;
673 if (ra_data->cmds) {
674 mainloop_set_trigger(ra_data->work);
675 }
676 free_cmd(cmd);
677 }
678 }
679
680 static void
681 handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
682 {
683 remote_ra_data_t *ra_data = NULL;
684
685 CRM_ASSERT(lrm_state);
686 ra_data = lrm_state->remote_ra_data;
687
688 if (ra_data->migrate_status != takeover_complete) {
689
690 g_hash_table_remove_all(lrm_state->pending_ops);
691 } else {
692
693
694 lrm_state_reset_tables(lrm_state, FALSE);
695 }
696
697 ra_data->active = FALSE;
698 lrm_state_disconnect(lrm_state);
699
700 if (ra_data->cmds) {
701 g_list_free_full(ra_data->cmds, free_cmd);
702 }
703 if (ra_data->recurring_cmds) {
704 g_list_free_full(ra_data->recurring_cmds, free_cmd);
705 }
706 ra_data->cmds = NULL;
707 ra_data->recurring_cmds = NULL;
708 ra_data->cur_cmd = NULL;
709
710 if (cmd) {
711 cmd->rc = PCMK_OCF_OK;
712 cmd->op_status = PCMK_LRM_OP_DONE;
713
714 report_remote_ra_result(cmd);
715 }
716 }
717
718 static int
719 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
720 {
721 const char *server = NULL;
722 lrmd_key_value_t *tmp = NULL;
723 int port = 0;
724 int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
725
726 for (tmp = cmd->params; tmp; tmp = tmp->next) {
727 if (safe_str_eq(tmp->key, "addr") || safe_str_eq(tmp->key, "server")) {
728 server = tmp->value;
729 }
730 if (safe_str_eq(tmp->key, "port")) {
731 port = atoi(tmp->value);
732 }
733 }
734
735 return lrm_state_remote_connect_async(lrm_state, server, port, timeout_used);
736 }
737
738 static gboolean
739 handle_remote_ra_exec(gpointer user_data)
740 {
741 int rc = 0;
742 lrm_state_t *lrm_state = user_data;
743 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
744 remote_ra_cmd_t *cmd;
745 GList *first = NULL;
746
747 if (ra_data->cur_cmd) {
748
749 return TRUE;
750 }
751
752 while (ra_data->cmds) {
753 first = ra_data->cmds;
754 cmd = first->data;
755 if (cmd->delay_id) {
756
757 return TRUE;
758 }
759
760 ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
761 g_list_free_1(first);
762
763 if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) {
764 ra_data->migrate_status = 0;
765 rc = handle_remote_ra_start(lrm_state, cmd, cmd->timeout);
766 if (rc == 0) {
767
768 crm_debug("began remote lrmd connect, waiting for connect event.");
769 ra_data->cur_cmd = cmd;
770 return TRUE;
771 } else {
772 crm_debug("connect failed, not expecting to match any connection event later");
773 cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
774 cmd->op_status = PCMK_LRM_OP_ERROR;
775 }
776 report_remote_ra_result(cmd);
777
778 } else if (!strcmp(cmd->action, "monitor")) {
779
780 if (lrm_state_is_connected(lrm_state) == TRUE) {
781 rc = lrm_state_poke_connection(lrm_state);
782 if (rc < 0) {
783 cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
784 cmd->op_status = PCMK_LRM_OP_ERROR;
785 }
786 } else {
787 rc = -1;
788 cmd->op_status = PCMK_LRM_OP_DONE;
789 cmd->rc = PCMK_OCF_NOT_RUNNING;
790 }
791
792 if (rc == 0) {
793 crm_debug("poked remote lrmd at node %s, waiting for async response.", cmd->rsc_id);
794 ra_data->cur_cmd = cmd;
795 cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd);
796 return TRUE;
797 }
798 report_remote_ra_result(cmd);
799
800 } else if (!strcmp(cmd->action, "stop")) {
801
802 if (ra_data->migrate_status == expect_takeover) {
803
804
805
806
807
808
809 cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd);
810 ra_data->cur_cmd = cmd;
811 return TRUE;
812 }
813
814 handle_remote_ra_stop(lrm_state, cmd);
815
816 } else if (!strcmp(cmd->action, "migrate_to")) {
817 ra_data->migrate_status = expect_takeover;
818 cmd->rc = PCMK_OCF_OK;
819 cmd->op_status = PCMK_LRM_OP_DONE;
820 report_remote_ra_result(cmd);
821 } else if (!strcmp(cmd->action, "reload")) {
822
823 cmd->rc = PCMK_OCF_OK;
824 cmd->op_status = PCMK_LRM_OP_DONE;
825 report_remote_ra_result(cmd);
826 }
827
828 free_cmd(cmd);
829 }
830
831 return TRUE;
832 }
833
834 static void
835 remote_ra_data_init(lrm_state_t * lrm_state)
836 {
837 remote_ra_data_t *ra_data = NULL;
838
839 if (lrm_state->remote_ra_data) {
840 return;
841 }
842
843 ra_data = calloc(1, sizeof(remote_ra_data_t));
844 ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
845 lrm_state->remote_ra_data = ra_data;
846 }
847
848 void
849 remote_ra_cleanup(lrm_state_t * lrm_state)
850 {
851 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
852
853 if (!ra_data) {
854 return;
855 }
856
857 if (ra_data->cmds) {
858 g_list_free_full(ra_data->cmds, free_cmd);
859 }
860
861 if (ra_data->recurring_cmds) {
862 g_list_free_full(ra_data->recurring_cmds, free_cmd);
863 }
864 mainloop_destroy_trigger(ra_data->work);
865 free(ra_data);
866 lrm_state->remote_ra_data = NULL;
867 }
868
869 gboolean
870 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
871 {
872 if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
873 return TRUE;
874 }
875 if (id && lrm_state_find(id) && safe_str_neq(id, fsa_our_uname)) {
876 return TRUE;
877 }
878
879 return FALSE;
880 }
881
882 lrmd_rsc_info_t *
883 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
884 {
885 lrmd_rsc_info_t *info = NULL;
886
887 if ((lrm_state_find(rsc_id))) {
888 info = calloc(1, sizeof(lrmd_rsc_info_t));
889
890 info->id = strdup(rsc_id);
891 info->type = strdup(REMOTE_LRMD_RA);
892 info->class = strdup(PCMK_RESOURCE_CLASS_OCF);
893 info->provider = strdup("pacemaker");
894 }
895
896 return info;
897 }
898
899 static gboolean
900 is_remote_ra_supported_action(const char *action)
901 {
902 if (!action) {
903 return FALSE;
904 } else if (strcmp(action, "start") &&
905 strcmp(action, "stop") &&
906 strcmp(action, "reload") &&
907 strcmp(action, "migrate_to") &&
908 strcmp(action, "migrate_from") && strcmp(action, "monitor")) {
909 return FALSE;
910 }
911
912 return TRUE;
913 }
914
915 static GList *
916 fail_all_monitor_cmds(GList * list)
917 {
918 GList *rm_list = NULL;
919 remote_ra_cmd_t *cmd = NULL;
920 GListPtr gIter = NULL;
921
922 for (gIter = list; gIter != NULL; gIter = gIter->next) {
923 cmd = gIter->data;
924 if (cmd->interval > 0 && safe_str_eq(cmd->action, "monitor")) {
925 rm_list = g_list_append(rm_list, cmd);
926 }
927 }
928
929 for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
930 cmd = gIter->data;
931
932 cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
933 cmd->op_status = PCMK_LRM_OP_ERROR;
934 crm_trace("Pre-emptively failing %s %s (interval=%d, %s)", cmd->action, cmd->rsc_id, cmd->interval, cmd->userdata);
935 report_remote_ra_result(cmd);
936
937 list = g_list_remove(list, cmd);
938 free_cmd(cmd);
939 }
940
941
942 g_list_free(rm_list);
943 return list;
944 }
945
946 static GList *
947 remove_cmd(GList * list, const char *action, int interval)
948 {
949 remote_ra_cmd_t *cmd = NULL;
950 GListPtr gIter = NULL;
951
952 for (gIter = list; gIter != NULL; gIter = gIter->next) {
953 cmd = gIter->data;
954 if (cmd->interval == interval && safe_str_eq(cmd->action, action)) {
955 break;
956 }
957 cmd = NULL;
958 }
959 if (cmd) {
960 list = g_list_remove(list, cmd);
961 free_cmd(cmd);
962 }
963 return list;
964 }
965
966 int
967 remote_ra_cancel(lrm_state_t * lrm_state, const char *rsc_id, const char *action, int interval)
968 {
969 lrm_state_t *connection_rsc = NULL;
970 remote_ra_data_t *ra_data = NULL;
971
972 connection_rsc = lrm_state_find(rsc_id);
973 if (!connection_rsc || !connection_rsc->remote_ra_data) {
974 return -EINVAL;
975 }
976
977 ra_data = connection_rsc->remote_ra_data;
978 ra_data->cmds = remove_cmd(ra_data->cmds, action, interval);
979 ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action, interval);
980 if (ra_data->cur_cmd &&
981 (ra_data->cur_cmd->interval == interval) &&
982 (safe_str_eq(ra_data->cur_cmd->action, action))) {
983
984 ra_data->cur_cmd->cancel = TRUE;
985 }
986
987 return 0;
988 }
989
990 static remote_ra_cmd_t *
991 handle_dup_monitor(remote_ra_data_t *ra_data, int interval, const char *userdata)
992 {
993 GList *gIter = NULL;
994 remote_ra_cmd_t *cmd = NULL;
995
996
997
998
999
1000
1001
1002 if (interval == 0) {
1003 return NULL;
1004 }
1005
1006 if (ra_data->cur_cmd &&
1007 ra_data->cur_cmd->cancel == FALSE &&
1008 ra_data->cur_cmd->interval == interval &&
1009 safe_str_eq(ra_data->cur_cmd->action, "monitor")) {
1010
1011 cmd = ra_data->cur_cmd;
1012 goto handle_dup;
1013 }
1014
1015 for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
1016 cmd = gIter->data;
1017 if (cmd->interval == interval && safe_str_eq(cmd->action, "monitor")) {
1018 goto handle_dup;
1019 }
1020 }
1021
1022 for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
1023 cmd = gIter->data;
1024 if (cmd->interval == interval && safe_str_eq(cmd->action, "monitor")) {
1025 goto handle_dup;
1026 }
1027 }
1028
1029 return NULL;
1030
1031 handle_dup:
1032
1033 crm_trace("merging duplicate monitor cmd %s_monitor_%d", cmd->rsc_id, interval);
1034
1035
1036 if (userdata) {
1037 free(cmd->userdata);
1038 cmd->userdata = strdup(userdata);
1039 }
1040
1041
1042 if (cmd->reported_success) {
1043 cmd->start_time = time(NULL);
1044 cmd->call_id = generate_callid();
1045 cmd->reported_success = 0;
1046 }
1047
1048
1049
1050
1051 if (cmd->interval_id) {
1052 g_source_remove(cmd->interval_id);
1053 cmd->interval_id = 0;
1054 recurring_helper(cmd);
1055 }
1056
1057 return cmd;
1058 }
1059
1060 int
1061 remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *action, const char *userdata, int interval,
1062 int timeout,
1063 int start_delay,
1064 lrmd_key_value_t * params)
1065 {
1066 int rc = 0;
1067 lrm_state_t *connection_rsc = NULL;
1068 remote_ra_cmd_t *cmd = NULL;
1069 remote_ra_data_t *ra_data = NULL;
1070
1071 if (is_remote_ra_supported_action(action) == FALSE) {
1072 rc = -EINVAL;
1073 goto exec_done;
1074 }
1075
1076 connection_rsc = lrm_state_find(rsc_id);
1077 if (!connection_rsc) {
1078 rc = -EINVAL;
1079 goto exec_done;
1080 }
1081
1082 remote_ra_data_init(connection_rsc);
1083 ra_data = connection_rsc->remote_ra_data;
1084
1085 cmd = handle_dup_monitor(ra_data, interval, userdata);
1086 if (cmd) {
1087 return cmd->call_id;
1088 }
1089
1090 cmd = calloc(1, sizeof(remote_ra_cmd_t));
1091 cmd->owner = strdup(lrm_state->node_name);
1092 cmd->rsc_id = strdup(rsc_id);
1093 cmd->action = strdup(action);
1094 cmd->userdata = strdup(userdata);
1095 cmd->interval = interval;
1096 cmd->timeout = timeout;
1097 cmd->start_delay = start_delay;
1098 cmd->params = params;
1099 cmd->start_time = time(NULL);
1100
1101 cmd->call_id = generate_callid();
1102
1103 if (cmd->start_delay) {
1104 cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
1105 }
1106
1107 ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1108 mainloop_set_trigger(ra_data->work);
1109
1110 return cmd->call_id;
1111 exec_done:
1112
1113 lrmd_key_value_freeall(params);
1114 return rc;
1115 }
1116
1117
1118
1119
1120
1121
1122
1123 void
1124 remote_ra_fail(const char *node_name)
1125 {
1126 lrm_state_t *lrm_state = lrm_state_find(node_name);
1127
1128 if (lrm_state && lrm_state_is_connected(lrm_state)) {
1129 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1130
1131 crm_info("Failing monitors on pacemaker_remote node %s", node_name);
1132 ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1133 ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1134 }
1135 }
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
/* XPath selecting the node entries listed as downed by a pseudo-event whose
 * task is 'stonith'
 */
#define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
    "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
    "/" XML_CIB_TAG_NODE
1152
1153
1154
1155
1156
1157
1158
1159 void
1160 remote_ra_process_pseudo(xmlNode *xml)
1161 {
1162 xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
1163
1164 if (numXpathResults(search) == 1) {
1165 xmlNode *result = getXpathResult(search, 0);
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181 if (result) {
1182 const char *remote = ID(result);
1183
1184 if (remote) {
1185 remote_node_down(remote, DOWN_ERASE_LRM);
1186 }
1187 }
1188 }
1189 freeXpathObject(search);
1190 }
1191
1192 static void
1193 remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
1194 {
1195 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1196 xmlNode *update, *state;
1197 int call_opt, call_id = 0;
1198 crm_node_t *node;
1199
1200 call_opt = crmd_cib_smart_opt();
1201 node = crm_remote_peer_get(lrm_state->node_name);
1202 CRM_CHECK(node != NULL, return);
1203 update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
1204 state = create_node_state_update(node, node_update_none, update,
1205 __FUNCTION__);
1206 crm_xml_add(state, XML_NODE_IS_MAINTENANCE, maintenance?"1":"0");
1207 fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
1208 if (call_id < 0) {
1209 crm_perror(LOG_WARNING, "%s CIB node state update failed", lrm_state->node_name);
1210 } else {
1211
1212 ra_data->is_maintenance = maintenance;
1213 }
1214 free_xml(update);
1215 }
1216
/* XPath selecting the maintenance element of a pseudo-event whose task is
 * CRM_OP_MAINTENANCE_NODES
 */
#define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
    "[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \
    XML_GRAPH_TAG_MAINTENANCE
1220
1221
1222
1223
1224
1225
1226
1227
1228 void
1229 remote_ra_process_maintenance_nodes(xmlNode *xml)
1230 {
1231 xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);
1232
1233 if (numXpathResults(search) == 1) {
1234 xmlNode *node;
1235 int cnt = 0, cnt_remote = 0;
1236
1237 for (node =
1238 first_named_child(getXpathResult(search, 0), XML_CIB_TAG_NODE);
1239 node; node = __xml_next(node)) {
1240 lrm_state_t *lrm_state = lrm_state_find(ID(node));
1241
1242 cnt++;
1243 if (lrm_state && lrm_state->remote_ra_data &&
1244 ((remote_ra_data_t *) lrm_state->remote_ra_data)->active) {
1245 cnt_remote++;
1246 remote_ra_maintenance(lrm_state,
1247 crm_atoi(crm_element_value(node,
1248 XML_NODE_IS_MAINTENANCE), "0"));
1249
1250 }
1251 }
1252 crm_trace("Action holds %d nodes (%d remotes found) "
1253 "adjusting maintenance-mode", cnt, cnt_remote);
1254 }
1255 freeXpathObject(search);
1256 }
1257
1258 gboolean
1259 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
1260 {
1261 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1262
1263 return ra_data->is_maintenance;
1264 }