This source file includes the following definitions.
- free_cmd
- generate_callid
- recurring_helper
- start_delay_helper
- should_purge_attributes
- section_to_delete
- purge_remote_node_attrs
- remote_node_up
- remote_node_down
- check_remote_node_state
- report_remote_ra_result
- remaining_timeout_sec
- retry_start_cmd_cb
- connection_takeover_timeout_cb
- monitor_timeout_cb
- synthesize_lrmd_success
- remote_lrm_op_callback
- handle_remote_ra_stop
- handle_remote_ra_start
- handle_remote_ra_exec
- remote_ra_data_init
- remote_ra_cleanup
- is_remote_lrmd_ra
- remote_ra_get_rsc_info
- is_remote_ra_supported_action
- fail_all_monitor_cmds
- remove_cmd
- remote_ra_cancel
- handle_dup_monitor
- controld_execute_remote_agent
- remote_ra_fail
- remote_ra_process_pseudo
- remote_ra_maintenance
- remote_ra_process_maintenance_nodes
- remote_ra_is_in_maintenance
- remote_ra_controlling_guest
1
2
3
4
5
6
7
8
9
#include <crm_internal.h>

#include <limits.h>                 // INT_MAX

#include <crm/crm.h>
#include <crm/common/xml.h>
#include <crm/common/xml_internal.h>
#include <crm/lrmd.h>
#include <crm/lrmd_internal.h>
#include <crm/services.h>

#include <libxml/xpath.h>

#include <pacemaker-controld.h>
22
23 #define REMOTE_LRMD_RA "remote"
24
25
26 #define MAX_START_TIMEOUT_MS 10000
27
/*
 * Set one or more status flags on a remote command.
 *
 * The new flag group is computed by pcmk__set_flags_as(), which is given the
 * calling function and line, the LOG_TRACE level, the command's resource ID
 * and the stringified flag expression, and is stored back in (cmd)->status.
 * Note that (cmd) is evaluated more than once.
 */
#define cmd_set_flags(cmd, flags_to_set) do { \
    (cmd)->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \
    "Remote command", (cmd)->rsc_id, (cmd)->status, \
    (flags_to_set), #flags_to_set); \
    } while (0)
33
/*
 * Clear one or more status flags on a remote command.
 *
 * The new flag group is computed by pcmk__clear_flags_as(), which is given
 * the calling function and line, the LOG_TRACE level, the command's resource
 * ID and the stringified flag expression, and is stored back in
 * (cmd)->status. Note that (cmd) is evaluated more than once.
 */
#define cmd_clear_flags(cmd, flags_to_clear) do { \
    (cmd)->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \
    "Remote command", (cmd)->rsc_id, (cmd)->status, \
    (flags_to_clear), #flags_to_clear); \
    } while (0)
39
/* Bit flags stored in remote_ra_cmd_t:status */
enum remote_cmd_status {
    /* Set once a monitor's successful result has been reported, so later
     * successful pokes of the same recurring monitor are not reported again
     */
    cmd_reported_success = (1 << 0),
    /* Set by remote_ra_cancel() when the command being cancelled is the one
     * currently in flight; a command flagged this way is not rescheduled
     */
    cmd_cancel = (1 << 1),
};
44
/* One queued, recurring, or in-flight operation on a remote connection */
typedef struct remote_ra_cmd_s {
    /* Copy of the requesting executor state's node name; handed back as the
     * result event's remote_nodename
     */
    char *owner;
    /* ID of the connection resource; also used to look up its executor state */
    char *rsc_id;
    /* Name of the action to perform */
    char *action;
    /* Caller-supplied data, handed back in the result event's user_data */
    char *userdata;
    /* Delay before the command may run, as given to the delay timer */
    int start_delay;
    /* Source ID of the start-delay timer; nonzero while the delay is pending */
    int delay_id;
    /* Action timeout in milliseconds */
    int timeout;
    /* Recurrence interval in milliseconds; 0 for a one-shot command */
    guint interval_ms;
    /* Source ID of the timer that re-queues a recurring command */
    int interval_id;
    /* Source ID of the timer waiting for a poke response */
    int monitor_timeout_id;
    /* Source ID of the timer waiting for a takeover event during stop */
    int takeover_timeout_id;
    /* Action parameters; freed with lrmd_key_value_freeall() in free_cmd() */
    lrmd_key_value_t *params;
    /* Result that will be (or has been) reported for the command */
    pcmk__action_result_t result;
    /* Call ID returned to the requester */
    int call_id;
    /* Time the command was created (or restarted by a duplicate monitor) */
    time_t start_time;
    /* Group of enum remote_cmd_status flags */
    uint32_t status;
} remote_ra_cmd_t;
73
/*
 * Set one or more enum remote_status flags on an executor state's remote
 * connection data.
 *
 * The macro dereferences lrm_state->remote_ra_data without checking it, so
 * the caller must make sure it is non-NULL. It declares block-local variables
 * named "lrm" and "ra"; the argument is evaluated once.
 */
#define lrm_remote_set_flags(lrm_state, flags_to_set) do { \
    lrm_state_t *lrm = (lrm_state); \
    remote_ra_data_t *ra = lrm->remote_ra_data; \
    ra->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
    lrm->node_name, ra->status, \
    (flags_to_set), #flags_to_set); \
    } while (0)
81
/*
 * Clear one or more enum remote_status flags on an executor state's remote
 * connection data.
 *
 * The macro dereferences lrm_state->remote_ra_data without checking it, so
 * the caller must make sure it is non-NULL. It declares block-local variables
 * named "lrm" and "ra"; the argument is evaluated once.
 */
#define lrm_remote_clear_flags(lrm_state, flags_to_clear) do { \
    lrm_state_t *lrm = (lrm_state); \
    remote_ra_data_t *ra = lrm->remote_ra_data; \
    ra->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
    lrm->node_name, ra->status, \
    (flags_to_clear), #flags_to_clear); \
    } while (0)
89
/* Bit flags stored in remote_ra_data_t:status */
enum remote_status {
    /* Set when a migrate_to action is processed: another client is expected
     * to take over the connection
     */
    expect_takeover = (1 << 0),
    /* Set when a new-client event arrives while a takeover was expected */
    takeover_complete = (1 << 1),
    /* Set after a successful connect event, cleared when the connection is
     * stopped
     */
    remote_active = (1 << 2),

    /* NOTE(review): presumably set while the remote node is in maintenance
     * mode; the code that sets it is outside this part of the file
     */
    remote_in_maint = (1 << 3),

    /* Set during start when the command's parameters include the
     * CRM_META "_" PCMK__META_CONTAINER key
     */
    controlling_guest = (1 << 4),
};
106
/* Per-connection state kept in lrm_state_t:remote_ra_data */
typedef struct remote_ra_data_s {
    /* Main-loop trigger that runs handle_remote_ra_exec() */
    crm_trigger_t *work;
    /* Command currently waiting for an asynchronous event, if any */
    remote_ra_cmd_t *cur_cmd;
    /* Commands waiting to be executed, in submission order */
    GList *cmds;
    /* Recurring commands waiting for their interval timer to fire */
    GList *recurring_cmds;
    /* Group of enum remote_status flags */
    uint32_t status;
} remote_ra_data_t;
114
115 static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
116 static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
117 static GList *fail_all_monitor_cmds(GList * list);
118
119 static void
120 free_cmd(gpointer user_data)
121 {
122 remote_ra_cmd_t *cmd = user_data;
123
124 if (!cmd) {
125 return;
126 }
127 if (cmd->delay_id) {
128 g_source_remove(cmd->delay_id);
129 }
130 if (cmd->interval_id) {
131 g_source_remove(cmd->interval_id);
132 }
133 if (cmd->monitor_timeout_id) {
134 g_source_remove(cmd->monitor_timeout_id);
135 }
136 if (cmd->takeover_timeout_id) {
137 g_source_remove(cmd->takeover_timeout_id);
138 }
139 free(cmd->owner);
140 free(cmd->rsc_id);
141 free(cmd->action);
142 free(cmd->userdata);
143 pcmk__reset_result(&(cmd->result));
144 lrmd_key_value_freeall(cmd->params);
145 free(cmd);
146 }
147
/*!
 * \internal
 * \brief Generate the next call ID for a remote connection command
 *
 * IDs are handed out sequentially starting at 1. Once INT_MAX has been
 * returned the sequence restarts at 1, so the result is always positive.
 *
 * \return Positive call ID
 */
static int
generate_callid(void)
{
    static int remote_ra_callid = 0;

    /* Wrap before incrementing: overflowing a signed int is undefined
     * behavior, so the wrap cannot be reliably detected after the fact
     */
    if (remote_ra_callid >= INT_MAX) {
        remote_ra_callid = 0;
    }

    return ++remote_ra_callid;
}
160
161 static gboolean
162 recurring_helper(gpointer data)
163 {
164 remote_ra_cmd_t *cmd = data;
165 lrm_state_t *connection_rsc = NULL;
166
167 cmd->interval_id = 0;
168 connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
169 if (connection_rsc && connection_rsc->remote_ra_data) {
170 remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
171
172 ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
173
174 ra_data->cmds = g_list_append(ra_data->cmds, cmd);
175 mainloop_set_trigger(ra_data->work);
176 }
177 return FALSE;
178 }
179
180 static gboolean
181 start_delay_helper(gpointer data)
182 {
183 remote_ra_cmd_t *cmd = data;
184 lrm_state_t *connection_rsc = NULL;
185
186 cmd->delay_id = 0;
187 connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
188 if (connection_rsc && connection_rsc->remote_ra_data) {
189 remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
190
191 mainloop_set_trigger(ra_data->work);
192 }
193 return FALSE;
194 }
195
196 static bool
197 should_purge_attributes(pcmk__node_status_t *node)
198 {
199 pcmk__node_status_t *conn_node = NULL;
200 lrm_state_t *connection_rsc = NULL;
201
202 if ((node->conn_host == NULL) || (node->name == NULL)) {
203 return true;
204 }
205
206
207
208
209 conn_node = pcmk__get_node(0, node->conn_host, NULL,
210 pcmk__node_search_cluster_member);
211 if (conn_node == NULL) {
212 return true;
213 }
214
215
216
217
218
219 connection_rsc = controld_get_executor_state(node->name, false);
220
221 if (connection_rsc != NULL) {
222 lrmd_t *lrm = connection_rsc->conn;
223 time_t uptime = lrmd__uptime(lrm);
224 time_t now = time(NULL);
225
226
227
228
229
230 if (uptime > 0 &&
231 conn_node->peer_lost > 0 &&
232 uptime + 20 >= now - conn_node->peer_lost) {
233 return false;
234 }
235 }
236
237 return true;
238 }
239
240 static enum controld_section_e
241 section_to_delete(bool purge)
242 {
243 if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
244 if (purge) {
245 return controld_section_all_unlocked;
246 } else {
247 return controld_section_lrm_unlocked;
248 }
249 } else {
250 if (purge) {
251 return controld_section_all;
252 } else {
253 return controld_section_lrm;
254 }
255 }
256 }
257
258 static void
259 purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node)
260 {
261 bool purge = should_purge_attributes(node);
262 enum controld_section_e section = section_to_delete(purge);
263
264
265 if (purge) {
266 update_attrd_remote_node_removed(node->name, NULL);
267 }
268
269 controld_delete_node_state(node->name, section, call_opt);
270 }
271
272
273
274
275
276
277
/*!
 * \internal
 * \brief Handle cluster bookkeeping after a Pacemaker Remote node comes up
 *
 * Updates the node's attributes and stored state, marks it as a member in the
 * remote peer cache, applies any start state reported by the remote executor,
 * broadcasts the node's state, and writes a node state entry to the CIB
 * status section.
 *
 * \param[in] node_name  Name of the remote node (must not be NULL)
 */
static void
remote_node_up(const char *node_name)
{
    int call_opt;
    xmlNode *update, *state;
    pcmk__node_status_t *node = NULL;
    lrm_state_t *connection_rsc = NULL;

    CRM_CHECK(node_name != NULL, return);
    crm_info("Announcing Pacemaker Remote node %s", node_name);

    call_opt = crmd_cib_smart_opt();

    /* NOTE(review): the NULL value presumably clears the CRM_OP_PROBED
     * attribute for this node so that it will be probed again -- confirm
     * against update_attrd()
     */
    update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);

    /* Look the node up in the remote peer cache; the rest of the function
     * needs both the entry and its name
     */
    node = pcmk__cluster_lookup_remote_node(node_name);
    CRM_CHECK((node != NULL) && (node->name != NULL), return);

    // Delete old state before marking the node as a member
    purge_remote_node_attrs(call_opt, node);
    pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);

    /* If the remote executor reported a start state for the node, apply it
     * as the node's join state
     */
    connection_rsc = controld_get_executor_state(node->name, false);

    if (connection_rsc != NULL) {
        lrmd_t *lrm = connection_rsc->conn;
        const char *start_state = lrmd__node_start_state(lrm);

        if (start_state) {
            set_join_state(start_state, node->name, node->xml_id, true);
        }
    }

    /* Tell the rest of the cluster that the node is up
     * (NOTE(review): recipients are determined by
     * broadcast_remote_state_message(), not visible here)
     */
    broadcast_remote_state_message(node_name, true);

    update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
    state = create_node_state_update(node, controld_node_update_cluster, update,
                                     __func__);

    /* Record the node as not fenced in the node state entry
     * (NOTE(review): presumably this clears the effect of an earlier
     * fencing -- confirm)
     */
    crm_xml_add(state, PCMK__XA_NODE_FENCED, "0");

    /* No callback is registered for the CIB update (last argument is NULL),
     * so its outcome is not checked here
     */
    controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
    pcmk__xml_free(update);
}
350
/* Options for remote_node_down(), selecting how much node state is deleted */
enum down_opts {
    /* Delete only the node's attributes section */
    DOWN_KEEP_LRM,
    /* Delete all of the node's state */
    DOWN_ERASE_LRM
};
355
356
357
358
359
360
361
362
363 static void
364 remote_node_down(const char *node_name, const enum down_opts opts)
365 {
366 xmlNode *update;
367 int call_opt = crmd_cib_smart_opt();
368 pcmk__node_status_t *node = NULL;
369
370
371 update_attrd_remote_node_removed(node_name, NULL);
372
373
374
375
376
377
378 if (opts == DOWN_ERASE_LRM) {
379 controld_delete_node_state(node_name, controld_section_all, call_opt);
380 } else {
381 controld_delete_node_state(node_name, controld_section_attrs, call_opt);
382 }
383
384
385 node = pcmk__cluster_lookup_remote_node(node_name);
386 CRM_CHECK(node != NULL, return);
387 pcmk__update_peer_state(__func__, node, PCMK__VALUE_LOST, 0);
388
389
390 broadcast_remote_state_message(node_name, false);
391
392
393 update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
394 create_node_state_update(node, controld_node_update_cluster, update,
395 __func__);
396 controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
397 pcmk__xml_free(update);
398 }
399
400
401
402
403
404
405
/*!
 * \internal
 * \brief Update remote node state to match a successfully completed command
 *
 * Nothing is done unless the command's result is OK. Otherwise:
 * - start: the node is announced as up
 * - migrate_from: the node is marked as a member in the remote peer cache
 * - stop: the node is taken down, or forgotten, depending on takeover state
 *
 * \param[in] cmd  Command whose result is about to be reported
 */
static void
check_remote_node_state(const remote_ra_cmd_t *cmd)
{
    // Only successful actions change the node's state
    if (!pcmk__result_ok(&(cmd->result))) {
        return;
    }

    if (pcmk__str_eq(cmd->action, PCMK_ACTION_START, pcmk__str_casei)) {
        remote_node_up(cmd->rsc_id);

    } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_MIGRATE_FROM,
                            pcmk__str_casei)) {
        /* Unlike start, only the peer cache is updated here; no attributes
         * or stored state are touched and nothing is broadcast.
         * NOTE(review): presumably because the node was already up on the
         * migration source -- confirm
         */
        pcmk__node_status_t *node =
            pcmk__cluster_lookup_remote_node(cmd->rsc_id);

        CRM_CHECK(node != NULL, return);
        pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);

    } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_casei)) {
        lrm_state_t *lrm_state = controld_get_executor_state(cmd->rsc_id,
                                                             false);
        remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;

        if (ra_data) {
            if (!pcmk_is_set(ra_data->status, takeover_complete)) {
                // Stopped without a takeover: the node really is down
                remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
            } else if (AM_I_DC == FALSE) {
                /* Another node took over the connection, and this node is
                 * not the DC, so just drop the node from the peer cache
                 */
                pcmk__cluster_forget_remote_node(cmd->rsc_id);
            }
        }
    }

    // All other actions leave the node's state unchanged
}
461
/*!
 * \internal
 * \brief Report a remote command's result as an executor event
 *
 * Applies any node state change implied by the result, then builds an
 * exec-complete event from the command and passes it to lrm_op_callback().
 * The command itself is not freed.
 *
 * \param[in,out] cmd  Command whose result should be reported
 */
static void
report_remote_ra_result(remote_ra_cmd_t * cmd)
{
    lrmd_event_data_t op = { 0, };

    check_remote_node_state(cmd);

    // The event borrows the command's strings; it does not own them
    op.type = lrmd_event_exec_complete;
    op.rsc_id = cmd->rsc_id;
    op.op_type = cmd->action;
    op.user_data = cmd->userdata;
    op.timeout = cmd->timeout;
    op.interval_ms = cmd->interval_ms;
    op.t_run = cmd->start_time;
    op.t_rcchange = cmd->start_time;

    lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
                     cmd->result.exit_reason);

    if (pcmk_is_set(cmd->status, cmd_reported_success) && !pcmk__result_ok(&(cmd->result))) {
        op.t_rcchange = time(NULL);

        /* A command that previously reported success is now failing, so the
         * result-change time is set to now. It is forced to differ from the
         * run time even when both fall within the same second.
         * NOTE(review): presumably so that consumers can tell the failure
         * from the earlier success -- confirm
         */
        if (op.t_rcchange == op.t_run) {
            op.t_rcchange++;
        }
    }

    if (cmd->params) {
        lrmd_key_value_t *tmp;

        // The event gets its own copy of the parameters, as a hash table
        op.params = pcmk__strkey_table(free, free);
        for (tmp = cmd->params; tmp; tmp = tmp->next) {
            pcmk__insert_dup(op.params, tmp->key, tmp->value);
        }

    }
    op.call_id = cmd->call_id;
    op.remote_nodename = cmd->owner;

    lrm_op_callback(&op);

    // Release only what this function allocated for the event
    if (op.params) {
        g_hash_table_destroy(op.params);
    }
    lrmd__reset_result(&op);
}
516
517
518
519
520
521
522
523
524
525 static int
526 remaining_timeout_sec(const remote_ra_cmd_t *cmd)
527 {
528 return pcmk__timeout_ms2s(cmd->timeout) - (time(NULL) - cmd->start_time);
529 }
530
531 static gboolean
532 retry_start_cmd_cb(gpointer data)
533 {
534 lrm_state_t *lrm_state = data;
535 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
536 remote_ra_cmd_t *cmd = NULL;
537 int rc = ETIME;
538 int remaining = 0;
539
540 if (!ra_data || !ra_data->cur_cmd) {
541 return FALSE;
542 }
543 cmd = ra_data->cur_cmd;
544 if (!pcmk__is_up_action(cmd->action)) {
545 return FALSE;
546 }
547
548 remaining = remaining_timeout_sec(cmd);
549 if (remaining > 0) {
550 rc = handle_remote_ra_start(lrm_state, cmd, remaining * 1000);
551 } else {
552 pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
553 PCMK_EXEC_TIMEOUT,
554 "Not enough time remains to retry remote connection");
555 }
556
557 if (rc != pcmk_rc_ok) {
558 report_remote_ra_result(cmd);
559
560 if (ra_data->cmds) {
561 mainloop_set_trigger(ra_data->work);
562 }
563 ra_data->cur_cmd = NULL;
564 free_cmd(cmd);
565 } else {
566
567 }
568
569 return FALSE;
570 }
571
572
/*!
 * \internal
 * \brief Timer callback run when an expected takeover event never arrived
 *
 * The stop command that was waiting for the takeover is completed as an
 * ordinary stop and then freed.
 *
 * \param[in,out] data  Stop command that was waiting (remote_ra_cmd_t *)
 *
 * \return Always FALSE
 */
static gboolean
connection_takeover_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    crm_info("takeover event timed out for node %s", cmd->rsc_id);
    // Reset the ID so that free_cmd() does not try to remove this timer
    cmd->takeover_timeout_id = 0;

    lrm_state = controld_get_executor_state(cmd->rsc_id, false);

    /* handle_remote_ra_stop() asserts that lrm_state is non-NULL. It reports
     * the result for cmd and resets the connection's current command, so cmd
     * can be freed safely afterward.
     */
    handle_remote_ra_stop(lrm_state, cmd);
    free_cmd(cmd);

    return FALSE;
}
589
/*!
 * \internal
 * \brief Timer callback run when a poke of the remote executor got no reply
 *
 * The monitor is reported as timed out and freed, and the connection (if its
 * executor state still exists) is disconnected.
 *
 * \param[in,out] data  Monitor command that was waiting (remote_ra_cmd_t *)
 *
 * \return Always FALSE
 */
static gboolean
monitor_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    lrm_state = controld_get_executor_state(cmd->rsc_id, false);

    crm_info("Timed out waiting for remote poke response from %s%s",
             cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
    // Reset the ID so that free_cmd() does not try to remove this timer
    cmd->monitor_timeout_id = 0;
    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
                     "Remote executor did not respond");

    if (lrm_state && lrm_state->remote_ra_data) {
        remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

        // Detach the command from the connection before it is freed
        if (ra_data->cur_cmd == cmd) {
            ra_data->cur_cmd = NULL;
        }
        // Let any queued commands run now that nothing is in flight
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
    }

    report_remote_ra_result(cmd);
    free_cmd(cmd);

    if(lrm_state) {
        // The executor is unresponsive, so drop the connection
        lrm_state_disconnect(lrm_state);
    }
    return FALSE;
}
624
625 static void
626 synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
627 {
628 lrmd_event_data_t op = { 0, };
629
630 if (lrm_state == NULL) {
631
632 lrm_state = controld_get_executor_state(NULL, false);
633 }
634 pcmk__assert(lrm_state != NULL);
635
636 op.type = lrmd_event_exec_complete;
637 op.rsc_id = rsc_id;
638 op.op_type = op_type;
639 op.t_run = time(NULL);
640 op.t_rcchange = op.t_run;
641 op.call_id = generate_callid();
642 lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
643 process_lrm_event(lrm_state, &op, NULL, NULL);
644 }
645
/*!
 * \brief Handle an executor event received over a remote connection
 *
 * Events are handled in this order:
 * - new-client events complete an expected takeover, or trigger a disconnect
 *   if no takeover was expected
 * - exec-complete events are passed to lrm_op_callback() unless the
 *   connection has been taken over
 * - disconnect events with no command in flight are handled according to
 *   whether the connection was active and whether the node is in maintenance
 * - anything else is matched against the connection's current command
 *   (connect events for "up" actions; poke and disconnect events for
 *   monitors)
 *
 * \param[in,out] op  Event to handle (must have a remote node name)
 */
void
remote_lrm_op_callback(lrmd_event_data_t * op)
{
    gboolean cmd_handled = FALSE;
    lrm_state_t *lrm_state = NULL;
    remote_ra_data_t *ra_data = NULL;
    remote_ra_cmd_t *cmd = NULL;

    CRM_CHECK((op != NULL) && (op->remote_nodename != NULL), return);

    crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
              "(%d) status=%s (%d)",
              (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
              lrmd_event_type2str(op->type), op->remote_nodename,
              crm_exit_str((crm_exit_t) op->rc), op->rc,
              pcmk_exec_status_str(op->op_status), op->op_status);

    lrm_state = controld_get_executor_state(op->remote_nodename, false);
    if (!lrm_state || !lrm_state->remote_ra_data) {
        crm_debug("No state information found for remote connection event");
        return;
    }
    ra_data = lrm_state->remote_ra_data;

    if (op->type == lrmd_event_new_client) {
        // Another client has connected to the remote executor

        if (pcmk_is_set(ra_data->status, expect_takeover)) {
            // Expected (a migrate_to was processed earlier): record completion
            lrm_remote_clear_flags(lrm_state, expect_takeover);
            lrm_remote_set_flags(lrm_state, takeover_complete);

        } else {
            crm_err("Disconnecting from Pacemaker Remote node %s due to "
                    "unexpected client takeover", op->remote_nodename);

            /* NOTE(review): the "_only" variant is used instead of
             * lrm_state_disconnect(); presumably it skips extra cleanup,
             * but its definition is not visible here
             */
            lrm_state_disconnect_only(lrm_state);
        }
        return;
    }

    // Results of actions run on the remote node itself
    if (op->type == lrmd_event_exec_complete) {
        if (pcmk_is_set(ra_data->status, takeover_complete)) {
            crm_debug("ignoring event, this connection is taken over by another node");
        } else {
            lrm_op_callback(op);
        }
        return;
    }

    if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {
        // Disconnected while no connection command was in flight

        if (!pcmk_is_set(ra_data->status, remote_active)) {
            // The connection was not marked active, so this was expected
            crm_debug("Disconnection from Pacemaker Remote node %s complete",
                      lrm_state->node_name);

        } else if (!remote_ra_is_in_maintenance(lrm_state)) {
            // Unexpected loss: fail every pending recurring monitor
            crm_err("Lost connection to Pacemaker Remote node %s",
                    lrm_state->node_name);
            ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
            ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);

        } else {
            crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
                       lrm_state->node_name);
            /* In maintenance: treat the loss as a clean stop. The connection
             * is torn down, the node is marked down, and a successful stop
             * result is generated.
             */
            handle_remote_ra_stop(lrm_state, NULL);
            remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
            // NULL makes synthesize_lrmd_success() look up the state itself
            synthesize_lrmd_success(NULL, lrm_state->node_name,
                                    PCMK_ACTION_STOP);
        }
        return;
    }

    if (!ra_data->cur_cmd) {
        crm_debug("no event to match");
        return;
    }

    cmd = ra_data->cur_cmd;

    /* From here on, the event is matched against the command that is
     * currently waiting for an asynchronous response
     */
    if ((op->type == lrmd_event_connect) && pcmk__is_up_action(cmd->action)) {
        if (op->connection_rc < 0) {
            int remaining = remaining_timeout_sec(cmd);

            if ((op->connection_rc == -ENOKEY)
                || (op->connection_rc == -EKEYREJECTED)) {
                // Key problems are reported at once, without any retry
                pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
                                 PCMK_EXEC_ERROR,
                                 pcmk_strerror(op->connection_rc));

            } else if (remaining > 3) {
                /* Retry in one second; the command stays current, and
                 * retry_start_cmd_cb() picks it up from there
                 */
                crm_trace("Rescheduling start (%ds remains before timeout)",
                          remaining);
                pcmk__create_timer(1000, retry_start_cmd_cb, lrm_state);
                return;

            } else {
                crm_trace("Not enough time before timeout (%ds) "
                          "to reschedule start", remaining);
                pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                    PCMK_EXEC_TIMEOUT,
                                    "%s without enough time to retry",
                                    pcmk_strerror(op->connection_rc));
            }

        } else {
            // Connected: reset the state's tables and mark it active
            lrm_state_reset_tables(lrm_state, TRUE);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            lrm_remote_set_flags(lrm_state, remote_active);
        }

        crm_debug("Remote connection event matched %s action", cmd->action);
        report_remote_ra_result(cmd);
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_poke)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                               pcmk__str_casei)) {

        // The poke was answered, so the response timer is no longer needed
        if (cmd->monitor_timeout_id) {
            g_source_remove(cmd->monitor_timeout_id);
            cmd->monitor_timeout_id = 0;
        }

        /* Success is reported only the first time; later successful
         * recurrences of the same monitor are not reported again
         */
        if (!pcmk_is_set(cmd->status, cmd_reported_success)) {
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
            cmd_set_flags(cmd, cmd_reported_success);
        }

        crm_debug("Remote poke event matched %s action", cmd->action);

        // A recurring monitor that was not cancelled is scheduled again
        if (cmd->interval_ms && !pcmk_is_set(cmd->status, cmd_cancel)) {
            ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
            cmd->interval_id = pcmk__create_timer(cmd->interval_ms,
                                                  recurring_helper, cmd);
            // The recurring list owns it now; keep free_cmd() below off it
            cmd = NULL;
        }
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_disconnect)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                               pcmk__str_casei)) {
        // A failure is reported only for an active, non-cancelled monitor
        if (pcmk_is_set(ra_data->status, remote_active) &&
            !pcmk_is_set(cmd->status, cmd_cancel)) {
            pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                             PCMK_EXEC_ERROR,
                             "Remote connection unexpectedly dropped "
                             "during monitor");
            report_remote_ra_result(cmd);
            crm_err("Remote connection to %s unexpectedly dropped during monitor",
                    lrm_state->node_name);
        }
        cmd_handled = TRUE;

    } else {
        crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
    }

    if (cmd_handled) {
        // Nothing is in flight any more, so queued commands may run
        ra_data->cur_cmd = NULL;
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        // cmd is NULL here if it was rescheduled above (free_cmd ignores NULL)
        free_cmd(cmd);
    }
}
825
826 static void
827 handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
828 {
829 remote_ra_data_t *ra_data = NULL;
830
831 pcmk__assert(lrm_state != NULL);
832 ra_data = lrm_state->remote_ra_data;
833
834 if (!pcmk_is_set(ra_data->status, takeover_complete)) {
835
836 g_hash_table_remove_all(lrm_state->active_ops);
837 } else {
838
839
840 lrm_state_reset_tables(lrm_state, FALSE);
841 }
842
843 lrm_remote_clear_flags(lrm_state, remote_active);
844 lrm_state_disconnect(lrm_state);
845
846 if (ra_data->cmds) {
847 g_list_free_full(ra_data->cmds, free_cmd);
848 }
849 if (ra_data->recurring_cmds) {
850 g_list_free_full(ra_data->recurring_cmds, free_cmd);
851 }
852 ra_data->cmds = NULL;
853 ra_data->recurring_cmds = NULL;
854 ra_data->cur_cmd = NULL;
855
856 if (cmd) {
857 pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
858 report_remote_ra_result(cmd);
859 }
860 }
861
862
863 static int
864 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
865 {
866 const char *server = NULL;
867 lrmd_key_value_t *tmp = NULL;
868 int port = 0;
869 int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
870 int rc = pcmk_rc_ok;
871
872 for (tmp = cmd->params; tmp; tmp = tmp->next) {
873 if (pcmk__strcase_any_of(tmp->key,
874 PCMK_REMOTE_RA_ADDR, PCMK_REMOTE_RA_SERVER,
875 NULL)) {
876 server = tmp->value;
877
878 } else if (pcmk__str_eq(tmp->key, PCMK_REMOTE_RA_PORT,
879 pcmk__str_none)) {
880 port = atoi(tmp->value);
881
882 } else if (pcmk__str_eq(tmp->key, CRM_META "_" PCMK__META_CONTAINER,
883 pcmk__str_none)) {
884 lrm_remote_set_flags(lrm_state, controlling_guest);
885 }
886 }
887
888 rc = controld_connect_remote_executor(lrm_state, server, port,
889 timeout_used);
890 if (rc != pcmk_rc_ok) {
891 pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
892 PCMK_EXEC_ERROR,
893 "Could not connect to Pacemaker Remote node %s: %s",
894 lrm_state->node_name, pcmk_rc_str(rc));
895 }
896 return rc;
897 }
898
/*!
 * \internal
 * \brief Trigger callback that executes queued remote connection commands
 *
 * Commands are taken from the front of the connection's queue one at a time.
 * Processing stops when a command is already in flight, when the command at
 * the front is still waiting for its start delay, or when a command has to
 * wait for an asynchronous event (it then becomes the current command).
 * Commands that complete synchronously are reported and freed here.
 *
 * \param[in,out] user_data  Executor state of the connection (lrm_state_t *)
 *
 * \return Always TRUE
 */
static gboolean
handle_remote_ra_exec(gpointer user_data)
{
    int rc = 0;
    lrm_state_t *lrm_state = user_data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd;
    GList *first = NULL;

    if (ra_data->cur_cmd) {
        // Still waiting on an asynchronous response for another command
        return TRUE;
    }

    while (ra_data->cmds) {
        first = ra_data->cmds;
        cmd = first->data;
        if (cmd->delay_id) {
            // Still delayed; start_delay_helper() will set the trigger again
            return TRUE;
        }

        // Take the command off the queue; it is owned by this loop from here
        ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
        g_list_free_1(first);

        if (pcmk__str_any_of(cmd->action, PCMK_ACTION_START,
                             PCMK_ACTION_MIGRATE_FROM, NULL)) {
            // A new connection starts without any takeover state
            lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete);
            if (handle_remote_ra_start(lrm_state, cmd,
                                       cmd->timeout) == pcmk_rc_ok) {
                // The result is reported when the connect event arrives
                crm_debug("Initiated async remote connection, %s action will complete after connect event",
                          cmd->action);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }
            // The attempt could not be initiated; report the error now
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_MONITOR)) {

            if (lrm_state_is_connected(lrm_state) == TRUE) {
                rc = lrm_state_poke_connection(lrm_state);
                if (rc < 0) {
                    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                     PCMK_EXEC_ERROR, pcmk_strerror(rc));
                }
            } else {
                // Not connected: the monitor reports "not running"
                rc = -1;
                pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
                                 PCMK_EXEC_DONE, "Remote connection inactive");
            }

            if (rc == 0) {
                // Wait for the poke response, but only up to the timeout
                crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
                          cmd->rsc_id);
                ra_data->cur_cmd = cmd;
                cmd->monitor_timeout_id = pcmk__create_timer(cmd->timeout, monitor_timeout_cb, cmd);
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_STOP)) {

            if (pcmk_is_set(ra_data->status, expect_takeover)) {
                /* A takeover is expected but has not happened yet, so the
                 * stop is held back for up to half of its timeout. If the
                 * timer fires, connection_takeover_timeout_cb() completes it.
                 * NOTE(review): presumably the other half of the timeout is
                 * kept for the stop itself -- confirm
                 */
                cmd->takeover_timeout_id = pcmk__create_timer((cmd->timeout/2),
                                                              connection_takeover_timeout_cb,
                                                              cmd);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }

            /* This also discards everything left in the queue, which ends
             * the loop after this iteration
             */
            handle_remote_ra_stop(lrm_state, cmd);

        } else if (strcmp(cmd->action, PCMK_ACTION_MIGRATE_TO) == 0) {
            // Only records that another node is expected to take over
            lrm_remote_clear_flags(lrm_state, takeover_complete);
            lrm_remote_set_flags(lrm_state, expect_takeover);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);

        } else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_RELOAD,
                                    PCMK_ACTION_RELOAD_AGENT, NULL)) {
            /* Reloads are reported as successful without doing anything.
             * NOTE(review): presumably the connection's parameters are only
             * read when connecting, so there is nothing to reload -- confirm
             */
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
        }

        // Only commands that completed synchronously reach this point
        free_cmd(cmd);
    }

    return TRUE;
}
1005
1006 static void
1007 remote_ra_data_init(lrm_state_t * lrm_state)
1008 {
1009 remote_ra_data_t *ra_data = NULL;
1010
1011 if (lrm_state->remote_ra_data) {
1012 return;
1013 }
1014
1015 ra_data = pcmk__assert_alloc(1, sizeof(remote_ra_data_t));
1016 ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
1017 lrm_state->remote_ra_data = ra_data;
1018 }
1019
1020 void
1021 remote_ra_cleanup(lrm_state_t * lrm_state)
1022 {
1023 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1024
1025 if (!ra_data) {
1026 return;
1027 }
1028
1029 if (ra_data->cmds) {
1030 g_list_free_full(ra_data->cmds, free_cmd);
1031 }
1032
1033 if (ra_data->recurring_cmds) {
1034 g_list_free_full(ra_data->recurring_cmds, free_cmd);
1035 }
1036 mainloop_destroy_trigger(ra_data->work);
1037 free(ra_data);
1038 lrm_state->remote_ra_data = NULL;
1039 }
1040
1041 gboolean
1042 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
1043 {
1044 if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
1045 return TRUE;
1046 }
1047 return (id != NULL) && (controld_get_executor_state(id, false) != NULL)
1048 && !controld_is_local_node(id);
1049 }
1050
1051 lrmd_rsc_info_t *
1052 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
1053 {
1054 lrmd_rsc_info_t *info = NULL;
1055
1056 CRM_CHECK(rsc_id != NULL, return NULL);
1057
1058 if (controld_get_executor_state(rsc_id, false) != NULL) {
1059 info = pcmk__assert_alloc(1, sizeof(lrmd_rsc_info_t));
1060
1061 info->id = pcmk__str_copy(rsc_id);
1062 info->type = pcmk__str_copy(REMOTE_LRMD_RA);
1063 info->standard = pcmk__str_copy(PCMK_RESOURCE_CLASS_OCF);
1064 info->provider = pcmk__str_copy("pacemaker");
1065 }
1066
1067 return info;
1068 }
1069
1070 static gboolean
1071 is_remote_ra_supported_action(const char *action)
1072 {
1073 return pcmk__str_any_of(action,
1074 PCMK_ACTION_START,
1075 PCMK_ACTION_STOP,
1076 PCMK_ACTION_MONITOR,
1077 PCMK_ACTION_MIGRATE_TO,
1078 PCMK_ACTION_MIGRATE_FROM,
1079 PCMK_ACTION_RELOAD_AGENT,
1080 PCMK_ACTION_RELOAD,
1081 NULL);
1082 }
1083
/*!
 * \internal
 * \brief Fail and remove every recurring monitor in a command list
 *
 * \param[in,out] list  List of remote_ra_cmd_t to process
 *
 * \return New head of \p list, with all recurring monitors removed (each one
 *         has been reported as failed and freed)
 */
static GList *
fail_all_monitor_cmds(GList * list)
{
    GList *rm_list = NULL;
    remote_ra_cmd_t *cmd = NULL;
    GList *gIter = NULL;

    /* First pass: only collect the matching commands, so that the list is
     * not modified while it is being walked
     */
    for (gIter = list; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms > 0)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            rm_list = g_list_append(rm_list, cmd);
        }
    }

    // Second pass: report, unlink, and free each collected command
    for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;

        pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                         PCMK_EXEC_ERROR, "Lost connection to remote executor");
        crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
                  cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
        report_remote_ra_result(cmd);

        list = g_list_remove(list, cmd);
        free_cmd(cmd);
    }

    // Only the list cells are freed; the commands were freed above
    g_list_free(rm_list);
    return list;
}
1117
1118 static GList *
1119 remove_cmd(GList * list, const char *action, guint interval_ms)
1120 {
1121 remote_ra_cmd_t *cmd = NULL;
1122 GList *gIter = NULL;
1123
1124 for (gIter = list; gIter != NULL; gIter = gIter->next) {
1125 cmd = gIter->data;
1126 if ((cmd->interval_ms == interval_ms)
1127 && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
1128 break;
1129 }
1130 cmd = NULL;
1131 }
1132 if (cmd) {
1133 list = g_list_remove(list, cmd);
1134 free_cmd(cmd);
1135 }
1136 return list;
1137 }
1138
1139 int
1140 remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
1141 const char *action, guint interval_ms)
1142 {
1143 lrm_state_t *connection_rsc = NULL;
1144 remote_ra_data_t *ra_data = NULL;
1145
1146 CRM_CHECK(rsc_id != NULL, return -EINVAL);
1147
1148 connection_rsc = controld_get_executor_state(rsc_id, false);
1149 if (!connection_rsc || !connection_rsc->remote_ra_data) {
1150 return -EINVAL;
1151 }
1152
1153 ra_data = connection_rsc->remote_ra_data;
1154 ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1155 ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1156 interval_ms);
1157 if (ra_data->cur_cmd &&
1158 (ra_data->cur_cmd->interval_ms == interval_ms) &&
1159 (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1160
1161 cmd_set_flags(ra_data->cur_cmd, cmd_cancel);
1162 }
1163
1164 return 0;
1165 }
1166
/*!
 * \internal
 * \brief Merge a new recurring monitor request into an existing one
 *
 * The current command (unless cancelled), the recurring list, and the pending
 * list are searched, in that order, for a monitor with the same interval.
 *
 * \param[in,out] ra_data      Remote connection data to search
 * \param[in]     interval_ms  Interval of the requested monitor
 * \param[in]     userdata     If not NULL, replaces the existing command's
 *                             user data
 *
 * \return Existing command that the request was merged into, or NULL if the
 *         request is not recurring or no matching monitor exists
 */
static remote_ra_cmd_t *
handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
                   const char *userdata)
{
    GList *gIter = NULL;
    remote_ra_cmd_t *cmd = NULL;

    // Only recurring monitors are merged
    if (interval_ms == 0) {
        return NULL;
    }

    if (ra_data->cur_cmd &&
        !pcmk_is_set(ra_data->cur_cmd->status, cmd_cancel) &&
        (ra_data->cur_cmd->interval_ms == interval_ms)
        && pcmk__str_eq(ra_data->cur_cmd->action, PCMK_ACTION_MONITOR,
                        pcmk__str_casei)) {

        cmd = ra_data->cur_cmd;
        goto handle_dup;
    }

    for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    return NULL;

handle_dup:

    crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
              cmd->rsc_id, PCMK_ACTION_MONITOR, interval_ms);

    // The newest request's user data is the one that gets reported
    if (userdata) {
        free(cmd->userdata);
        cmd->userdata = pcmk__str_copy(userdata);
    }

    /* If the existing monitor already reported success, make it look like a
     * fresh command (new start time and call ID), and clear the flag so that
     * its next result is reported again
     */
    if (pcmk_is_set(cmd->status, cmd_reported_success)) {
        cmd->start_time = time(NULL);
        cmd->call_id = generate_callid();
        cmd_clear_flags(cmd, cmd_reported_success);
    }

    /* If the monitor is waiting for its interval timer, cancel the timer and
     * queue the monitor for execution right away
     */
    if (cmd->interval_id) {
        g_source_remove(cmd->interval_id);
        cmd->interval_id = 0;
        recurring_helper(cmd);
    }

    return cmd;
}
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262 int
1263 controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id,
1264 const char *action, const char *userdata,
1265 guint interval_ms, int timeout_ms,
1266 int start_delay_ms, lrmd_key_value_t *params,
1267 int *call_id)
1268 {
1269 lrm_state_t *connection_rsc = NULL;
1270 remote_ra_cmd_t *cmd = NULL;
1271 remote_ra_data_t *ra_data = NULL;
1272
1273 *call_id = 0;
1274
1275 CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
1276 && (userdata != NULL) && (call_id != NULL),
1277 lrmd_key_value_freeall(params); return EINVAL);
1278
1279 if (!is_remote_ra_supported_action(action)) {
1280 lrmd_key_value_freeall(params);
1281 return EOPNOTSUPP;
1282 }
1283
1284 connection_rsc = controld_get_executor_state(rsc_id, false);
1285 if (connection_rsc == NULL) {
1286 lrmd_key_value_freeall(params);
1287 return ENOTCONN;
1288 }
1289
1290 remote_ra_data_init(connection_rsc);
1291 ra_data = connection_rsc->remote_ra_data;
1292
1293 cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1294 if (cmd) {
1295 *call_id = cmd->call_id;
1296 lrmd_key_value_freeall(params);
1297 return pcmk_rc_ok;
1298 }
1299
1300 cmd = pcmk__assert_alloc(1, sizeof(remote_ra_cmd_t));
1301
1302 cmd->owner = pcmk__str_copy(lrm_state->node_name);
1303 cmd->rsc_id = pcmk__str_copy(rsc_id);
1304 cmd->action = pcmk__str_copy(action);
1305 cmd->userdata = pcmk__str_copy(userdata);
1306 cmd->interval_ms = interval_ms;
1307 cmd->timeout = timeout_ms;
1308 cmd->start_delay = start_delay_ms;
1309 cmd->params = params;
1310 cmd->start_time = time(NULL);
1311
1312 cmd->call_id = generate_callid();
1313
1314 if (cmd->start_delay) {
1315 cmd->delay_id = pcmk__create_timer(cmd->start_delay, start_delay_helper, cmd);
1316 }
1317
1318 ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1319 mainloop_set_trigger(ra_data->work);
1320
1321 *call_id = cmd->call_id;
1322 return pcmk_rc_ok;
1323 }
1324
1325
1326
1327
1328
1329
1330
1331 void
1332 remote_ra_fail(const char *node_name)
1333 {
1334 lrm_state_t *lrm_state = NULL;
1335
1336 CRM_CHECK(node_name != NULL, return);
1337
1338 lrm_state = controld_get_executor_state(node_name, false);
1339 if (lrm_state && lrm_state_is_connected(lrm_state)) {
1340 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1341
1342 crm_info("Failing monitors on Pacemaker Remote node %s", node_name);
1343 ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1344 ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1345 }
1346 }
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
/* XPath used by remote_ra_process_pseudo(): starting from the document root,
 * select the node entries listed under the "downed" child of a pseudo-event
 * whose operation attribute is 'stonith'. (Element and attribute names come
 * from the PCMK_* constants.)
 */
#define XPATH_PSEUDO_FENCE                                          \
    "/" PCMK__XE_PSEUDO_EVENT "[@" PCMK_XA_OPERATION "='stonith']"  \
    "/" PCMK__XE_DOWNED                                             \
    "/" PCMK_XE_NODE
1361
1362
1363
1364
1365
1366
1367
1368 void
1369 remote_ra_process_pseudo(xmlNode *xml)
1370 {
1371 xmlXPathObject *search = pcmk__xpath_search(xml->doc, XPATH_PSEUDO_FENCE);
1372
1373 if (pcmk__xpath_num_results(search) == 1) {
1374 xmlNode *result = pcmk__xpath_result(search, 0);
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390 if (result) {
1391 const char *remote = pcmk__xe_id(result);
1392
1393 if (remote) {
1394 remote_node_down(remote, DOWN_ERASE_LRM);
1395 }
1396 }
1397 }
1398 xmlXPathFreeObject(search);
1399 }
1400
1401 static void
1402 remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
1403 {
1404 xmlNode *update, *state;
1405 int call_opt;
1406 pcmk__node_status_t *node = NULL;
1407
1408 call_opt = crmd_cib_smart_opt();
1409 node = pcmk__cluster_lookup_remote_node(lrm_state->node_name);
1410 CRM_CHECK(node != NULL, return);
1411 update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
1412 state = create_node_state_update(node, controld_node_update_none, update,
1413 __func__);
1414 crm_xml_add(state, PCMK__XA_NODE_IN_MAINTENANCE, (maintenance? "1" : "0"));
1415 if (controld_update_cib(PCMK_XE_STATUS, update, call_opt,
1416 NULL) == pcmk_rc_ok) {
1417
1418 if (maintenance) {
1419 lrm_remote_set_flags(lrm_state, remote_in_maint);
1420 } else {
1421 lrm_remote_clear_flags(lrm_state, remote_in_maint);
1422 }
1423 }
1424 pcmk__xml_free(update);
1425 }
1426
/* XPath used by remote_ra_process_maintenance_nodes(): anywhere in the
 * document, select the maintenance child of a pseudo-event whose operation
 * attribute equals PCMK_ACTION_MAINTENANCE_NODES.
 */
#define XPATH_PSEUDO_MAINTENANCE                                    \
    "//" PCMK__XE_PSEUDO_EVENT                                      \
    "[@" PCMK_XA_OPERATION "='" PCMK_ACTION_MAINTENANCE_NODES "']"  \
    "/" PCMK__XE_MAINTENANCE
1430
1431
1432
1433
1434
1435
1436
1437 void
1438 remote_ra_process_maintenance_nodes(xmlNode *xml)
1439 {
1440 xmlXPathObject *search = pcmk__xpath_search(xml->doc,
1441 XPATH_PSEUDO_MAINTENANCE);
1442
1443 if (pcmk__xpath_num_results(search) == 1) {
1444 xmlNode *node;
1445 int cnt = 0, cnt_remote = 0;
1446
1447 for (node = pcmk__xe_first_child(pcmk__xpath_result(search, 0),
1448 PCMK_XE_NODE, NULL, NULL);
1449 node != NULL; node = pcmk__xe_next(node, PCMK_XE_NODE)) {
1450
1451 lrm_state_t *lrm_state = NULL;
1452 const char *id = pcmk__xe_id(node);
1453
1454 cnt++;
1455 if (id == NULL) {
1456 continue;
1457 }
1458
1459 lrm_state = controld_get_executor_state(id, false);
1460
1461 if (lrm_state && lrm_state->remote_ra_data &&
1462 pcmk_is_set(((remote_ra_data_t *) lrm_state->remote_ra_data)->status, remote_active)) {
1463
1464 const char *in_maint_s = NULL;
1465 int in_maint;
1466
1467 cnt_remote++;
1468 in_maint_s = crm_element_value(node,
1469 PCMK__XA_NODE_IN_MAINTENANCE);
1470 pcmk__scan_min_int(in_maint_s, &in_maint, 0);
1471 remote_ra_maintenance(lrm_state, in_maint);
1472 }
1473 }
1474 crm_trace("Action holds %d nodes (%d remotes found) adjusting "
1475 PCMK_OPT_MAINTENANCE_MODE,
1476 cnt, cnt_remote);
1477 }
1478 xmlXPathFreeObject(search);
1479 }
1480
1481 gboolean
1482 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
1483 {
1484 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1485 return pcmk_is_set(ra_data->status, remote_in_maint);
1486 }
1487
1488 gboolean
1489 remote_ra_controlling_guest(lrm_state_t * lrm_state)
1490 {
1491 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1492 return pcmk_is_set(ra_data->status, controlling_guest);
1493 }