This source file includes the following definitions.
- free_cmd
- generate_callid
- recurring_helper
- start_delay_helper
- should_purge_attributes
- section_to_delete
- purge_remote_node_attrs
- remote_node_up
- remote_node_down
- check_remote_node_state
- report_remote_ra_result
- update_remaining_timeout
- retry_start_cmd_cb
- connection_takeover_timeout_cb
- monitor_timeout_cb
- synthesize_lrmd_success
- remote_lrm_op_callback
- handle_remote_ra_stop
- handle_remote_ra_start
- handle_remote_ra_exec
- remote_ra_data_init
- remote_ra_cleanup
- is_remote_lrmd_ra
- remote_ra_get_rsc_info
- is_remote_ra_supported_action
- fail_all_monitor_cmds
- remove_cmd
- remote_ra_cancel
- handle_dup_monitor
- controld_execute_remote_agent
- remote_ra_fail
- remote_ra_process_pseudo
- remote_ra_maintenance
- remote_ra_process_maintenance_nodes
- remote_ra_is_in_maintenance
- remote_ra_controlling_guest
1
2
3
4
5
6
7
8
9
#include <crm_internal.h>

#include <limits.h>                 // INT_MAX

#include <crm/crm.h>
#include <crm/common/xml.h>
#include <crm/common/xml_internal.h>
#include <crm/lrmd.h>
#include <crm/lrmd_internal.h>
#include <crm/services.h>

#include <pacemaker-controld.h>
20
/* Agent type name used to recognize, and reported for, remote connection
 * resources (see is_remote_lrmd_ra() and remote_ra_get_rsc_info())
 */
#define REMOTE_LRMD_RA "remote"

/* Upper bound, in milliseconds, on the timeout handed to a single connection
 * attempt by handle_remote_ra_start()
 */
#define MAX_START_TIMEOUT_MS (10 * 1000)
25
/* Set the given bits in a remote command's status word.
 *
 * The update goes through pcmk__set_flags_as() with LOG_TRACE as the log
 * level, "Remote command" as the flag-group label and the command's resource
 * ID as the target name. Both arguments are evaluated more than once.
 */
#define cmd_set_flags(cmd, flags_to_set) do {                               \
        (cmd)->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE,   \
                                           "Remote command",                \
                                           (cmd)->rsc_id, (cmd)->status,    \
                                           (flags_to_set), #flags_to_set);  \
    } while (0)
31
/* Clear the given bits in a remote command's status word.
 *
 * Counterpart of cmd_set_flags(): the update goes through
 * pcmk__clear_flags_as() with LOG_TRACE as the log level. Both arguments are
 * evaluated more than once.
 */
#define cmd_clear_flags(cmd, flags_to_clear) do {                               \
        (cmd)->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE,     \
                                             "Remote command",                  \
                                             (cmd)->rsc_id, (cmd)->status,      \
                                             (flags_to_clear), #flags_to_clear);\
    } while (0)
37
/* Bit flags kept in remote_ra_cmd_t:status */
enum remote_cmd_status {
    /* A successful result has already been reported for this command (set
     * after the first successful poke response of a monitor)
     */
    cmd_reported_success    = 0x01,

    /* Cancellation was requested while this was the command in progress */
    cmd_cancel              = 0x02,
};
42
43 typedef struct remote_ra_cmd_s {
44
45 char *owner;
46
47 char *rsc_id;
48
49 char *action;
50
51 char *userdata;
52
53 int start_delay;
54
55 int delay_id;
56
57 int timeout;
58 int remaining_timeout;
59
60 guint interval_ms;
61
62 int interval_id;
63 int monitor_timeout_id;
64 int takeover_timeout_id;
65
66 lrmd_key_value_t *params;
67 pcmk__action_result_t result;
68 int call_id;
69 time_t start_time;
70 uint32_t status;
71 } remote_ra_cmd_t;
72
/* Set the given enum remote_status bits on an executor state's remote data.
 *
 * lrm_state is evaluated once; its remote_ra_data must be non-NULL because it
 * is dereferenced unconditionally. The update goes through
 * pcmk__set_flags_as() with LOG_TRACE as the log level.
 */
#define lrm_remote_set_flags(lrm_state, flags_to_set) do {                  \
        lrm_state_t *lrm = (lrm_state);                                     \
        remote_ra_data_t *ra = lrm->remote_ra_data;                         \
                                                                            \
        ra->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE,      \
                                        "Remote", lrm->node_name,           \
                                        ra->status, (flags_to_set),         \
                                        #flags_to_set);                     \
    } while (0)
80
/* Clear the given enum remote_status bits on an executor state's remote data.
 *
 * Counterpart of lrm_remote_set_flags(); the same non-NULL requirement on
 * remote_ra_data applies.
 */
#define lrm_remote_clear_flags(lrm_state, flags_to_clear) do {              \
        lrm_state_t *lrm = (lrm_state);                                     \
        remote_ra_data_t *ra = lrm->remote_ra_data;                         \
                                                                            \
        ra->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE,    \
                                          "Remote", lrm->node_name,         \
                                          ra->status, (flags_to_clear),     \
                                          #flags_to_clear);                 \
    } while (0)
88
/* Bit flags kept in remote_ra_data_t:status */
enum remote_status {
    /* Set when a migrate_to is handled: a later new-client event is expected,
     * and a stop waits for it
     */
    expect_takeover     = 0x01,

    /* Set when the expected new-client event has arrived */
    takeover_complete   = 0x02,

    /* Set after a successful connect event, cleared when the connection is
     * stopped
     */
    remote_active       = 0x04,

    /* Not referenced in this part of the file.
     * NOTE(review): presumably marks the remote node as being in maintenance
     * mode -- confirm against remote_ra_maintenance()
     */
    remote_in_maint     = 0x08,

    /* Set by handle_remote_ra_start() when the start parameters include the
     * CRM_META "_" PCMK__META_CONTAINER key
     */
    controlling_guest   = 0x10,
};
105
106 typedef struct remote_ra_data_s {
107 crm_trigger_t *work;
108 remote_ra_cmd_t *cur_cmd;
109 GList *cmds;
110 GList *recurring_cmds;
111 uint32_t status;
112 } remote_ra_data_t;
113
114 static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
115 static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
116 static GList *fail_all_monitor_cmds(GList * list);
117
118 static void
119 free_cmd(gpointer user_data)
120 {
121 remote_ra_cmd_t *cmd = user_data;
122
123 if (!cmd) {
124 return;
125 }
126 if (cmd->delay_id) {
127 g_source_remove(cmd->delay_id);
128 }
129 if (cmd->interval_id) {
130 g_source_remove(cmd->interval_id);
131 }
132 if (cmd->monitor_timeout_id) {
133 g_source_remove(cmd->monitor_timeout_id);
134 }
135 if (cmd->takeover_timeout_id) {
136 g_source_remove(cmd->takeover_timeout_id);
137 }
138 free(cmd->owner);
139 free(cmd->rsc_id);
140 free(cmd->action);
141 free(cmd->userdata);
142 pcmk__reset_result(&(cmd->result));
143 lrmd_key_value_freeall(cmd->params);
144 free(cmd);
145 }
146
/*!
 * \internal
 * \brief Generate the next call ID for a remote connection command
 *
 * IDs start at 1 and increase by one per call. After INT_MAX has been handed
 * out, the sequence restarts at 1, so the result is always positive.
 *
 * \return Next call ID (always greater than 0)
 */
static int
generate_callid(void)
{
    static int remote_ra_callid = 0;

    /* Test for the limit before incrementing: incrementing INT_MAX would be
     * signed overflow, which is undefined behavior, so wraparound cannot be
     * detected reliably after the fact
     */
    if (remote_ra_callid == INT_MAX) {
        remote_ra_callid = 1;
    } else {
        remote_ra_callid++;
    }

    return remote_ra_callid;
}
159
160 static gboolean
161 recurring_helper(gpointer data)
162 {
163 remote_ra_cmd_t *cmd = data;
164 lrm_state_t *connection_rsc = NULL;
165
166 cmd->interval_id = 0;
167 connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
168 if (connection_rsc && connection_rsc->remote_ra_data) {
169 remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
170
171 ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
172
173 ra_data->cmds = g_list_append(ra_data->cmds, cmd);
174 mainloop_set_trigger(ra_data->work);
175 }
176 return FALSE;
177 }
178
179 static gboolean
180 start_delay_helper(gpointer data)
181 {
182 remote_ra_cmd_t *cmd = data;
183 lrm_state_t *connection_rsc = NULL;
184
185 cmd->delay_id = 0;
186 connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
187 if (connection_rsc && connection_rsc->remote_ra_data) {
188 remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
189
190 mainloop_set_trigger(ra_data->work);
191 }
192 return FALSE;
193 }
194
195 static bool
196 should_purge_attributes(pcmk__node_status_t *node)
197 {
198 pcmk__node_status_t *conn_node = NULL;
199 lrm_state_t *connection_rsc = NULL;
200
201 if ((node->conn_host == NULL) || (node->name == NULL)) {
202 return true;
203 }
204
205
206
207
208 conn_node = pcmk__get_node(0, node->conn_host, NULL,
209 pcmk__node_search_cluster_member);
210 if (conn_node == NULL) {
211 return true;
212 }
213
214
215
216
217
218 connection_rsc = controld_get_executor_state(node->name, false);
219
220 if (connection_rsc != NULL) {
221 lrmd_t *lrm = connection_rsc->conn;
222 time_t uptime = lrmd__uptime(lrm);
223 time_t now = time(NULL);
224
225
226
227
228
229 if (uptime > 0 &&
230 conn_node->peer_lost > 0 &&
231 uptime + 20 >= now - conn_node->peer_lost) {
232 return false;
233 }
234 }
235
236 return true;
237 }
238
239 static enum controld_section_e
240 section_to_delete(bool purge)
241 {
242 if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
243 if (purge) {
244 return controld_section_all_unlocked;
245 } else {
246 return controld_section_lrm_unlocked;
247 }
248 } else {
249 if (purge) {
250 return controld_section_all;
251 } else {
252 return controld_section_lrm;
253 }
254 }
255 }
256
257 static void
258 purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node)
259 {
260 bool purge = should_purge_attributes(node);
261 enum controld_section_e section = section_to_delete(purge);
262
263
264 if (purge) {
265 update_attrd_remote_node_removed(node->name, NULL);
266 }
267
268 controld_delete_node_state(node->name, section, call_opt);
269 }
270
271
272
273
274
275
276
277 static void
278 remote_node_up(const char *node_name)
279 {
280 int call_opt;
281 xmlNode *update, *state;
282 pcmk__node_status_t *node = NULL;
283 lrm_state_t *connection_rsc = NULL;
284
285 CRM_CHECK(node_name != NULL, return);
286 crm_info("Announcing Pacemaker Remote node %s", node_name);
287
288 call_opt = crmd_cib_smart_opt();
289
290
291
292
293
294
295
296
297
298 update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);
299
300
301 node = pcmk__cluster_lookup_remote_node(node_name);
302 CRM_CHECK((node != NULL) && (node->name != NULL), return);
303
304 purge_remote_node_attrs(call_opt, node);
305 pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);
306
307
308
309
310 connection_rsc = controld_get_executor_state(node->name, false);
311
312 if (connection_rsc != NULL) {
313 lrmd_t *lrm = connection_rsc->conn;
314 const char *start_state = lrmd__node_start_state(lrm);
315
316 if (start_state) {
317 set_join_state(start_state, node->name, node->xml_id, true);
318 }
319 }
320
321
322
323
324
325
326
327 broadcast_remote_state_message(node_name, true);
328
329 update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
330 state = create_node_state_update(node, node_update_cluster, update,
331 __func__);
332
333
334
335
336
337 crm_xml_add(state, PCMK__XA_NODE_FENCED, "0");
338
339
340
341
342
343
344
345
346 controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
347 pcmk__xml_free(update);
348 }
349
/* How much stored node state remote_node_down() should delete */
enum down_opts {
    DOWN_KEEP_LRM,      /* delete only the node attributes section */
    DOWN_ERASE_LRM      /* delete the node's entire state */
};
354
355
356
357
358
359
360
361
362 static void
363 remote_node_down(const char *node_name, const enum down_opts opts)
364 {
365 xmlNode *update;
366 int call_opt = crmd_cib_smart_opt();
367 pcmk__node_status_t *node = NULL;
368
369
370 update_attrd_remote_node_removed(node_name, NULL);
371
372
373
374
375
376
377 if (opts == DOWN_ERASE_LRM) {
378 controld_delete_node_state(node_name, controld_section_all, call_opt);
379 } else {
380 controld_delete_node_state(node_name, controld_section_attrs, call_opt);
381 }
382
383
384 node = pcmk__cluster_lookup_remote_node(node_name);
385 CRM_CHECK(node != NULL, return);
386 pcmk__update_peer_state(__func__, node, PCMK__VALUE_LOST, 0);
387
388
389 broadcast_remote_state_message(node_name, false);
390
391
392 update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
393 create_node_state_update(node, node_update_cluster, update, __func__);
394 controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
395 pcmk__xml_free(update);
396 }
397
398
399
400
401
402
403
404 static void
405 check_remote_node_state(const remote_ra_cmd_t *cmd)
406 {
407
408 if (!pcmk__result_ok(&(cmd->result))) {
409 return;
410 }
411
412 if (pcmk__str_eq(cmd->action, PCMK_ACTION_START, pcmk__str_casei)) {
413 remote_node_up(cmd->rsc_id);
414
415 } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_MIGRATE_FROM,
416 pcmk__str_casei)) {
417
418
419
420
421
422
423
424 pcmk__node_status_t *node =
425 pcmk__cluster_lookup_remote_node(cmd->rsc_id);
426
427 CRM_CHECK(node != NULL, return);
428 pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);
429
430 } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_casei)) {
431 lrm_state_t *lrm_state = controld_get_executor_state(cmd->rsc_id,
432 false);
433 remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;
434
435 if (ra_data) {
436 if (!pcmk_is_set(ra_data->status, takeover_complete)) {
437
438 remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
439 } else if (AM_I_DC == FALSE) {
440
441
442
443
444 pcmk__cluster_forget_remote_node(cmd->rsc_id);
445 }
446 }
447 }
448
449
450
451
452
453
454
455
456
457
458 }
459
460 static void
461 report_remote_ra_result(remote_ra_cmd_t * cmd)
462 {
463 lrmd_event_data_t op = { 0, };
464
465 check_remote_node_state(cmd);
466
467 op.type = lrmd_event_exec_complete;
468 op.rsc_id = cmd->rsc_id;
469 op.op_type = cmd->action;
470 op.user_data = cmd->userdata;
471 op.timeout = cmd->timeout;
472 op.interval_ms = cmd->interval_ms;
473 op.t_run = cmd->start_time;
474 op.t_rcchange = cmd->start_time;
475
476 lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
477 cmd->result.exit_reason);
478
479 if (pcmk_is_set(cmd->status, cmd_reported_success) && !pcmk__result_ok(&(cmd->result))) {
480 op.t_rcchange = time(NULL);
481
482
483
484
485
486
487
488
489
490 if (op.t_rcchange == op.t_run) {
491 op.t_rcchange++;
492 }
493 }
494
495 if (cmd->params) {
496 lrmd_key_value_t *tmp;
497
498 op.params = pcmk__strkey_table(free, free);
499 for (tmp = cmd->params; tmp; tmp = tmp->next) {
500 pcmk__insert_dup(op.params, tmp->key, tmp->value);
501 }
502
503 }
504 op.call_id = cmd->call_id;
505 op.remote_nodename = cmd->owner;
506
507 lrm_op_callback(&op);
508
509 if (op.params) {
510 g_hash_table_destroy(op.params);
511 }
512 lrmd__reset_result(&op);
513 }
514
515 static void
516 update_remaining_timeout(remote_ra_cmd_t * cmd)
517 {
518 cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
519 }
520
521 static gboolean
522 retry_start_cmd_cb(gpointer data)
523 {
524 lrm_state_t *lrm_state = data;
525 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
526 remote_ra_cmd_t *cmd = NULL;
527 int rc = ETIME;
528
529 if (!ra_data || !ra_data->cur_cmd) {
530 return FALSE;
531 }
532 cmd = ra_data->cur_cmd;
533 if (!pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
534 PCMK_ACTION_MIGRATE_FROM, NULL)) {
535 return FALSE;
536 }
537 update_remaining_timeout(cmd);
538
539 if (cmd->remaining_timeout > 0) {
540 rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
541 } else {
542 pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
543 PCMK_EXEC_TIMEOUT,
544 "Not enough time remains to retry remote connection");
545 }
546
547 if (rc != pcmk_rc_ok) {
548 report_remote_ra_result(cmd);
549
550 if (ra_data->cmds) {
551 mainloop_set_trigger(ra_data->work);
552 }
553 ra_data->cur_cmd = NULL;
554 free_cmd(cmd);
555 } else {
556
557 }
558
559 return FALSE;
560 }
561
562
563 static gboolean
564 connection_takeover_timeout_cb(gpointer data)
565 {
566 lrm_state_t *lrm_state = NULL;
567 remote_ra_cmd_t *cmd = data;
568
569 crm_info("takeover event timed out for node %s", cmd->rsc_id);
570 cmd->takeover_timeout_id = 0;
571
572 lrm_state = controld_get_executor_state(cmd->rsc_id, false);
573
574 handle_remote_ra_stop(lrm_state, cmd);
575 free_cmd(cmd);
576
577 return FALSE;
578 }
579
580 static gboolean
581 monitor_timeout_cb(gpointer data)
582 {
583 lrm_state_t *lrm_state = NULL;
584 remote_ra_cmd_t *cmd = data;
585
586 lrm_state = controld_get_executor_state(cmd->rsc_id, false);
587
588 crm_info("Timed out waiting for remote poke response from %s%s",
589 cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
590 cmd->monitor_timeout_id = 0;
591 pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
592 "Remote executor did not respond");
593
594 if (lrm_state && lrm_state->remote_ra_data) {
595 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
596
597 if (ra_data->cur_cmd == cmd) {
598 ra_data->cur_cmd = NULL;
599 }
600 if (ra_data->cmds) {
601 mainloop_set_trigger(ra_data->work);
602 }
603 }
604
605 report_remote_ra_result(cmd);
606 free_cmd(cmd);
607
608 if(lrm_state) {
609 lrm_state_disconnect(lrm_state);
610 }
611 return FALSE;
612 }
613
614 static void
615 synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
616 {
617 lrmd_event_data_t op = { 0, };
618
619 if (lrm_state == NULL) {
620
621 lrm_state = controld_get_executor_state(NULL, false);
622 }
623 pcmk__assert(lrm_state != NULL);
624
625 op.type = lrmd_event_exec_complete;
626 op.rsc_id = rsc_id;
627 op.op_type = op_type;
628 op.t_run = time(NULL);
629 op.t_rcchange = op.t_run;
630 op.call_id = generate_callid();
631 lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
632 process_lrm_event(lrm_state, &op, NULL, NULL);
633 }
634
635 void
636 remote_lrm_op_callback(lrmd_event_data_t * op)
637 {
638 gboolean cmd_handled = FALSE;
639 lrm_state_t *lrm_state = NULL;
640 remote_ra_data_t *ra_data = NULL;
641 remote_ra_cmd_t *cmd = NULL;
642
643 CRM_CHECK((op != NULL) && (op->remote_nodename != NULL), return);
644
645 crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
646 "(%d) status=%s (%d)",
647 (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
648 lrmd_event_type2str(op->type), op->remote_nodename,
649 crm_exit_str((crm_exit_t) op->rc), op->rc,
650 pcmk_exec_status_str(op->op_status), op->op_status);
651
652 lrm_state = controld_get_executor_state(op->remote_nodename, false);
653 if (!lrm_state || !lrm_state->remote_ra_data) {
654 crm_debug("No state information found for remote connection event");
655 return;
656 }
657 ra_data = lrm_state->remote_ra_data;
658
659 if (op->type == lrmd_event_new_client) {
660
661
662 if (pcmk_is_set(ra_data->status, expect_takeover)) {
663
664 lrm_remote_clear_flags(lrm_state, expect_takeover);
665 lrm_remote_set_flags(lrm_state, takeover_complete);
666
667 } else {
668 crm_err("Disconnecting from Pacemaker Remote node %s due to "
669 "unexpected client takeover", op->remote_nodename);
670
671
672
673 lrm_state_disconnect_only(lrm_state);
674 }
675 return;
676 }
677
678
679 if (op->type == lrmd_event_exec_complete) {
680 if (pcmk_is_set(ra_data->status, takeover_complete)) {
681 crm_debug("ignoring event, this connection is taken over by another node");
682 } else {
683 lrm_op_callback(op);
684 }
685 return;
686 }
687
688 if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {
689
690 if (!pcmk_is_set(ra_data->status, remote_active)) {
691 crm_debug("Disconnection from Pacemaker Remote node %s complete",
692 lrm_state->node_name);
693
694 } else if (!remote_ra_is_in_maintenance(lrm_state)) {
695 crm_err("Lost connection to Pacemaker Remote node %s",
696 lrm_state->node_name);
697 ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
698 ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
699
700 } else {
701 crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
702 lrm_state->node_name);
703
704 handle_remote_ra_stop(lrm_state, NULL);
705 remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
706
707 synthesize_lrmd_success(NULL, lrm_state->node_name,
708 PCMK_ACTION_STOP);
709 }
710 return;
711 }
712
713 if (!ra_data->cur_cmd) {
714 crm_debug("no event to match");
715 return;
716 }
717
718 cmd = ra_data->cur_cmd;
719
720
721
722 if ((op->type == lrmd_event_connect)
723 && pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
724 PCMK_ACTION_MIGRATE_FROM, NULL)) {
725 if (op->connection_rc < 0) {
726 update_remaining_timeout(cmd);
727
728 if ((op->connection_rc == -ENOKEY)
729 || (op->connection_rc == -EKEYREJECTED)) {
730
731 pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
732 PCMK_EXEC_ERROR,
733 pcmk_strerror(op->connection_rc));
734
735 } else if (cmd->remaining_timeout > 3000) {
736 crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
737 pcmk__create_timer(1000, retry_start_cmd_cb, lrm_state);
738 return;
739
740 } else {
741 crm_trace("can't reschedule start, remaining timeout too small %d",
742 cmd->remaining_timeout);
743 pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
744 PCMK_EXEC_TIMEOUT,
745 "%s without enough time to retry",
746 pcmk_strerror(op->connection_rc));
747 }
748
749 } else {
750 lrm_state_reset_tables(lrm_state, TRUE);
751 pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
752 lrm_remote_set_flags(lrm_state, remote_active);
753 }
754
755 crm_debug("Remote connection event matched %s action", cmd->action);
756 report_remote_ra_result(cmd);
757 cmd_handled = TRUE;
758
759 } else if ((op->type == lrmd_event_poke)
760 && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
761 pcmk__str_casei)) {
762
763 if (cmd->monitor_timeout_id) {
764 g_source_remove(cmd->monitor_timeout_id);
765 cmd->monitor_timeout_id = 0;
766 }
767
768
769
770
771 if (!pcmk_is_set(cmd->status, cmd_reported_success)) {
772 pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
773 report_remote_ra_result(cmd);
774 cmd_set_flags(cmd, cmd_reported_success);
775 }
776
777 crm_debug("Remote poke event matched %s action", cmd->action);
778
779
780 if (cmd->interval_ms && !pcmk_is_set(cmd->status, cmd_cancel)) {
781 ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
782 cmd->interval_id = pcmk__create_timer(cmd->interval_ms,
783 recurring_helper, cmd);
784 cmd = NULL;
785 }
786 cmd_handled = TRUE;
787
788 } else if ((op->type == lrmd_event_disconnect)
789 && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
790 pcmk__str_casei)) {
791 if (pcmk_is_set(ra_data->status, remote_active) &&
792 !pcmk_is_set(cmd->status, cmd_cancel)) {
793 pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
794 PCMK_EXEC_ERROR,
795 "Remote connection unexpectedly dropped "
796 "during monitor");
797 report_remote_ra_result(cmd);
798 crm_err("Remote connection to %s unexpectedly dropped during monitor",
799 lrm_state->node_name);
800 }
801 cmd_handled = TRUE;
802
803 } else {
804 crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
805 }
806
807 if (cmd_handled) {
808 ra_data->cur_cmd = NULL;
809 if (ra_data->cmds) {
810 mainloop_set_trigger(ra_data->work);
811 }
812 free_cmd(cmd);
813 }
814 }
815
816 static void
817 handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
818 {
819 remote_ra_data_t *ra_data = NULL;
820
821 pcmk__assert(lrm_state != NULL);
822 ra_data = lrm_state->remote_ra_data;
823
824 if (!pcmk_is_set(ra_data->status, takeover_complete)) {
825
826 g_hash_table_remove_all(lrm_state->active_ops);
827 } else {
828
829
830 lrm_state_reset_tables(lrm_state, FALSE);
831 }
832
833 lrm_remote_clear_flags(lrm_state, remote_active);
834 lrm_state_disconnect(lrm_state);
835
836 if (ra_data->cmds) {
837 g_list_free_full(ra_data->cmds, free_cmd);
838 }
839 if (ra_data->recurring_cmds) {
840 g_list_free_full(ra_data->recurring_cmds, free_cmd);
841 }
842 ra_data->cmds = NULL;
843 ra_data->recurring_cmds = NULL;
844 ra_data->cur_cmd = NULL;
845
846 if (cmd) {
847 pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
848 report_remote_ra_result(cmd);
849 }
850 }
851
852
853 static int
854 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
855 {
856 const char *server = NULL;
857 lrmd_key_value_t *tmp = NULL;
858 int port = 0;
859 int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
860 int rc = pcmk_rc_ok;
861
862 for (tmp = cmd->params; tmp; tmp = tmp->next) {
863 if (pcmk__strcase_any_of(tmp->key,
864 PCMK_REMOTE_RA_ADDR, PCMK_REMOTE_RA_SERVER,
865 NULL)) {
866 server = tmp->value;
867
868 } else if (pcmk__str_eq(tmp->key, PCMK_REMOTE_RA_PORT,
869 pcmk__str_none)) {
870 port = atoi(tmp->value);
871
872 } else if (pcmk__str_eq(tmp->key, CRM_META "_" PCMK__META_CONTAINER,
873 pcmk__str_none)) {
874 lrm_remote_set_flags(lrm_state, controlling_guest);
875 }
876 }
877
878 rc = controld_connect_remote_executor(lrm_state, server, port,
879 timeout_used);
880 if (rc != pcmk_rc_ok) {
881 pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
882 PCMK_EXEC_ERROR,
883 "Could not connect to Pacemaker Remote node %s: %s",
884 lrm_state->node_name, pcmk_rc_str(rc));
885 }
886 return rc;
887 }
888
889 static gboolean
890 handle_remote_ra_exec(gpointer user_data)
891 {
892 int rc = 0;
893 lrm_state_t *lrm_state = user_data;
894 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
895 remote_ra_cmd_t *cmd;
896 GList *first = NULL;
897
898 if (ra_data->cur_cmd) {
899
900 return TRUE;
901 }
902
903 while (ra_data->cmds) {
904 first = ra_data->cmds;
905 cmd = first->data;
906 if (cmd->delay_id) {
907
908 return TRUE;
909 }
910
911 ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
912 g_list_free_1(first);
913
914 if (pcmk__str_any_of(cmd->action, PCMK_ACTION_START,
915 PCMK_ACTION_MIGRATE_FROM, NULL)) {
916 lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete);
917 if (handle_remote_ra_start(lrm_state, cmd,
918 cmd->timeout) == pcmk_rc_ok) {
919
920 crm_debug("Initiated async remote connection, %s action will complete after connect event",
921 cmd->action);
922 ra_data->cur_cmd = cmd;
923 return TRUE;
924 }
925 report_remote_ra_result(cmd);
926
927 } else if (!strcmp(cmd->action, PCMK_ACTION_MONITOR)) {
928
929 if (lrm_state_is_connected(lrm_state) == TRUE) {
930 rc = lrm_state_poke_connection(lrm_state);
931 if (rc < 0) {
932 pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
933 PCMK_EXEC_ERROR, pcmk_strerror(rc));
934 }
935 } else {
936 rc = -1;
937 pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
938 PCMK_EXEC_DONE, "Remote connection inactive");
939 }
940
941 if (rc == 0) {
942 crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
943 cmd->rsc_id);
944 ra_data->cur_cmd = cmd;
945 cmd->monitor_timeout_id = pcmk__create_timer(cmd->timeout, monitor_timeout_cb, cmd);
946 return TRUE;
947 }
948 report_remote_ra_result(cmd);
949
950 } else if (!strcmp(cmd->action, PCMK_ACTION_STOP)) {
951
952 if (pcmk_is_set(ra_data->status, expect_takeover)) {
953
954
955
956
957
958
959 cmd->takeover_timeout_id = pcmk__create_timer((cmd->timeout/2),
960 connection_takeover_timeout_cb,
961 cmd);
962 ra_data->cur_cmd = cmd;
963 return TRUE;
964 }
965
966 handle_remote_ra_stop(lrm_state, cmd);
967
968 } else if (strcmp(cmd->action, PCMK_ACTION_MIGRATE_TO) == 0) {
969 lrm_remote_clear_flags(lrm_state, takeover_complete);
970 lrm_remote_set_flags(lrm_state, expect_takeover);
971 pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
972 report_remote_ra_result(cmd);
973
974 } else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_RELOAD,
975 PCMK_ACTION_RELOAD_AGENT, NULL)) {
976
977
978
979
980
981
982
983
984
985 pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
986 report_remote_ra_result(cmd);
987 }
988
989 free_cmd(cmd);
990 }
991
992 return TRUE;
993 }
994
995 static void
996 remote_ra_data_init(lrm_state_t * lrm_state)
997 {
998 remote_ra_data_t *ra_data = NULL;
999
1000 if (lrm_state->remote_ra_data) {
1001 return;
1002 }
1003
1004 ra_data = pcmk__assert_alloc(1, sizeof(remote_ra_data_t));
1005 ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
1006 lrm_state->remote_ra_data = ra_data;
1007 }
1008
1009 void
1010 remote_ra_cleanup(lrm_state_t * lrm_state)
1011 {
1012 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1013
1014 if (!ra_data) {
1015 return;
1016 }
1017
1018 if (ra_data->cmds) {
1019 g_list_free_full(ra_data->cmds, free_cmd);
1020 }
1021
1022 if (ra_data->recurring_cmds) {
1023 g_list_free_full(ra_data->recurring_cmds, free_cmd);
1024 }
1025 mainloop_destroy_trigger(ra_data->work);
1026 free(ra_data);
1027 lrm_state->remote_ra_data = NULL;
1028 }
1029
1030 gboolean
1031 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
1032 {
1033 if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
1034 return TRUE;
1035 }
1036 return (id != NULL) && (controld_get_executor_state(id, false) != NULL)
1037 && !controld_is_local_node(id);
1038 }
1039
1040 lrmd_rsc_info_t *
1041 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
1042 {
1043 lrmd_rsc_info_t *info = NULL;
1044
1045 CRM_CHECK(rsc_id != NULL, return NULL);
1046
1047 if (controld_get_executor_state(rsc_id, false) != NULL) {
1048 info = pcmk__assert_alloc(1, sizeof(lrmd_rsc_info_t));
1049
1050 info->id = pcmk__str_copy(rsc_id);
1051 info->type = pcmk__str_copy(REMOTE_LRMD_RA);
1052 info->standard = pcmk__str_copy(PCMK_RESOURCE_CLASS_OCF);
1053 info->provider = pcmk__str_copy("pacemaker");
1054 }
1055
1056 return info;
1057 }
1058
1059 static gboolean
1060 is_remote_ra_supported_action(const char *action)
1061 {
1062 return pcmk__str_any_of(action,
1063 PCMK_ACTION_START,
1064 PCMK_ACTION_STOP,
1065 PCMK_ACTION_MONITOR,
1066 PCMK_ACTION_MIGRATE_TO,
1067 PCMK_ACTION_MIGRATE_FROM,
1068 PCMK_ACTION_RELOAD_AGENT,
1069 PCMK_ACTION_RELOAD,
1070 NULL);
1071 }
1072
1073 static GList *
1074 fail_all_monitor_cmds(GList * list)
1075 {
1076 GList *rm_list = NULL;
1077 remote_ra_cmd_t *cmd = NULL;
1078 GList *gIter = NULL;
1079
1080 for (gIter = list; gIter != NULL; gIter = gIter->next) {
1081 cmd = gIter->data;
1082 if ((cmd->interval_ms > 0)
1083 && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1084 pcmk__str_casei)) {
1085 rm_list = g_list_append(rm_list, cmd);
1086 }
1087 }
1088
1089 for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
1090 cmd = gIter->data;
1091
1092 pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
1093 PCMK_EXEC_ERROR, "Lost connection to remote executor");
1094 crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
1095 cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
1096 report_remote_ra_result(cmd);
1097
1098 list = g_list_remove(list, cmd);
1099 free_cmd(cmd);
1100 }
1101
1102
1103 g_list_free(rm_list);
1104 return list;
1105 }
1106
1107 static GList *
1108 remove_cmd(GList * list, const char *action, guint interval_ms)
1109 {
1110 remote_ra_cmd_t *cmd = NULL;
1111 GList *gIter = NULL;
1112
1113 for (gIter = list; gIter != NULL; gIter = gIter->next) {
1114 cmd = gIter->data;
1115 if ((cmd->interval_ms == interval_ms)
1116 && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
1117 break;
1118 }
1119 cmd = NULL;
1120 }
1121 if (cmd) {
1122 list = g_list_remove(list, cmd);
1123 free_cmd(cmd);
1124 }
1125 return list;
1126 }
1127
1128 int
1129 remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
1130 const char *action, guint interval_ms)
1131 {
1132 lrm_state_t *connection_rsc = NULL;
1133 remote_ra_data_t *ra_data = NULL;
1134
1135 CRM_CHECK(rsc_id != NULL, return -EINVAL);
1136
1137 connection_rsc = controld_get_executor_state(rsc_id, false);
1138 if (!connection_rsc || !connection_rsc->remote_ra_data) {
1139 return -EINVAL;
1140 }
1141
1142 ra_data = connection_rsc->remote_ra_data;
1143 ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1144 ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1145 interval_ms);
1146 if (ra_data->cur_cmd &&
1147 (ra_data->cur_cmd->interval_ms == interval_ms) &&
1148 (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1149
1150 cmd_set_flags(ra_data->cur_cmd, cmd_cancel);
1151 }
1152
1153 return 0;
1154 }
1155
1156 static remote_ra_cmd_t *
1157 handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
1158 const char *userdata)
1159 {
1160 GList *gIter = NULL;
1161 remote_ra_cmd_t *cmd = NULL;
1162
1163
1164
1165
1166
1167
1168
1169 if (interval_ms == 0) {
1170 return NULL;
1171 }
1172
1173 if (ra_data->cur_cmd &&
1174 !pcmk_is_set(ra_data->cur_cmd->status, cmd_cancel) &&
1175 (ra_data->cur_cmd->interval_ms == interval_ms)
1176 && pcmk__str_eq(ra_data->cur_cmd->action, PCMK_ACTION_MONITOR,
1177 pcmk__str_casei)) {
1178
1179 cmd = ra_data->cur_cmd;
1180 goto handle_dup;
1181 }
1182
1183 for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
1184 cmd = gIter->data;
1185 if ((cmd->interval_ms == interval_ms)
1186 && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1187 pcmk__str_casei)) {
1188 goto handle_dup;
1189 }
1190 }
1191
1192 for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
1193 cmd = gIter->data;
1194 if ((cmd->interval_ms == interval_ms)
1195 && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1196 pcmk__str_casei)) {
1197 goto handle_dup;
1198 }
1199 }
1200
1201 return NULL;
1202
1203 handle_dup:
1204
1205 crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
1206 cmd->rsc_id, PCMK_ACTION_MONITOR, interval_ms);
1207
1208
1209 if (userdata) {
1210 free(cmd->userdata);
1211 cmd->userdata = pcmk__str_copy(userdata);
1212 }
1213
1214
1215 if (pcmk_is_set(cmd->status, cmd_reported_success)) {
1216 cmd->start_time = time(NULL);
1217 cmd->call_id = generate_callid();
1218 cmd_clear_flags(cmd, cmd_reported_success);
1219 }
1220
1221
1222
1223
1224 if (cmd->interval_id) {
1225 g_source_remove(cmd->interval_id);
1226 cmd->interval_id = 0;
1227 recurring_helper(cmd);
1228 }
1229
1230 return cmd;
1231 }
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251 int
1252 controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id,
1253 const char *action, const char *userdata,
1254 guint interval_ms, int timeout_ms,
1255 int start_delay_ms, lrmd_key_value_t *params,
1256 int *call_id)
1257 {
1258 lrm_state_t *connection_rsc = NULL;
1259 remote_ra_cmd_t *cmd = NULL;
1260 remote_ra_data_t *ra_data = NULL;
1261
1262 *call_id = 0;
1263
1264 CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
1265 && (userdata != NULL) && (call_id != NULL),
1266 lrmd_key_value_freeall(params); return EINVAL);
1267
1268 if (!is_remote_ra_supported_action(action)) {
1269 lrmd_key_value_freeall(params);
1270 return EOPNOTSUPP;
1271 }
1272
1273 connection_rsc = controld_get_executor_state(rsc_id, false);
1274 if (connection_rsc == NULL) {
1275 lrmd_key_value_freeall(params);
1276 return ENOTCONN;
1277 }
1278
1279 remote_ra_data_init(connection_rsc);
1280 ra_data = connection_rsc->remote_ra_data;
1281
1282 cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1283 if (cmd) {
1284 *call_id = cmd->call_id;
1285 lrmd_key_value_freeall(params);
1286 return pcmk_rc_ok;
1287 }
1288
1289 cmd = pcmk__assert_alloc(1, sizeof(remote_ra_cmd_t));
1290
1291 cmd->owner = pcmk__str_copy(lrm_state->node_name);
1292 cmd->rsc_id = pcmk__str_copy(rsc_id);
1293 cmd->action = pcmk__str_copy(action);
1294 cmd->userdata = pcmk__str_copy(userdata);
1295 cmd->interval_ms = interval_ms;
1296 cmd->timeout = timeout_ms;
1297 cmd->start_delay = start_delay_ms;
1298 cmd->params = params;
1299 cmd->start_time = time(NULL);
1300
1301 cmd->call_id = generate_callid();
1302
1303 if (cmd->start_delay) {
1304 cmd->delay_id = pcmk__create_timer(cmd->start_delay, start_delay_helper, cmd);
1305 }
1306
1307 ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1308 mainloop_set_trigger(ra_data->work);
1309
1310 *call_id = cmd->call_id;
1311 return pcmk_rc_ok;
1312 }
1313
1314
1315
1316
1317
1318
1319
1320 void
1321 remote_ra_fail(const char *node_name)
1322 {
1323 lrm_state_t *lrm_state = NULL;
1324
1325 CRM_CHECK(node_name != NULL, return);
1326
1327 lrm_state = controld_get_executor_state(node_name, false);
1328 if (lrm_state && lrm_state_is_connected(lrm_state)) {
1329 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1330
1331 crm_info("Failing monitors on Pacemaker Remote node %s", node_name);
1332 ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1333 ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1334 }
1335 }
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348 #define XPATH_PSEUDO_FENCE "/" PCMK__XE_PSEUDO_EVENT \
1349 "[@" PCMK_XA_OPERATION "='stonith']/" PCMK__XE_DOWNED "/" PCMK_XE_NODE
1350
1351
1352
1353
1354
1355
1356
1357 void
1358 remote_ra_process_pseudo(xmlNode *xml)
1359 {
1360 xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
1361
1362 if (numXpathResults(search) == 1) {
1363 xmlNode *result = getXpathResult(search, 0);
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379 if (result) {
1380 const char *remote = pcmk__xe_id(result);
1381
1382 if (remote) {
1383 remote_node_down(remote, DOWN_ERASE_LRM);
1384 }
1385 }
1386 }
1387 freeXpathObject(search);
1388 }
1389
1390 static void
1391 remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
1392 {
1393 xmlNode *update, *state;
1394 int call_opt;
1395 pcmk__node_status_t *node = NULL;
1396
1397 call_opt = crmd_cib_smart_opt();
1398 node = pcmk__cluster_lookup_remote_node(lrm_state->node_name);
1399 CRM_CHECK(node != NULL, return);
1400 update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
1401 state = create_node_state_update(node, node_update_none, update,
1402 __func__);
1403 crm_xml_add(state, PCMK__XA_NODE_IN_MAINTENANCE, (maintenance? "1" : "0"));
1404 if (controld_update_cib(PCMK_XE_STATUS, update, call_opt,
1405 NULL) == pcmk_rc_ok) {
1406
1407 if (maintenance) {
1408 lrm_remote_set_flags(lrm_state, remote_in_maint);
1409 } else {
1410 lrm_remote_clear_flags(lrm_state, remote_in_maint);
1411 }
1412 }
1413 pcmk__xml_free(update);
1414 }
1415
1416 #define XPATH_PSEUDO_MAINTENANCE "//" PCMK__XE_PSEUDO_EVENT \
1417 "[@" PCMK_XA_OPERATION "='" PCMK_ACTION_MAINTENANCE_NODES "']/" \
1418 PCMK__XE_MAINTENANCE
1419
1420
1421
1422
1423
1424
1425
1426 void
1427 remote_ra_process_maintenance_nodes(xmlNode *xml)
1428 {
1429 xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);
1430
1431 if (numXpathResults(search) == 1) {
1432 xmlNode *node;
1433 int cnt = 0, cnt_remote = 0;
1434
1435 for (node = pcmk__xe_first_child(getXpathResult(search, 0),
1436 PCMK_XE_NODE, NULL, NULL);
1437 node != NULL; node = pcmk__xe_next(node, PCMK_XE_NODE)) {
1438
1439 lrm_state_t *lrm_state = NULL;
1440 const char *id = pcmk__xe_id(node);
1441
1442 cnt++;
1443 if (id == NULL) {
1444 continue;
1445 }
1446
1447 lrm_state = controld_get_executor_state(id, false);
1448
1449 if (lrm_state && lrm_state->remote_ra_data &&
1450 pcmk_is_set(((remote_ra_data_t *) lrm_state->remote_ra_data)->status, remote_active)) {
1451
1452 const char *in_maint_s = NULL;
1453 int in_maint;
1454
1455 cnt_remote++;
1456 in_maint_s = crm_element_value(node,
1457 PCMK__XA_NODE_IN_MAINTENANCE);
1458 pcmk__scan_min_int(in_maint_s, &in_maint, 0);
1459 remote_ra_maintenance(lrm_state, in_maint);
1460 }
1461 }
1462 crm_trace("Action holds %d nodes (%d remotes found) adjusting "
1463 PCMK_OPT_MAINTENANCE_MODE,
1464 cnt, cnt_remote);
1465 }
1466 freeXpathObject(search);
1467 }
1468
1469 gboolean
1470 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
1471 {
1472 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1473 return pcmk_is_set(ra_data->status, remote_in_maint);
1474 }
1475
1476 gboolean
1477 remote_ra_controlling_guest(lrm_state_t * lrm_state)
1478 {
1479 remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1480 return pcmk_is_set(ra_data->status, controlling_guest);
1481 }