root/daemons/controld/controld_remote_ra.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. free_cmd
  2. generate_callid
  3. recurring_helper
  4. start_delay_helper
  5. remote_node_up
  6. remote_node_down
  7. check_remote_node_state
  8. report_remote_ra_result
  9. update_remaining_timeout
  10. retry_start_cmd_cb
  11. connection_takeover_timeout_cb
  12. monitor_timeout_cb
  13. synthesize_lrmd_success
  14. remote_lrm_op_callback
  15. handle_remote_ra_stop
  16. handle_remote_ra_start
  17. handle_remote_ra_exec
  18. remote_ra_data_init
  19. remote_ra_cleanup
  20. is_remote_lrmd_ra
  21. remote_ra_get_rsc_info
  22. is_remote_ra_supported_action
  23. fail_all_monitor_cmds
  24. remove_cmd
  25. remote_ra_cancel
  26. handle_dup_monitor
  27. controld_execute_remote_agent
  28. remote_ra_fail
  29. remote_ra_process_pseudo
  30. remote_ra_maintenance
  31. remote_ra_process_maintenance_nodes
  32. remote_ra_is_in_maintenance
  33. remote_ra_controlling_guest

   1 /*
   2  * Copyright 2013-2022 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
#include <crm_internal.h>

#include <limits.h>

#include <crm/crm.h>
#include <crm/msg_xml.h>
#include <crm/common/xml_internal.h>
#include <crm/lrmd.h>
#include <crm/lrmd_internal.h>
#include <crm/services.h>

#include <pacemaker-controld.h>
  20 
/* Agent type name for remote connection resources. It is not referenced in
 * this part of the file -- NOTE(review): confirm against its users.
 */
#define REMOTE_LRMD_RA "remote"

/* The max start timeout before cmd retry: handle_remote_ra_start() never
 * passes more than this many milliseconds to a single connection attempt.
 */
#define MAX_START_TIMEOUT_MS 10000
  25 
/* A single action requested on a remote connection resource. A command is
 * either queued (remote_ra_data_t:cmds or :recurring_cmds) or in flight
 * (remote_ra_data_t:cur_cmd), and is released with free_cmd().
 */
typedef struct remote_ra_cmd_s {
    /*! the local node the cmd is issued from */
    char *owner;
    /*! the remote node the cmd is executed on */
    char *rsc_id;
    /*! the action to execute */
    char *action;
    /*! some string the client wants us to give it back */
    char *userdata;
    /*! start delay in ms */
    int start_delay;
    /*! timer id used for start delay. */
    int delay_id;
    /*! timeout in ms for cmd */
    int timeout;
    /*! ms left of timeout, as last computed by update_remaining_timeout() */
    int remaining_timeout;
    /*! recurring interval in ms */
    guint interval_ms;
    /*! interval timer id */
    int interval_id;
    /*! nonzero once a successful monitor result has been reported */
    int reported_success;
    /*! GLib source id of the pending monitor timeout (0 if none) */
    int monitor_timeout_id;
    /*! GLib source id of the pending takeover timeout (0 if none) */
    int takeover_timeout_id;
    /*! action parameters */
    lrmd_key_value_t *params;
    /*! outcome that report_remote_ra_result() passes on */
    pcmk__action_result_t result;
    /*! call ID copied into the reported event */
    int call_id;
    /*! reported as the run time; also the base for remaining_timeout */
    time_t start_time;
    /*! if TRUE, a recurring monitor is not rescheduled after a poke and a
     *  dropped connection is not reported as a monitor failure
     */
    gboolean cancel;
} remote_ra_cmd_t;
  56 
/* Progress of another client taking over the remote connection, as tracked by
 * the lrmd_event_new_client handling in remote_lrm_op_callback()
 */
enum remote_migration_status {
    /* a new client connecting to the remote daemon is anticipated */
    expect_takeover = 1,
    /* the anticipated new client has connected */
    takeover_complete,
};
  61 
/* State kept per remote connection (lrm_state_t:remote_ra_data) */
typedef struct remote_ra_data_s {
    /* mainloop trigger set whenever queued commands may be ready to run */
    crm_trigger_t *work;
    /* command waiting for an asynchronous event, or NULL if none */
    remote_ra_cmd_t *cur_cmd;
    /* commands waiting to be executed */
    GList *cmds;
    /* recurring monitors waiting for their interval timer to expire */
    GList *recurring_cmds;

    /* takeover progress (0 when neither expected nor complete) */
    enum remote_migration_status migrate_status;

    /* set TRUE after a successful connect, FALSE again by a stop */
    gboolean active;

    /* Maintenance mode is difficult to determine from the controller's context,
     * so we have it signalled back with the transition from the scheduler.
     */
    gboolean is_maintenance;

    /* Similar for whether we are controlling a guest node or remote node.
     * Fortunately there is a meta-attribute in the transition already and
     * as the situation doesn't change over time we can use the
     * resource start for noting down the information for later use when
     * the attributes aren't at hand.
     */
    gboolean controlling_guest;
} remote_ra_data_t;
  85 
/* Forward declarations for helpers that are called before they are defined */
static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
static GList *fail_all_monitor_cmds(GList * list);
  89 
  90 static void
  91 free_cmd(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  92 {
  93     remote_ra_cmd_t *cmd = user_data;
  94 
  95     if (!cmd) {
  96         return;
  97     }
  98     if (cmd->delay_id) {
  99         g_source_remove(cmd->delay_id);
 100     }
 101     if (cmd->interval_id) {
 102         g_source_remove(cmd->interval_id);
 103     }
 104     if (cmd->monitor_timeout_id) {
 105         g_source_remove(cmd->monitor_timeout_id);
 106     }
 107     if (cmd->takeover_timeout_id) {
 108         g_source_remove(cmd->takeover_timeout_id);
 109     }
 110     free(cmd->owner);
 111     free(cmd->rsc_id);
 112     free(cmd->action);
 113     free(cmd->userdata);
 114     pcmk__reset_result(&(cmd->result));
 115     lrmd_key_value_freeall(cmd->params);
 116     free(cmd);
 117 }
 118 
 119 static int
 120 generate_callid(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 121 {
 122     static int remote_ra_callid = 0;
 123 
 124     remote_ra_callid++;
 125     if (remote_ra_callid <= 0) {
 126         remote_ra_callid = 1;
 127     }
 128 
 129     return remote_ra_callid;
 130 }
 131 
 132 static gboolean
 133 recurring_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 134 {
 135     remote_ra_cmd_t *cmd = data;
 136     lrm_state_t *connection_rsc = NULL;
 137 
 138     cmd->interval_id = 0;
 139     connection_rsc = lrm_state_find(cmd->rsc_id);
 140     if (connection_rsc && connection_rsc->remote_ra_data) {
 141         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 142 
 143         ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
 144 
 145         ra_data->cmds = g_list_append(ra_data->cmds, cmd);
 146         mainloop_set_trigger(ra_data->work);
 147     }
 148     return FALSE;
 149 }
 150 
 151 static gboolean
 152 start_delay_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 153 {
 154     remote_ra_cmd_t *cmd = data;
 155     lrm_state_t *connection_rsc = NULL;
 156 
 157     cmd->delay_id = 0;
 158     connection_rsc = lrm_state_find(cmd->rsc_id);
 159     if (connection_rsc && connection_rsc->remote_ra_data) {
 160         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 161 
 162         mainloop_set_trigger(ra_data->work);
 163     }
 164     return FALSE;
 165 }
 166 
 167 /*!
 168  * \internal
 169  * \brief Handle cluster communication related to pacemaker_remote node joining
 170  *
 171  * \param[in] node_name  Name of newly integrated pacemaker_remote node
 172  */
 173 static void
 174 remote_node_up(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
 175 {
 176     int call_opt, call_id = 0;
 177     xmlNode *update, *state;
 178     crm_node_t *node;
 179     enum controld_section_e section = controld_section_all;
 180 
 181     CRM_CHECK(node_name != NULL, return);
 182     crm_info("Announcing Pacemaker Remote node %s", node_name);
 183 
 184     /* Clear node's entire state (resource history and transient attributes)
 185      * other than shutdown locks. The transient attributes should and normally
 186      * will be cleared when the node leaves, but since remote node state has a
 187      * number of corner cases, clear them here as well, to be sure.
 188      */
 189     call_opt = crmd_cib_smart_opt();
 190     if (controld_shutdown_lock_enabled) {
 191         section = controld_section_all_unlocked;
 192     }
 193     /* Purge node from attrd's memory */
 194     update_attrd_remote_node_removed(node_name, NULL);
 195 
 196     controld_delete_node_state(node_name, section, call_opt);
 197 
 198     /* Delete node's probe_complete attribute. This serves two purposes:
 199      *
 200      * - @COMPAT DCs < 1.1.14 in a rolling upgrade might use it
 201      * - deleting it (or any attribute for that matter) here ensures the
 202      *   attribute manager learns the node is remote
 203      */
 204     update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);
 205 
 206     /* Ensure node is in the remote peer cache with member status */
 207     node = crm_remote_peer_get(node_name);
 208     CRM_CHECK(node != NULL, return);
 209     pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
 210 
 211     /* pacemaker_remote nodes don't participate in the membership layer,
 212      * so cluster nodes don't automatically get notified when they come and go.
 213      * We send a cluster message to the DC, and update the CIB node state entry,
 214      * so the DC will get it sooner (via message) or later (via CIB refresh),
 215      * and any other interested parties can query the CIB.
 216      */
 217     send_remote_state_message(node_name, TRUE);
 218 
 219     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
 220     state = create_node_state_update(node, node_update_cluster, update,
 221                                      __func__);
 222 
 223     /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever
 224      * needs to be fenced, this flag will allow various actions to determine
 225      * whether the fencing has happened yet.
 226      */
 227     crm_xml_add(state, XML_NODE_IS_FENCED, "0");
 228 
 229     /* TODO: If the remote connection drops, and this (async) CIB update either
 230      * failed or has not yet completed, later actions could mistakenly think the
 231      * node has already been fenced (if the XML_NODE_IS_FENCED attribute was
 232      * previously set, because it won't have been cleared). This could prevent
 233      * actual fencing or allow recurring monitor failures to be cleared too
 234      * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
 235      */
 236     fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
 237     if (call_id < 0) {
 238         crm_perror(LOG_WARNING, "%s CIB node state setup", node_name);
 239     }
 240     free_xml(update);
 241 }
 242 
/* What remote_node_down() should erase from the lost node's state */
enum down_opts {
    /* erase only the node's attributes; keep its resource (LRM) history */
    DOWN_KEEP_LRM,
    /* erase the node's entire state, including resource (LRM) history */
    DOWN_ERASE_LRM
};
 247 
 248 /*!
 249  * \internal
 250  * \brief Handle cluster communication related to pacemaker_remote node leaving
 251  *
 252  * \param[in] node_name  Name of lost node
 253  * \param[in] opts       Whether to keep or erase LRM history
 254  */
 255 static void
 256 remote_node_down(const char *node_name, const enum down_opts opts)
     /* [previous][next][first][last][top][bottom][index][help] */
 257 {
 258     xmlNode *update;
 259     int call_id = 0;
 260     int call_opt = crmd_cib_smart_opt();
 261     crm_node_t *node;
 262 
 263     /* Purge node from attrd's memory */
 264     update_attrd_remote_node_removed(node_name, NULL);
 265 
 266     /* Normally, only node attributes should be erased, and the resource history
 267      * should be kept until the node comes back up. However, after a successful
 268      * fence, we want to clear the history as well, so we don't think resources
 269      * are still running on the node.
 270      */
 271     if (opts == DOWN_ERASE_LRM) {
 272         controld_delete_node_state(node_name, controld_section_all, call_opt);
 273     } else {
 274         controld_delete_node_state(node_name, controld_section_attrs, call_opt);
 275     }
 276 
 277     /* Ensure node is in the remote peer cache with lost state */
 278     node = crm_remote_peer_get(node_name);
 279     CRM_CHECK(node != NULL, return);
 280     pcmk__update_peer_state(__func__, node, CRM_NODE_LOST, 0);
 281 
 282     /* Notify DC */
 283     send_remote_state_message(node_name, FALSE);
 284 
 285     /* Update CIB node state */
 286     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
 287     create_node_state_update(node, node_update_cluster, update, __func__);
 288     fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
 289     if (call_id < 0) {
 290         crm_perror(LOG_ERR, "%s CIB node state update", node_name);
 291     }
 292     free_xml(update);
 293 }
 294 
 295 /*!
 296  * \internal
 297  * \brief Handle effects of a remote RA command on node state
 298  *
 299  * \param[in] cmd  Completed remote RA command
 300  */
 301 static void
 302 check_remote_node_state(const remote_ra_cmd_t *cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 303 {
 304     /* Only successful actions can change node state */
 305     if (!pcmk__result_ok(&(cmd->result))) {
 306         return;
 307     }
 308 
 309     if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
 310         remote_node_up(cmd->rsc_id);
 311 
 312     } else if (pcmk__str_eq(cmd->action, "migrate_from", pcmk__str_casei)) {
 313         /* After a successful migration, we don't need to do remote_node_up()
 314          * because the DC already knows the node is up, and we don't want to
 315          * clear LRM history etc. We do need to add the remote node to this
 316          * host's remote peer cache, because (unless it happens to be DC)
 317          * it hasn't been tracking the remote node, and other code relies on
 318          * the cache to distinguish remote nodes from unseen cluster nodes.
 319          */
 320         crm_node_t *node = crm_remote_peer_get(cmd->rsc_id);
 321 
 322         CRM_CHECK(node != NULL, return);
 323         pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
 324 
 325     } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
 326         lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
 327         remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;
 328 
 329         if (ra_data) {
 330             if (ra_data->migrate_status != takeover_complete) {
 331                 /* Stop means down if we didn't successfully migrate elsewhere */
 332                 remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
 333             } else if (AM_I_DC == FALSE) {
 334                 /* Only the connection host and DC track node state,
 335                  * so if the connection migrated elsewhere and we aren't DC,
 336                  * un-cache the node, so we don't have stale info
 337                  */
 338                 crm_remote_peer_cache_remove(cmd->rsc_id);
 339             }
 340         }
 341     }
 342 
 343     /* We don't do anything for successful monitors, which is correct for
 344      * routine recurring monitors, and for monitors on nodes where the
 345      * connection isn't supposed to be (the cluster will stop the connection in
 346      * that case). However, if the initial probe finds the connection already
 347      * active on the node where we want it, we probably should do
 348      * remote_node_up(). Unfortunately, we can't distinguish that case here.
 349      * Given that connections have to be initiated by the cluster, the chance of
 350      * that should be close to zero.
 351      */
 352 }
 353 
 354 static void
 355 report_remote_ra_result(remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 356 {
 357     lrmd_event_data_t op = { 0, };
 358 
 359     check_remote_node_state(cmd);
 360 
 361     op.type = lrmd_event_exec_complete;
 362     op.rsc_id = cmd->rsc_id;
 363     op.op_type = cmd->action;
 364     op.user_data = cmd->userdata;
 365     op.timeout = cmd->timeout;
 366     op.interval_ms = cmd->interval_ms;
 367     op.t_run = (unsigned int) cmd->start_time;
 368     op.t_rcchange = (unsigned int) cmd->start_time;
 369 
 370     lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
 371                      cmd->result.exit_reason);
 372 
 373     if (cmd->reported_success && !pcmk__result_ok(&(cmd->result))) {
 374         op.t_rcchange = (unsigned int) time(NULL);
 375         /* This edge case will likely never ever occur, but if it does the
 376          * result is that a failure will not be processed correctly. This is only
 377          * remotely possible because we are able to detect a connection resource's tcp
 378          * connection has failed at any moment after start has completed. The actual
 379          * recurring operation is just a connectivity ping.
 380          *
 381          * basically, we are not guaranteed that the first successful monitor op and
 382          * a subsequent failed monitor op will not occur in the same timestamp. We have to
 383          * make it look like the operations occurred at separate times though. */
 384         if (op.t_rcchange == op.t_run) {
 385             op.t_rcchange++;
 386         }
 387     }
 388 
 389     if (cmd->params) {
 390         lrmd_key_value_t *tmp;
 391 
 392         op.params = pcmk__strkey_table(free, free);
 393         for (tmp = cmd->params; tmp; tmp = tmp->next) {
 394             g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value));
 395         }
 396 
 397     }
 398     op.call_id = cmd->call_id;
 399     op.remote_nodename = cmd->owner;
 400 
 401     lrm_op_callback(&op);
 402 
 403     if (op.params) {
 404         g_hash_table_destroy(op.params);
 405     }
 406     lrmd__reset_result(&op);
 407 }
 408 
 409 static void
 410 update_remaining_timeout(remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 411 {
 412     cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
 413 }
 414 
 415 static gboolean
 416 retry_start_cmd_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 417 {
 418     lrm_state_t *lrm_state = data;
 419     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 420     remote_ra_cmd_t *cmd = NULL;
 421     int rc = ETIME;
 422 
 423     if (!ra_data || !ra_data->cur_cmd) {
 424         return FALSE;
 425     }
 426     cmd = ra_data->cur_cmd;
 427     if (!pcmk__strcase_any_of(cmd->action, "start", "migrate_from", NULL)) {
 428         return FALSE;
 429     }
 430     update_remaining_timeout(cmd);
 431 
 432     if (cmd->remaining_timeout > 0) {
 433         rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
 434     } else {
 435         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 436                          PCMK_EXEC_TIMEOUT,
 437                          "Not enough time remains to retry remote connection");
 438     }
 439 
 440     if (rc != pcmk_rc_ok) {
 441         report_remote_ra_result(cmd);
 442 
 443         if (ra_data->cmds) {
 444             mainloop_set_trigger(ra_data->work);
 445         }
 446         ra_data->cur_cmd = NULL;
 447         free_cmd(cmd);
 448     } else {
 449         /* wait for connection event */
 450     }
 451 
 452     return FALSE;
 453 }
 454 
 455 
 456 static gboolean
 457 connection_takeover_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 458 {
 459     lrm_state_t *lrm_state = NULL;
 460     remote_ra_cmd_t *cmd = data;
 461 
 462     crm_info("takeover event timed out for node %s", cmd->rsc_id);
 463     cmd->takeover_timeout_id = 0;
 464 
 465     lrm_state = lrm_state_find(cmd->rsc_id);
 466 
 467     handle_remote_ra_stop(lrm_state, cmd);
 468     free_cmd(cmd);
 469 
 470     return FALSE;
 471 }
 472 
 473 static gboolean
 474 monitor_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 475 {
 476     lrm_state_t *lrm_state = NULL;
 477     remote_ra_cmd_t *cmd = data;
 478 
 479     lrm_state = lrm_state_find(cmd->rsc_id);
 480 
 481     crm_info("Timed out waiting for remote poke response from %s%s",
 482              cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
 483     cmd->monitor_timeout_id = 0;
 484     pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
 485                      "Remote executor did not respond");
 486 
 487     if (lrm_state && lrm_state->remote_ra_data) {
 488         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 489 
 490         if (ra_data->cur_cmd == cmd) {
 491             ra_data->cur_cmd = NULL;
 492         }
 493         if (ra_data->cmds) {
 494             mainloop_set_trigger(ra_data->work);
 495         }
 496     }
 497 
 498     report_remote_ra_result(cmd);
 499     free_cmd(cmd);
 500 
 501     if(lrm_state) {
 502         lrm_state_disconnect(lrm_state);
 503     }
 504     return FALSE;
 505 }
 506 
 507 static void
 508 synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
     /* [previous][next][first][last][top][bottom][index][help] */
 509 {
 510     lrmd_event_data_t op = { 0, };
 511 
 512     if (lrm_state == NULL) {
 513         /* if lrm_state not given assume local */
 514         lrm_state = lrm_state_find(fsa_our_uname);
 515     }
 516     CRM_ASSERT(lrm_state != NULL);
 517 
 518     op.type = lrmd_event_exec_complete;
 519     op.rsc_id = rsc_id;
 520     op.op_type = op_type;
 521     op.t_run = (unsigned int) time(NULL);
 522     op.t_rcchange = op.t_run;
 523     op.call_id = generate_callid();
 524     lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 525     process_lrm_event(lrm_state, &op, NULL, NULL);
 526 }
 527 
/*!
 * \brief Process an executor event received for a remote connection
 *
 * Events are handled in this order:
 * - new-client events update the takeover status (or, if unexpected, cause a
 *   disconnect) and return
 * - exec-complete events are passed on to lrm_op_callback(), unless the
 *   connection has been taken over
 * - a disconnect with no command in progress is handled according to whether
 *   the connection was active and whether it is in maintenance
 * - anything else is matched against the connection's current command, which
 *   is reported and freed if the event completes it
 *
 * \param[in] op  Event to process (op->remote_nodename identifies the
 *                connection)
 */
void
remote_lrm_op_callback(lrmd_event_data_t * op)
{
    gboolean cmd_handled = FALSE;
    lrm_state_t *lrm_state = NULL;
    remote_ra_data_t *ra_data = NULL;
    remote_ra_cmd_t *cmd = NULL;

    crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
              "(%d) status=%s (%d)",
              (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
              lrmd_event_type2str(op->type), op->remote_nodename,
              services_ocf_exitcode_str(op->rc), op->rc,
              pcmk_exec_status_str(op->op_status), op->op_status);

    lrm_state = lrm_state_find(op->remote_nodename);
    if (!lrm_state || !lrm_state->remote_ra_data) {
        crm_debug("No state information found for remote connection event");
        return;
    }
    ra_data = lrm_state->remote_ra_data;

    if (op->type == lrmd_event_new_client) {
        // Another client has connected to the remote daemon

        if (ra_data->migrate_status == expect_takeover) {
            // Great, we knew this was coming
            ra_data->migrate_status = takeover_complete;

        } else {
            crm_err("Disconnecting from Pacemaker Remote node %s due to "
                    "unexpected client takeover", op->remote_nodename);
            /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
            /* Do not free lrm_state->conn yet. */
            /* It'll be freed in the following stop action. */
            lrm_state_disconnect_only(lrm_state);
        }
        return;
    }

    /* filter all EXEC events up */
    if (op->type == lrmd_event_exec_complete) {
        if (ra_data->migrate_status == takeover_complete) {
            crm_debug("ignoring event, this connection is taken over by another node");
        } else {
            lrm_op_callback(op);
        }
        return;
    }

    /* Disconnect while no command is waiting on the connection */
    if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {

        if (ra_data->active == FALSE) {
            crm_debug("Disconnection from Pacemaker Remote node %s complete",
                      lrm_state->node_name);

        } else if (!remote_ra_is_in_maintenance(lrm_state)) {
            crm_err("Lost connection to Pacemaker Remote node %s",
                    lrm_state->node_name);
            ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
            ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);

        } else {
            crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
                       lrm_state->node_name);
            /* Do roughly what a 'stop' on the remote-resource would do */
            handle_remote_ra_stop(lrm_state, NULL);
            remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
            /* now fake the reply of a successful 'stop' */
            synthesize_lrmd_success(NULL, lrm_state->node_name, "stop");
        }
        return;
    }

    if (!ra_data->cur_cmd) {
        crm_debug("no event to match");
        return;
    }

    cmd = ra_data->cur_cmd;

    /* Start actions and migrate from actions complete after connection
     * comes back to us. */
    if (op->type == lrmd_event_connect && pcmk__strcase_any_of(cmd->action, "start",
                                                               "migrate_from", NULL)) {
        if (op->connection_rc < 0) {
            update_remaining_timeout(cmd);

            if ((op->connection_rc == -ENOKEY)
                || (op->connection_rc == -EKEYREJECTED)) {
                // Hard error, don't retry
                pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
                                 PCMK_EXEC_ERROR,
                                 pcmk_strerror(op->connection_rc));

            } else if (cmd->remaining_timeout > 3000) {
                crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
                /* Retry in one second; cmd stays the current command.
                 * NOTE(review): the source ID returned by g_timeout_add() is
                 * not kept, so this timer cannot be removed later, and
                 * lrm_state presumably has to remain valid until it fires --
                 * confirm.
                 */
                g_timeout_add(1000, retry_start_cmd_cb, lrm_state);
                return;

            } else {
                crm_trace("can't reschedule start, remaining timeout too small %d",
                          cmd->remaining_timeout);
                pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                    PCMK_EXEC_TIMEOUT,
                                    "%s without enough time to retry",
                                    pcmk_strerror(op->connection_rc));
            }

        } else {
            lrm_state_reset_tables(lrm_state, TRUE);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            ra_data->active = TRUE;
        }

        crm_debug("Remote connection event matched %s action", cmd->action);
        report_remote_ra_result(cmd);
        cmd_handled = TRUE;

    } else if (op->type == lrmd_event_poke && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {

        /* The poke was answered, so the monitor can no longer time out */
        if (cmd->monitor_timeout_id) {
            g_source_remove(cmd->monitor_timeout_id);
            cmd->monitor_timeout_id = 0;
        }

        /* Only report success the first time, after that only worry about failures.
         * For this function, if we get the poke back, it is always a success. Pokes
         * only fail if the send fails, or the response times out. */
        if (!cmd->reported_success) {
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
            cmd->reported_success = 1;
        }

        crm_debug("Remote poke event matched %s action", cmd->action);

        /* success, keep rescheduling if interval is present. */
        if (cmd->interval_ms && (cmd->cancel == FALSE)) {
            ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
            cmd->interval_id = g_timeout_add(cmd->interval_ms,
                                             recurring_helper, cmd);
            cmd = NULL;         /* prevent free */
        }
        cmd_handled = TRUE;

    } else if (op->type == lrmd_event_disconnect && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
        /* A cancelled monitor, or one on an inactive connection, is dropped
         * without reporting a failure */
        if (ra_data->active == TRUE && (cmd->cancel == FALSE)) {
            pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                             PCMK_EXEC_ERROR,
                             "Remote connection unexpectedly dropped "
                             "during monitor");
            report_remote_ra_result(cmd);
            crm_err("Remote connection to %s unexpectedly dropped during monitor",
                    lrm_state->node_name);
        }
        cmd_handled = TRUE;

    /* NOTE(review): every lrmd_event_new_client event returns early near the
     * top of this function, so this branch looks unreachable -- confirm
     * whether a stop waiting for a takeover is meant to complete here.
     */
    } else if (op->type == lrmd_event_new_client && pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {

        handle_remote_ra_stop(lrm_state, cmd);
        cmd_handled = TRUE;

    } else {
        crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
    }

    if (cmd_handled) {
        /* The command slot is free again; cmd is NULL here if the command
         * was rescheduled above, and free_cmd() ignores NULL */
        ra_data->cur_cmd = NULL;
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        free_cmd(cmd);
    }
}
 703 
 704 static void
 705 handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 706 {
 707     remote_ra_data_t *ra_data = NULL;
 708 
 709     CRM_ASSERT(lrm_state);
 710     ra_data = lrm_state->remote_ra_data;
 711 
 712     if (ra_data->migrate_status != takeover_complete) {
 713         /* delete pending ops when ever the remote connection is intentionally stopped */
 714         g_hash_table_remove_all(lrm_state->pending_ops);
 715     } else {
 716         /* we no longer hold the history if this connection has been migrated,
 717          * however, we keep metadata cache for future use */
 718         lrm_state_reset_tables(lrm_state, FALSE);
 719     }
 720 
 721     ra_data->active = FALSE;
 722     lrm_state_disconnect(lrm_state);
 723 
 724     if (ra_data->cmds) {
 725         g_list_free_full(ra_data->cmds, free_cmd);
 726     }
 727     if (ra_data->recurring_cmds) {
 728         g_list_free_full(ra_data->recurring_cmds, free_cmd);
 729     }
 730     ra_data->cmds = NULL;
 731     ra_data->recurring_cmds = NULL;
 732     ra_data->cur_cmd = NULL;
 733 
 734     if (cmd) {
 735         pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 736         report_remote_ra_result(cmd);
 737     }
 738 }
 739 
 740 // \return Standard Pacemaker return code
 741 static int
 742 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
 743 {
 744     const char *server = NULL;
 745     lrmd_key_value_t *tmp = NULL;
 746     int port = 0;
 747     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 748     int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
 749     int rc = pcmk_rc_ok;
 750 
 751     for (tmp = cmd->params; tmp; tmp = tmp->next) {
 752         if (pcmk__strcase_any_of(tmp->key, XML_RSC_ATTR_REMOTE_RA_ADDR,
 753                                  XML_RSC_ATTR_REMOTE_RA_SERVER, NULL)) {
 754             server = tmp->value;
 755         } else if (pcmk__str_eq(tmp->key, XML_RSC_ATTR_REMOTE_RA_PORT, pcmk__str_casei)) {
 756             port = atoi(tmp->value);
 757         } else if (pcmk__str_eq(tmp->key, CRM_META "_" XML_RSC_ATTR_CONTAINER, pcmk__str_casei)) {
 758             ra_data->controlling_guest = TRUE;
 759         }
 760     }
 761 
 762     rc = controld_connect_remote_executor(lrm_state, server, port,
 763                                           timeout_used);
 764     if (rc != pcmk_rc_ok) {
 765         pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 766                             PCMK_EXEC_ERROR,
 767                             "Could not connect to Pacemaker Remote node %s: %s",
 768                             lrm_state->node_name, pcmk_rc_str(rc));
 769     }
 770     return rc;
 771 }
 772 
 773 static gboolean
 774 handle_remote_ra_exec(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 775 {
 776     int rc = 0;
 777     lrm_state_t *lrm_state = user_data;
 778     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 779     remote_ra_cmd_t *cmd;
 780     GList *first = NULL;
 781 
 782     if (ra_data->cur_cmd) {
 783         /* still waiting on previous cmd */
 784         return TRUE;
 785     }
 786 
 787     while (ra_data->cmds) {
 788         first = ra_data->cmds;
 789         cmd = first->data;
 790         if (cmd->delay_id) {
 791             /* still waiting for start delay timer to trip */
 792             return TRUE;
 793         }
 794 
 795         ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
 796         g_list_free_1(first);
 797 
 798         if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) {
 799             ra_data->migrate_status = 0;
 800             if (handle_remote_ra_start(lrm_state, cmd,
 801                                        cmd->timeout) == pcmk_rc_ok) {
 802                 /* take care of this later when we get async connection result */
 803                 crm_debug("Initiated async remote connection, %s action will complete after connect event",
 804                           cmd->action);
 805                 ra_data->cur_cmd = cmd;
 806                 return TRUE;
 807             }
 808             report_remote_ra_result(cmd);
 809 
 810         } else if (!strcmp(cmd->action, "monitor")) {
 811 
 812             if (lrm_state_is_connected(lrm_state) == TRUE) {
 813                 rc = lrm_state_poke_connection(lrm_state);
 814                 if (rc < 0) {
 815                     pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 816                                      PCMK_EXEC_ERROR, pcmk_strerror(rc));
 817                 }
 818             } else {
 819                 rc = -1;
 820                 pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
 821                                  PCMK_EXEC_DONE, "Remote connection inactive");
 822             }
 823 
 824             if (rc == 0) {
 825                 crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
 826                           cmd->rsc_id);
 827                 ra_data->cur_cmd = cmd;
 828                 cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd);
 829                 return TRUE;
 830             }
 831             report_remote_ra_result(cmd);
 832 
 833         } else if (!strcmp(cmd->action, "stop")) {
 834 
 835             if (ra_data->migrate_status == expect_takeover) {
 836                 /* briefly wait on stop for the takeover event to occur. If the
 837                  * takeover event does not occur during the wait period, that's fine.
 838                  * It just means that the remote-node's lrm_status section is going to get
 839                  * cleared which will require all the resources running in the remote-node
 840                  * to be explicitly re-detected via probe actions.  If the takeover does occur
 841                  * successfully, then we can leave the status section intact. */
 842                 cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd);
 843                 ra_data->cur_cmd = cmd;
 844                 return TRUE;
 845             }
 846 
 847             handle_remote_ra_stop(lrm_state, cmd);
 848 
 849         } else if (!strcmp(cmd->action, "migrate_to")) {
 850             ra_data->migrate_status = expect_takeover;
 851             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 852             report_remote_ra_result(cmd);
 853         } else if (pcmk__str_any_of(cmd->action, CRMD_ACTION_RELOAD,
 854                                     CRMD_ACTION_RELOAD_AGENT, NULL))  {
 855             /* Currently the only reloadable parameter is reconnect_interval,
 856              * which is only used by the scheduler via the CIB, so reloads are a
 857              * no-op.
 858              *
 859              * @COMPAT DC <2.1.0: We only need to check for "reload" in case
 860              * we're in a rolling upgrade with a DC scheduling "reload" instead
 861              * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway,
 862              * so this would work for that purpose as well.
 863              */
 864             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 865             report_remote_ra_result(cmd);
 866         }
 867 
 868         free_cmd(cmd);
 869     }
 870 
 871     return TRUE;
 872 }
 873 
 874 static void
 875 remote_ra_data_init(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 876 {
 877     remote_ra_data_t *ra_data = NULL;
 878 
 879     if (lrm_state->remote_ra_data) {
 880         return;
 881     }
 882 
 883     ra_data = calloc(1, sizeof(remote_ra_data_t));
 884     ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
 885     lrm_state->remote_ra_data = ra_data;
 886 }
 887 
 888 void
 889 remote_ra_cleanup(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 890 {
 891     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 892 
 893     if (!ra_data) {
 894         return;
 895     }
 896 
 897     if (ra_data->cmds) {
 898         g_list_free_full(ra_data->cmds, free_cmd);
 899     }
 900 
 901     if (ra_data->recurring_cmds) {
 902         g_list_free_full(ra_data->recurring_cmds, free_cmd);
 903     }
 904     mainloop_destroy_trigger(ra_data->work);
 905     free(ra_data);
 906     lrm_state->remote_ra_data = NULL;
 907 }
 908 
 909 gboolean
 910 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
     /* [previous][next][first][last][top][bottom][index][help] */
 911 {
 912     if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
 913         return TRUE;
 914     }
 915     if (id && lrm_state_find(id) && !pcmk__str_eq(id, fsa_our_uname, pcmk__str_casei)) {
 916         return TRUE;
 917     }
 918 
 919     return FALSE;
 920 }
 921 
 922 lrmd_rsc_info_t *
 923 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
     /* [previous][next][first][last][top][bottom][index][help] */
 924 {
 925     lrmd_rsc_info_t *info = NULL;
 926 
 927     if ((lrm_state_find(rsc_id))) {
 928         info = calloc(1, sizeof(lrmd_rsc_info_t));
 929 
 930         info->id = strdup(rsc_id);
 931         info->type = strdup(REMOTE_LRMD_RA);
 932         info->standard = strdup(PCMK_RESOURCE_CLASS_OCF);
 933         info->provider = strdup("pacemaker");
 934     }
 935 
 936     return info;
 937 }
 938 
 939 static gboolean
 940 is_remote_ra_supported_action(const char *action)
     /* [previous][next][first][last][top][bottom][index][help] */
 941 {
 942     return pcmk__str_any_of(action,
 943                             CRMD_ACTION_START,
 944                             CRMD_ACTION_STOP,
 945                             CRMD_ACTION_STATUS,
 946                             CRMD_ACTION_MIGRATE,
 947                             CRMD_ACTION_MIGRATED,
 948                             CRMD_ACTION_RELOAD_AGENT,
 949                             CRMD_ACTION_RELOAD,
 950                             NULL);
 951 }
 952 
 953 static GList *
 954 fail_all_monitor_cmds(GList * list)
     /* [previous][next][first][last][top][bottom][index][help] */
 955 {
 956     GList *rm_list = NULL;
 957     remote_ra_cmd_t *cmd = NULL;
 958     GList *gIter = NULL;
 959 
 960     for (gIter = list; gIter != NULL; gIter = gIter->next) {
 961         cmd = gIter->data;
 962         if ((cmd->interval_ms > 0) && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
 963             rm_list = g_list_append(rm_list, cmd);
 964         }
 965     }
 966 
 967     for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
 968         cmd = gIter->data;
 969 
 970         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 971                          PCMK_EXEC_ERROR, "Lost connection to remote executor");
 972         crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
 973                   cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
 974         report_remote_ra_result(cmd);
 975 
 976         list = g_list_remove(list, cmd);
 977         free_cmd(cmd);
 978     }
 979 
 980     /* frees only the list data, not the cmds */
 981     g_list_free(rm_list);
 982     return list;
 983 }
 984 
 985 static GList *
 986 remove_cmd(GList * list, const char *action, guint interval_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
 987 {
 988     remote_ra_cmd_t *cmd = NULL;
 989     GList *gIter = NULL;
 990 
 991     for (gIter = list; gIter != NULL; gIter = gIter->next) {
 992         cmd = gIter->data;
 993         if ((cmd->interval_ms == interval_ms)
 994             && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
 995             break;
 996         }
 997         cmd = NULL;
 998     }
 999     if (cmd) {
1000         list = g_list_remove(list, cmd);
1001         free_cmd(cmd);
1002     }
1003     return list;
1004 }
1005 
1006 int
1007 remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
1008                  const char *action, guint interval_ms)
1009 {
1010     lrm_state_t *connection_rsc = NULL;
1011     remote_ra_data_t *ra_data = NULL;
1012 
1013     connection_rsc = lrm_state_find(rsc_id);
1014     if (!connection_rsc || !connection_rsc->remote_ra_data) {
1015         return -EINVAL;
1016     }
1017 
1018     ra_data = connection_rsc->remote_ra_data;
1019     ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1020     ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1021                                          interval_ms);
1022     if (ra_data->cur_cmd &&
1023         (ra_data->cur_cmd->interval_ms == interval_ms) &&
1024         (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1025 
1026         ra_data->cur_cmd->cancel = TRUE;
1027     }
1028 
1029     return 0;
1030 }
1031 
1032 static remote_ra_cmd_t *
1033 handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
     /* [previous][next][first][last][top][bottom][index][help] */
1034                    const char *userdata)
1035 {
1036     GList *gIter = NULL;
1037     remote_ra_cmd_t *cmd = NULL;
1038 
1039     /* there are 3 places a potential duplicate monitor operation
1040      * could exist.
1041      * 1. recurring_cmds list. where the op is waiting for its next interval
1042      * 2. cmds list, where the op is queued to get executed immediately
1043      * 3. cur_cmd, which means the monitor op is in flight right now.
1044      */
1045     if (interval_ms == 0) {
1046         return NULL;
1047     }
1048 
1049     if (ra_data->cur_cmd &&
1050         ra_data->cur_cmd->cancel == FALSE &&
1051         (ra_data->cur_cmd->interval_ms == interval_ms) &&
1052         pcmk__str_eq(ra_data->cur_cmd->action, "monitor", pcmk__str_casei)) {
1053 
1054         cmd = ra_data->cur_cmd;
1055         goto handle_dup;
1056     }
1057 
1058     for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
1059         cmd = gIter->data;
1060         if ((cmd->interval_ms == interval_ms)
1061             && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
1062             goto handle_dup;
1063         }
1064     }
1065 
1066     for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
1067         cmd = gIter->data;
1068         if ((cmd->interval_ms == interval_ms)
1069             && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
1070             goto handle_dup;
1071         }
1072     }
1073 
1074     return NULL;
1075 
1076 handle_dup:
1077 
1078     crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
1079               cmd->rsc_id, "monitor", interval_ms);
1080 
1081     /* update the userdata */
1082     if (userdata) {
1083        free(cmd->userdata);
1084        cmd->userdata = strdup(userdata);
1085     }
1086 
1087     /* if we've already reported success, generate a new call id */
1088     if (cmd->reported_success) {
1089         cmd->start_time = time(NULL);
1090         cmd->call_id = generate_callid();
1091         cmd->reported_success = 0;
1092     }
1093 
1094     /* if we have an interval_id set, that means we are in the process of
1095      * waiting for this cmd's next interval. instead of waiting, cancel
1096      * the timer and execute the action immediately */
1097     if (cmd->interval_id) {
1098         g_source_remove(cmd->interval_id);
1099         cmd->interval_id = 0;
1100         recurring_helper(cmd);
1101     }
1102 
1103     return cmd;
1104 }
1105 
1106 /*!
1107  * \internal
1108  * \brief Execute an action using the (internal) ocf:pacemaker:remote agent
1109  *
1110  * \param[in]     lrm_state      Executor state object for remote connection
1111  * \param[in]     rsc_id         Connection resource ID
1112  * \param[in]     action         Action to execute
1113  * \param[in]     userdata       String to copy and pass to execution callback
1114  * \param[in]     interval_ms    Action interval (in milliseconds)
1115  * \param[in]     timeout_ms     Action timeout (in milliseconds)
1116  * \param[in]     start_delay_ms Delay (in milliseconds) before executing action
1117  * \param[in,out] params         Connection resource parameters
1118  * \param[out]    call_id        Where to store call ID on success
1119  *
1120  * \return Standard Pacemaker return code
1121  * \note This takes ownership of \p params, which should not be used or freed
1122  *       after calling this function.
1123  */
1124 int
1125 controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
1126                               const char *action, const char *userdata,
1127                               guint interval_ms, int timeout_ms,
1128                               int start_delay_ms, lrmd_key_value_t *params,
1129                               int *call_id)
1130 {
1131     lrm_state_t *connection_rsc = NULL;
1132     remote_ra_cmd_t *cmd = NULL;
1133     remote_ra_data_t *ra_data = NULL;
1134 
1135     *call_id = 0;
1136 
1137     CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
1138               && (userdata != NULL) && (call_id != NULL),
1139               lrmd_key_value_freeall(params); return EINVAL);
1140 
1141     if (!is_remote_ra_supported_action(action)) {
1142         lrmd_key_value_freeall(params);
1143         return EOPNOTSUPP;
1144     }
1145 
1146     connection_rsc = lrm_state_find(rsc_id);
1147     if (connection_rsc == NULL) {
1148         lrmd_key_value_freeall(params);
1149         return ENOTCONN;
1150     }
1151 
1152     remote_ra_data_init(connection_rsc);
1153     ra_data = connection_rsc->remote_ra_data;
1154 
1155     cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1156     if (cmd) {
1157         *call_id = cmd->call_id;
1158         lrmd_key_value_freeall(params);
1159         return pcmk_rc_ok;
1160     }
1161 
1162     cmd = calloc(1, sizeof(remote_ra_cmd_t));
1163     if (cmd == NULL) {
1164         lrmd_key_value_freeall(params);
1165         return ENOMEM;
1166     }
1167 
1168     cmd->owner = strdup(lrm_state->node_name);
1169     cmd->rsc_id = strdup(rsc_id);
1170     cmd->action = strdup(action);
1171     cmd->userdata = strdup(userdata);
1172     if ((cmd->owner == NULL) || (cmd->rsc_id == NULL) || (cmd->action == NULL)
1173         || (cmd->userdata == NULL)) {
1174         free_cmd(cmd);
1175         lrmd_key_value_freeall(params);
1176         return ENOMEM;
1177     }
1178 
1179     cmd->interval_ms = interval_ms;
1180     cmd->timeout = timeout_ms;
1181     cmd->start_delay = start_delay_ms;
1182     cmd->params = params;
1183     cmd->start_time = time(NULL);
1184 
1185     cmd->call_id = generate_callid();
1186 
1187     if (cmd->start_delay) {
1188         cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
1189     }
1190 
1191     ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1192     mainloop_set_trigger(ra_data->work);
1193 
1194     *call_id = cmd->call_id;
1195     return pcmk_rc_ok;
1196 }
1197 
1198 /*!
1199  * \internal
1200  * \brief Immediately fail all monitors of a remote node, if proxied here
1201  *
1202  * \param[in] node_name  Name of pacemaker_remote node
1203  */
1204 void
1205 remote_ra_fail(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
1206 {
1207     lrm_state_t *lrm_state = lrm_state_find(node_name);
1208 
1209     if (lrm_state && lrm_state_is_connected(lrm_state)) {
1210         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1211 
1212         crm_info("Failing monitors on Pacemaker Remote node %s", node_name);
1213         ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1214         ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1215     }
1216 }
1217 
1218 /* A guest node fencing implied by host fencing looks like:
1219  *
1220  *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
1221  *                on_node="lxc1" on_node_uuid="lxc1">
1222  *     <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
1223  *                 CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
1224  *     <downed>
1225  *       <node id="lxc1"/>
1226  *     </downed>
1227  *  </pseudo_event>
1228  */
1229 #define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
1230     "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
1231     "/" XML_CIB_TAG_NODE
1232 
1233 /*!
1234  * \internal
1235  * \brief Check a pseudo-action for Pacemaker Remote node side effects
1236  *
1237  * \param[in,out] xml  XML of pseudo-action to check
1238  */
1239 void
1240 remote_ra_process_pseudo(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
1241 {
1242     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
1243 
1244     if (numXpathResults(search) == 1) {
1245         xmlNode *result = getXpathResult(search, 0);
1246 
1247         /* Normally, we handle the necessary side effects of a guest node stop
1248          * action when reporting the remote agent's result. However, if the stop
1249          * is implied due to fencing, it will be a fencing pseudo-event, and
1250          * there won't be a result to report. Handle that case here.
1251          *
1252          * This will result in a duplicate call to remote_node_down() if the
1253          * guest stop was real instead of implied, but that shouldn't hurt.
1254          *
1255          * There is still one corner case that isn't handled: if a guest node
1256          * isn't running any resources when its host is fenced, it will appear
1257          * to be cleanly stopped, so there will be no pseudo-fence, and our
1258          * peer cache state will be incorrect unless and until the guest is
1259          * recovered.
1260          */
1261         if (result) {
1262             const char *remote = ID(result);
1263 
1264             if (remote) {
1265                 remote_node_down(remote, DOWN_ERASE_LRM);
1266             }
1267         }
1268     }
1269     freeXpathObject(search);
1270 }
1271 
1272 static void
1273 remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
     /* [previous][next][first][last][top][bottom][index][help] */
1274 {
1275     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1276     xmlNode *update, *state;
1277     int call_opt, call_id = 0;
1278     crm_node_t *node;
1279 
1280     call_opt = crmd_cib_smart_opt();
1281     node = crm_remote_peer_get(lrm_state->node_name);
1282     CRM_CHECK(node != NULL, return);
1283     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
1284     state = create_node_state_update(node, node_update_none, update,
1285                                      __func__);
1286     crm_xml_add(state, XML_NODE_IS_MAINTENANCE, maintenance?"1":"0");
1287     fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
1288     if (call_id < 0) {
1289         crm_perror(LOG_WARNING, "%s CIB node state update failed", lrm_state->node_name);
1290     } else {
1291         /* TODO: still not 100% sure that async update will succeed ... */
1292         ra_data->is_maintenance = maintenance;
1293     }
1294     free_xml(update);
1295 }
1296 
1297 #define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
1298     "[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \
1299     XML_GRAPH_TAG_MAINTENANCE
1300 
1301 /*!
1302  * \internal
1303  * \brief Check a pseudo-action holding updates for maintenance state
1304  *
1305  * \param[in,out] xml  XML of pseudo-action to check
1306  */
1307 void
1308 remote_ra_process_maintenance_nodes(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
1309 {
1310     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);
1311 
1312     if (numXpathResults(search) == 1) {
1313         xmlNode *node;
1314         int cnt = 0, cnt_remote = 0;
1315 
1316         for (node =
1317                 first_named_child(getXpathResult(search, 0), XML_CIB_TAG_NODE);
1318             node != NULL; node = pcmk__xml_next(node)) {
1319             lrm_state_t *lrm_state = lrm_state_find(ID(node));
1320 
1321             cnt++;
1322             if (lrm_state && lrm_state->remote_ra_data &&
1323                 ((remote_ra_data_t *) lrm_state->remote_ra_data)->active) {
1324                 int is_maint;
1325 
1326                 cnt_remote++;
1327                 pcmk__scan_min_int(crm_element_value(node, XML_NODE_IS_MAINTENANCE),
1328                                    &is_maint, 0);
1329                 remote_ra_maintenance(lrm_state, is_maint);
1330             }
1331         }
1332         crm_trace("Action holds %d nodes (%d remotes found) "
1333                     "adjusting maintenance-mode", cnt, cnt_remote);
1334     }
1335     freeXpathObject(search);
1336 }
1337 
1338 gboolean
1339 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1340 {
1341     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1342 
1343     return ra_data->is_maintenance;
1344 }
1345 
1346 gboolean
1347 remote_ra_controlling_guest(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1348 {
1349     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1350 
1351     return ra_data->controlling_guest;
1352 }

/* [previous][next][first][last][top][bottom][index][help] */