root/daemons/controld/controld_remote_ra.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. free_cmd
  2. generate_callid
  3. recurring_helper
  4. start_delay_helper
  5. remote_node_up
  6. remote_node_down
  7. check_remote_node_state
  8. report_remote_ra_result
  9. update_remaining_timeout
  10. retry_start_cmd_cb
  11. connection_takeover_timeout_cb
  12. monitor_timeout_cb
  13. synthesize_lrmd_success
  14. remote_lrm_op_callback
  15. handle_remote_ra_stop
  16. handle_remote_ra_start
  17. handle_remote_ra_exec
  18. remote_ra_data_init
  19. remote_ra_cleanup
  20. is_remote_lrmd_ra
  21. remote_ra_get_rsc_info
  22. is_remote_ra_supported_action
  23. fail_all_monitor_cmds
  24. remove_cmd
  25. remote_ra_cancel
  26. handle_dup_monitor
  27. controld_execute_remote_agent
  28. remote_ra_fail
  29. remote_ra_process_pseudo
  30. remote_ra_maintenance
  31. remote_ra_process_maintenance_nodes
  32. remote_ra_is_in_maintenance
  33. remote_ra_controlling_guest

   1 /*
   2  * Copyright 2013-2021 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
#include <crm_internal.h>

#include <limits.h>

#include <crm/crm.h>
#include <crm/msg_xml.h>
#include <crm/common/xml_internal.h>
#include <crm/lrmd.h>
#include <crm/lrmd_internal.h>
#include <crm/services.h>

#include <pacemaker-controld.h>
  20 
  21 #define REMOTE_LRMD_RA "remote"
  22 
  23 /* The max start timeout before cmd retry */
  24 #define MAX_START_TIMEOUT_MS 10000
  25 
          /*!
           * One queued or in-flight action for a remote connection resource.
           * Instances are released with free_cmd(), which also cancels any
           * timer whose ID is still nonzero.
           */
   26 typedef struct remote_ra_cmd_s {
   27     /*! the local node the cmd is issued from */
   28     char *owner;
   29     /*! the remote node the cmd is executed on */
   30     char *rsc_id;
   31     /*! the action to execute */
   32     char *action;
   33     /*! some string the client wants us to give it back */
   34     char *userdata;
   35     /*! start delay in ms */
   36     int start_delay;
   37     /*! timer id used for start delay (0 when no delay timer is pending) */
   38     int delay_id;
   39     /*! timeout in ms for cmd */
   40     int timeout;
          /*! ms left of the timeout, as last computed by update_remaining_timeout() */
   41     int remaining_timeout;
   42     /*! recurring interval in ms */
   43     guint interval_ms;
   44     /*! interval timer id (0 when no interval timer is pending) */
   45     int interval_id;
          /*! nonzero once a successful monitor result has been reported */
   46     int reported_success;
          /*! id of the timer that runs monitor_timeout_cb() (0 when not pending) */
   47     int monitor_timeout_id;
          /*! id of the timer that runs connection_takeover_timeout_cb() (0 when not pending) */
   48     int takeover_timeout_id;
   49     /*! action parameters */
   50     lrmd_key_value_t *params;
          /*! outcome handed to the controller by report_remote_ra_result() */
   51     pcmk__action_result_t result;
          /*! call ID copied into the reported event */
   52     int call_id;
          /*! reported as the run time, and the base for the remaining timeout */
   53     time_t start_time;
          /*! when TRUE, a recurring monitor is not rescheduled after it completes */
   54     gboolean cancel;
   55 } remote_ra_cmd_t;
  56 
          /*!
           * Progress of handing a remote connection over to another node.
           * A value of 0 (set when a start or migrate_from begins) means no
           * migration is under way.
           */
   57 enum remote_migration_status {
          /*! another client is expected to connect to the remote daemon */
   58     expect_takeover = 1,
          /*! the expected client connected; exec results are ignored from then on */
   59     takeover_complete,
   60 };
  61 
          /*! Per-connection bookkeeping stored in lrm_state_t's remote_ra_data */
   62 typedef struct remote_ra_data_s {
          /* Trigger that is set whenever queued commands may be ready to run */
   63     crm_trigger_t *work;
          /* Command whose asynchronous result is still awaited (NULL if none);
           * nothing else is started while this is set
           */
   64     remote_ra_cmd_t *cur_cmd;
          /* Commands waiting to be executed */
   65     GList *cmds;
          /* Recurring monitors waiting for their interval timer to pop */
   66     GList *recurring_cmds;
   67 
   68     enum remote_migration_status migrate_status;
   69 
          /* TRUE from a successful connect until the connection is stopped */
   70     gboolean active;
   71 
   72     /* Maintenance mode is difficult to determine from the controller's context,
   73      * so we have it signalled back with the transition from the scheduler.
   74      */
   75     gboolean is_maintenance;
   76 
   77     /* Similar for whether we are controlling a guest node or remote node.
   78      * Fortunately there is a meta-attribute in the transition already and
   79      * as the situation doesn't change over time we can use the
   80      * resource start for noting down the information for later use when
   81      * the attributes aren't at hand.
   82      */
   83     gboolean controlling_guest;
   84 } remote_ra_data_t;
  85 
          /* Forward declarations for helpers that are used before their definitions */
   86 static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
   87 static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
   88 static GList *fail_all_monitor_cmds(GList * list);
  89 
  90 static void
  91 free_cmd(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  92 {
  93     remote_ra_cmd_t *cmd = user_data;
  94 
  95     if (!cmd) {
  96         return;
  97     }
  98     if (cmd->delay_id) {
  99         g_source_remove(cmd->delay_id);
 100     }
 101     if (cmd->interval_id) {
 102         g_source_remove(cmd->interval_id);
 103     }
 104     if (cmd->monitor_timeout_id) {
 105         g_source_remove(cmd->monitor_timeout_id);
 106     }
 107     if (cmd->takeover_timeout_id) {
 108         g_source_remove(cmd->takeover_timeout_id);
 109     }
 110     free(cmd->owner);
 111     free(cmd->rsc_id);
 112     free(cmd->action);
 113     free(cmd->userdata);
 114     pcmk__reset_result(&(cmd->result));
 115     lrmd_key_value_freeall(cmd->params);
 116     free(cmd);
 117 }
 118 
 119 static int
 120 generate_callid(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 121 {
 122     static int remote_ra_callid = 0;
 123 
 124     remote_ra_callid++;
 125     if (remote_ra_callid <= 0) {
 126         remote_ra_callid = 1;
 127     }
 128 
 129     return remote_ra_callid;
 130 }
 131 
 132 static gboolean
 133 recurring_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 134 {
 135     remote_ra_cmd_t *cmd = data;
 136     lrm_state_t *connection_rsc = NULL;
 137 
 138     cmd->interval_id = 0;
 139     connection_rsc = lrm_state_find(cmd->rsc_id);
 140     if (connection_rsc && connection_rsc->remote_ra_data) {
 141         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 142 
 143         ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
 144 
 145         ra_data->cmds = g_list_append(ra_data->cmds, cmd);
 146         mainloop_set_trigger(ra_data->work);
 147     }
 148     return FALSE;
 149 }
 150 
 151 static gboolean
 152 start_delay_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 153 {
 154     remote_ra_cmd_t *cmd = data;
 155     lrm_state_t *connection_rsc = NULL;
 156 
 157     cmd->delay_id = 0;
 158     connection_rsc = lrm_state_find(cmd->rsc_id);
 159     if (connection_rsc && connection_rsc->remote_ra_data) {
 160         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 161 
 162         mainloop_set_trigger(ra_data->work);
 163     }
 164     return FALSE;
 165 }
 166 
 167 /*!
 168  * \internal
 169  * \brief Handle cluster communication related to pacemaker_remote node joining
 170  *
 171  * \param[in] node_name  Name of newly integrated pacemaker_remote node
 172  */
 173 static void
 174 remote_node_up(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
 175 {
 176     int call_opt, call_id = 0;
 177     xmlNode *update, *state;
 178     crm_node_t *node;
 179     enum controld_section_e section = controld_section_all;
 180 
 181     CRM_CHECK(node_name != NULL, return);
 182     crm_info("Announcing Pacemaker Remote node %s", node_name);
 183 
 184     /* Clear node's entire state (resource history and transient attributes)
 185      * other than shutdown locks. The transient attributes should and normally
 186      * will be cleared when the node leaves, but since remote node state has a
 187      * number of corner cases, clear them here as well, to be sure.
 188      */
 189     call_opt = crmd_cib_smart_opt();
 190     if (controld_shutdown_lock_enabled) {
 191         section = controld_section_all_unlocked;
 192     }
 193     /* Purge node from attrd's memory */
 194     update_attrd_remote_node_removed(node_name, NULL);
 195 
 196     controld_delete_node_state(node_name, section, call_opt);
 197 
 198     /* Clear node's probed attribute */
 199     update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);
 200 
 201     /* Ensure node is in the remote peer cache with member status */
 202     node = crm_remote_peer_get(node_name);
 203     CRM_CHECK(node != NULL, return);
 204     pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
 205 
 206     /* pacemaker_remote nodes don't participate in the membership layer,
 207      * so cluster nodes don't automatically get notified when they come and go.
 208      * We send a cluster message to the DC, and update the CIB node state entry,
 209      * so the DC will get it sooner (via message) or later (via CIB refresh),
 210      * and any other interested parties can query the CIB.
 211      */
 212     send_remote_state_message(node_name, TRUE);
 213 
 214     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
 215     state = create_node_state_update(node, node_update_cluster, update,
 216                                      __func__);
 217 
 218     /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever
 219      * needs to be fenced, this flag will allow various actions to determine
 220      * whether the fencing has happened yet.
 221      */
 222     crm_xml_add(state, XML_NODE_IS_FENCED, "0");
 223 
 224     /* TODO: If the remote connection drops, and this (async) CIB update either
 225      * failed or has not yet completed, later actions could mistakenly think the
 226      * node has already been fenced (if the XML_NODE_IS_FENCED attribute was
 227      * previously set, because it won't have been cleared). This could prevent
 228      * actual fencing or allow recurring monitor failures to be cleared too
 229      * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
 230      */
 231     fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
 232     if (call_id < 0) {
 233         crm_perror(LOG_WARNING, "%s CIB node state setup", node_name);
 234     }
 235     free_xml(update);
 236 }
 237 
          /* Options for remote_node_down(): what to do with the node's stored state */
  238 enum down_opts {
          /* erase only the node attributes section */
  239     DOWN_KEEP_LRM,
          /* erase the node's entire state section */
  240     DOWN_ERASE_LRM
  241 };
 242 
 243 /*!
 244  * \internal
 245  * \brief Handle cluster communication related to pacemaker_remote node leaving
 246  *
 247  * \param[in] node_name  Name of lost node
 248  * \param[in] opts       Whether to keep or erase LRM history
 249  */
 250 static void
 251 remote_node_down(const char *node_name, const enum down_opts opts)
     /* [previous][next][first][last][top][bottom][index][help] */
 252 {
 253     xmlNode *update;
 254     int call_id = 0;
 255     int call_opt = crmd_cib_smart_opt();
 256     crm_node_t *node;
 257 
 258     /* Purge node from attrd's memory */
 259     update_attrd_remote_node_removed(node_name, NULL);
 260 
 261     /* Normally, only node attributes should be erased, and the resource history
 262      * should be kept until the node comes back up. However, after a successful
 263      * fence, we want to clear the history as well, so we don't think resources
 264      * are still running on the node.
 265      */
 266     if (opts == DOWN_ERASE_LRM) {
 267         controld_delete_node_state(node_name, controld_section_all, call_opt);
 268     } else {
 269         controld_delete_node_state(node_name, controld_section_attrs, call_opt);
 270     }
 271 
 272     /* Ensure node is in the remote peer cache with lost state */
 273     node = crm_remote_peer_get(node_name);
 274     CRM_CHECK(node != NULL, return);
 275     pcmk__update_peer_state(__func__, node, CRM_NODE_LOST, 0);
 276 
 277     /* Notify DC */
 278     send_remote_state_message(node_name, FALSE);
 279 
 280     /* Update CIB node state */
 281     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
 282     create_node_state_update(node, node_update_cluster, update, __func__);
 283     fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
 284     if (call_id < 0) {
 285         crm_perror(LOG_ERR, "%s CIB node state update", node_name);
 286     }
 287     free_xml(update);
 288 }
 289 
 290 /*!
 291  * \internal
 292  * \brief Handle effects of a remote RA command on node state
 293  *
 294  * \param[in] cmd  Completed remote RA command
 295  */
 296 static void
 297 check_remote_node_state(remote_ra_cmd_t *cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 298 {
 299     /* Only successful actions can change node state */
 300     if (cmd->result.exit_status != PCMK_OCF_OK) {
 301         return;
 302     }
 303 
 304     if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
 305         remote_node_up(cmd->rsc_id);
 306 
 307     } else if (pcmk__str_eq(cmd->action, "migrate_from", pcmk__str_casei)) {
 308         /* After a successful migration, we don't need to do remote_node_up()
 309          * because the DC already knows the node is up, and we don't want to
 310          * clear LRM history etc. We do need to add the remote node to this
 311          * host's remote peer cache, because (unless it happens to be DC)
 312          * it hasn't been tracking the remote node, and other code relies on
 313          * the cache to distinguish remote nodes from unseen cluster nodes.
 314          */
 315         crm_node_t *node = crm_remote_peer_get(cmd->rsc_id);
 316 
 317         CRM_CHECK(node != NULL, return);
 318         pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);
 319 
 320     } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
 321         lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
 322         remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;
 323 
 324         if (ra_data) {
 325             if (ra_data->migrate_status != takeover_complete) {
 326                 /* Stop means down if we didn't successfully migrate elsewhere */
 327                 remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
 328             } else if (AM_I_DC == FALSE) {
 329                 /* Only the connection host and DC track node state,
 330                  * so if the connection migrated elsewhere and we aren't DC,
 331                  * un-cache the node, so we don't have stale info
 332                  */
 333                 crm_remote_peer_cache_remove(cmd->rsc_id);
 334             }
 335         }
 336     }
 337 
 338     /* We don't do anything for successful monitors, which is correct for
 339      * routine recurring monitors, and for monitors on nodes where the
 340      * connection isn't supposed to be (the cluster will stop the connection in
 341      * that case). However, if the initial probe finds the connection already
 342      * active on the node where we want it, we probably should do
 343      * remote_node_up(). Unfortunately, we can't distinguish that case here.
 344      * Given that connections have to be initiated by the cluster, the chance of
 345      * that should be close to zero.
 346      */
 347 }
 348 
 349 static void
 350 report_remote_ra_result(remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 351 {
 352     lrmd_event_data_t op = { 0, };
 353 
 354     check_remote_node_state(cmd);
 355 
 356     op.type = lrmd_event_exec_complete;
 357     op.rsc_id = cmd->rsc_id;
 358     op.op_type = cmd->action;
 359     op.user_data = cmd->userdata;
 360     op.timeout = cmd->timeout;
 361     op.interval_ms = cmd->interval_ms;
 362     op.t_run = (unsigned int) cmd->start_time;
 363     op.t_rcchange = (unsigned int) cmd->start_time;
 364 
 365     lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
 366                      cmd->result.exit_reason);
 367 
 368     if (cmd->reported_success && (cmd->result.exit_status != PCMK_OCF_OK)) {
 369         op.t_rcchange = (unsigned int) time(NULL);
 370         /* This edge case will likely never ever occur, but if it does the
 371          * result is that a failure will not be processed correctly. This is only
 372          * remotely possible because we are able to detect a connection resource's tcp
 373          * connection has failed at any moment after start has completed. The actual
 374          * recurring operation is just a connectivity ping.
 375          *
 376          * basically, we are not guaranteed that the first successful monitor op and
 377          * a subsequent failed monitor op will not occur in the same timestamp. We have to
 378          * make it look like the operations occurred at separate times though. */
 379         if (op.t_rcchange == op.t_run) {
 380             op.t_rcchange++;
 381         }
 382     }
 383 
 384     if (cmd->params) {
 385         lrmd_key_value_t *tmp;
 386 
 387         op.params = pcmk__strkey_table(free, free);
 388         for (tmp = cmd->params; tmp; tmp = tmp->next) {
 389             g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value));
 390         }
 391 
 392     }
 393     op.call_id = cmd->call_id;
 394     op.remote_nodename = cmd->owner;
 395 
 396     lrm_op_callback(&op);
 397 
 398     if (op.params) {
 399         g_hash_table_destroy(op.params);
 400     }
 401     lrmd__reset_result(&op);
 402 }
 403 
 404 static void
 405 update_remaining_timeout(remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 406 {
 407     cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
 408 }
 409 
 410 static gboolean
 411 retry_start_cmd_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 412 {
 413     lrm_state_t *lrm_state = data;
 414     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 415     remote_ra_cmd_t *cmd = NULL;
 416     int rc = ETIME;
 417 
 418     if (!ra_data || !ra_data->cur_cmd) {
 419         return FALSE;
 420     }
 421     cmd = ra_data->cur_cmd;
 422     if (!pcmk__strcase_any_of(cmd->action, "start", "migrate_from", NULL)) {
 423         return FALSE;
 424     }
 425     update_remaining_timeout(cmd);
 426 
 427     if (cmd->remaining_timeout > 0) {
 428         rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
 429     } else {
 430         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 431                          PCMK_EXEC_TIMEOUT,
 432                          "Not enough time remains to retry remote connection");
 433     }
 434 
 435     if (rc != pcmk_rc_ok) {
 436         report_remote_ra_result(cmd);
 437 
 438         if (ra_data->cmds) {
 439             mainloop_set_trigger(ra_data->work);
 440         }
 441         ra_data->cur_cmd = NULL;
 442         free_cmd(cmd);
 443     } else {
 444         /* wait for connection event */
 445     }
 446 
 447     return FALSE;
 448 }
 449 
 450 
 451 static gboolean
 452 connection_takeover_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 453 {
 454     lrm_state_t *lrm_state = NULL;
 455     remote_ra_cmd_t *cmd = data;
 456 
 457     crm_info("takeover event timed out for node %s", cmd->rsc_id);
 458     cmd->takeover_timeout_id = 0;
 459 
 460     lrm_state = lrm_state_find(cmd->rsc_id);
 461 
 462     handle_remote_ra_stop(lrm_state, cmd);
 463     free_cmd(cmd);
 464 
 465     return FALSE;
 466 }
 467 
 468 static gboolean
 469 monitor_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 470 {
 471     lrm_state_t *lrm_state = NULL;
 472     remote_ra_cmd_t *cmd = data;
 473 
 474     lrm_state = lrm_state_find(cmd->rsc_id);
 475 
 476     crm_info("Timed out waiting for remote poke response from %s%s",
 477              cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
 478     cmd->monitor_timeout_id = 0;
 479     pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
 480                      "Remote executor did not respond");
 481 
 482     if (lrm_state && lrm_state->remote_ra_data) {
 483         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 484 
 485         if (ra_data->cur_cmd == cmd) {
 486             ra_data->cur_cmd = NULL;
 487         }
 488         if (ra_data->cmds) {
 489             mainloop_set_trigger(ra_data->work);
 490         }
 491     }
 492 
 493     report_remote_ra_result(cmd);
 494     free_cmd(cmd);
 495 
 496     if(lrm_state) {
 497         lrm_state_disconnect(lrm_state);
 498     }
 499     return FALSE;
 500 }
 501 
 502 static void
 503 synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
     /* [previous][next][first][last][top][bottom][index][help] */
 504 {
 505     lrmd_event_data_t op = { 0, };
 506 
 507     if (lrm_state == NULL) {
 508         /* if lrm_state not given assume local */
 509         lrm_state = lrm_state_find(fsa_our_uname);
 510     }
 511     CRM_ASSERT(lrm_state != NULL);
 512 
 513     op.type = lrmd_event_exec_complete;
 514     op.rsc_id = rsc_id;
 515     op.op_type = op_type;
 516     op.t_run = (unsigned int) time(NULL);
 517     op.t_rcchange = op.t_run;
 518     op.call_id = generate_callid();
 519     lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 520     process_lrm_event(lrm_state, &op, NULL, NULL);
 521 }
 522 
 523 void
 524 remote_lrm_op_callback(lrmd_event_data_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
 525 {
 526     gboolean cmd_handled = FALSE;
 527     lrm_state_t *lrm_state = NULL;
 528     remote_ra_data_t *ra_data = NULL;
 529     remote_ra_cmd_t *cmd = NULL;
 530 
 531     crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
 532               "(%d) status=%s (%d)",
 533               (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
 534               lrmd_event_type2str(op->type), op->remote_nodename,
 535               services_ocf_exitcode_str(op->rc), op->rc,
 536               pcmk_exec_status_str(op->op_status), op->op_status);
 537 
 538     lrm_state = lrm_state_find(op->remote_nodename);
 539     if (!lrm_state || !lrm_state->remote_ra_data) {
 540         crm_debug("No state information found for remote connection event");
 541         return;
 542     }
 543     ra_data = lrm_state->remote_ra_data;
 544 
 545     if (op->type == lrmd_event_new_client) {
 546         // Another client has connected to the remote daemon
 547 
 548         if (ra_data->migrate_status == expect_takeover) {
 549             // Great, we knew this was coming
 550             ra_data->migrate_status = takeover_complete;
 551 
 552         } else {
 553             crm_err("Disconnecting from Pacemaker Remote node %s due to "
 554                     "unexpected client takeover", op->remote_nodename);
 555             /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
 556             /* Do not free lrm_state->conn yet. */
 557             /* It'll be freed in the following stop action. */
 558             lrm_state_disconnect_only(lrm_state);
 559         }
 560         return;
 561     }
 562 
 563     /* filter all EXEC events up */
 564     if (op->type == lrmd_event_exec_complete) {
 565         if (ra_data->migrate_status == takeover_complete) {
 566             crm_debug("ignoring event, this connection is taken over by another node");
 567         } else {
 568             lrm_op_callback(op);
 569         }
 570         return;
 571     }
 572 
 573     if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {
 574 
 575         if (ra_data->active == FALSE) {
 576             crm_debug("Disconnection from Pacemaker Remote node %s complete",
 577                       lrm_state->node_name);
 578 
 579         } else if (!remote_ra_is_in_maintenance(lrm_state)) {
 580             crm_err("Lost connection to Pacemaker Remote node %s",
 581                     lrm_state->node_name);
 582             ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
 583             ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
 584 
 585         } else {
 586             crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
 587                        lrm_state->node_name);
 588             /* Do roughly what a 'stop' on the remote-resource would do */
 589             handle_remote_ra_stop(lrm_state, NULL);
 590             remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
 591             /* now fake the reply of a successful 'stop' */
 592             synthesize_lrmd_success(NULL, lrm_state->node_name, "stop");
 593         }
 594         return;
 595     }
 596 
 597     if (!ra_data->cur_cmd) {
 598         crm_debug("no event to match");
 599         return;
 600     }
 601 
 602     cmd = ra_data->cur_cmd;
 603 
 604     /* Start actions and migrate from actions complete after connection
 605      * comes back to us. */
 606     if (op->type == lrmd_event_connect && pcmk__strcase_any_of(cmd->action, "start",
 607                                                                "migrate_from", NULL)) {
 608         if (op->connection_rc < 0) {
 609             update_remaining_timeout(cmd);
 610 
 611             if (op->connection_rc == -ENOKEY) {
 612                 // Hard error, don't retry
 613                 pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
 614                                  PCMK_EXEC_ERROR,
 615                                  "Authentication key not readable");
 616 
 617             } else if (cmd->remaining_timeout > 3000) {
 618                 crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
 619                 g_timeout_add(1000, retry_start_cmd_cb, lrm_state);
 620                 return;
 621 
 622             } else {
 623                 crm_trace("can't reschedule start, remaining timeout too small %d",
 624                           cmd->remaining_timeout);
 625                 pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 626                                  PCMK_EXEC_TIMEOUT,
 627                                  pcmk_strerror(op->connection_rc));
 628             }
 629 
 630         } else {
 631             lrm_state_reset_tables(lrm_state, TRUE);
 632             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 633             ra_data->active = TRUE;
 634         }
 635 
 636         crm_debug("Remote connection event matched %s action", cmd->action);
 637         report_remote_ra_result(cmd);
 638         cmd_handled = TRUE;
 639 
 640     } else if (op->type == lrmd_event_poke && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
 641 
 642         if (cmd->monitor_timeout_id) {
 643             g_source_remove(cmd->monitor_timeout_id);
 644             cmd->monitor_timeout_id = 0;
 645         }
 646 
 647         /* Only report success the first time, after that only worry about failures.
 648          * For this function, if we get the poke pack, it is always a success. Pokes
 649          * only fail if the send fails, or the response times out. */
 650         if (!cmd->reported_success) {
 651             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 652             report_remote_ra_result(cmd);
 653             cmd->reported_success = 1;
 654         }
 655 
 656         crm_debug("Remote poke event matched %s action", cmd->action);
 657 
 658         /* success, keep rescheduling if interval is present. */
 659         if (cmd->interval_ms && (cmd->cancel == FALSE)) {
 660             ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
 661             cmd->interval_id = g_timeout_add(cmd->interval_ms,
 662                                              recurring_helper, cmd);
 663             cmd = NULL;         /* prevent free */
 664         }
 665         cmd_handled = TRUE;
 666 
 667     } else if (op->type == lrmd_event_disconnect && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
 668         if (ra_data->active == TRUE && (cmd->cancel == FALSE)) {
 669             pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 670                              PCMK_EXEC_ERROR,
 671                              "Remote connection unexpectedly dropped "
 672                              "during monitor");
 673             report_remote_ra_result(cmd);
 674             crm_err("Remote connection to %s unexpectedly dropped during monitor",
 675                     lrm_state->node_name);
 676         }
 677         cmd_handled = TRUE;
 678 
 679     } else if (op->type == lrmd_event_new_client && pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
 680 
 681         handle_remote_ra_stop(lrm_state, cmd);
 682         cmd_handled = TRUE;
 683 
 684     } else {
 685         crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
 686     }
 687 
 688     if (cmd_handled) {
 689         ra_data->cur_cmd = NULL;
 690         if (ra_data->cmds) {
 691             mainloop_set_trigger(ra_data->work);
 692         }
 693         free_cmd(cmd);
 694     }
 695 }
 696 
 697 static void
 698 handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 699 {
 700     remote_ra_data_t *ra_data = NULL;
 701 
 702     CRM_ASSERT(lrm_state);
 703     ra_data = lrm_state->remote_ra_data;
 704 
 705     if (ra_data->migrate_status != takeover_complete) {
 706         /* delete pending ops when ever the remote connection is intentionally stopped */
 707         g_hash_table_remove_all(lrm_state->pending_ops);
 708     } else {
 709         /* we no longer hold the history if this connection has been migrated,
 710          * however, we keep metadata cache for future use */
 711         lrm_state_reset_tables(lrm_state, FALSE);
 712     }
 713 
 714     ra_data->active = FALSE;
 715     lrm_state_disconnect(lrm_state);
 716 
 717     if (ra_data->cmds) {
 718         g_list_free_full(ra_data->cmds, free_cmd);
 719     }
 720     if (ra_data->recurring_cmds) {
 721         g_list_free_full(ra_data->recurring_cmds, free_cmd);
 722     }
 723     ra_data->cmds = NULL;
 724     ra_data->recurring_cmds = NULL;
 725     ra_data->cur_cmd = NULL;
 726 
 727     if (cmd) {
 728         pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 729         report_remote_ra_result(cmd);
 730     }
 731 }
 732 
 733 // \return Standard Pacemaker return code
 734 static int
 735 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
 736 {
 737     const char *server = NULL;
 738     lrmd_key_value_t *tmp = NULL;
 739     int port = 0;
 740     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 741     int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
 742     int rc = pcmk_rc_ok;
 743 
 744     for (tmp = cmd->params; tmp; tmp = tmp->next) {
 745         if (pcmk__strcase_any_of(tmp->key, XML_RSC_ATTR_REMOTE_RA_ADDR,
 746                                  XML_RSC_ATTR_REMOTE_RA_SERVER, NULL)) {
 747             server = tmp->value;
 748         } else if (pcmk__str_eq(tmp->key, XML_RSC_ATTR_REMOTE_RA_PORT, pcmk__str_casei)) {
 749             port = atoi(tmp->value);
 750         } else if (pcmk__str_eq(tmp->key, CRM_META "_" XML_RSC_ATTR_CONTAINER, pcmk__str_casei)) {
 751             ra_data->controlling_guest = TRUE;
 752         }
 753     }
 754 
 755     rc = controld_connect_remote_executor(lrm_state, server, port,
 756                                           timeout_used);
 757     if (rc != pcmk_rc_ok) {
 758         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 759                          PCMK_EXEC_ERROR, pcmk_rc_str(rc));
 760     }
 761     return rc;
 762 }
 763 
 764 static gboolean
 765 handle_remote_ra_exec(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 766 {
 767     int rc = 0;
 768     lrm_state_t *lrm_state = user_data;
 769     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 770     remote_ra_cmd_t *cmd;
 771     GList *first = NULL;
 772 
 773     if (ra_data->cur_cmd) {
 774         /* still waiting on previous cmd */
 775         return TRUE;
 776     }
 777 
 778     while (ra_data->cmds) {
 779         first = ra_data->cmds;
 780         cmd = first->data;
 781         if (cmd->delay_id) {
 782             /* still waiting for start delay timer to trip */
 783             return TRUE;
 784         }
 785 
 786         ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
 787         g_list_free_1(first);
 788 
 789         if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) {
 790             ra_data->migrate_status = 0;
 791             if (handle_remote_ra_start(lrm_state, cmd,
 792                                        cmd->timeout) == pcmk_rc_ok) {
 793                 /* take care of this later when we get async connection result */
 794                 crm_debug("Initiated async remote connection, %s action will complete after connect event",
 795                           cmd->action);
 796                 ra_data->cur_cmd = cmd;
 797                 return TRUE;
 798             }
 799             report_remote_ra_result(cmd);
 800 
 801         } else if (!strcmp(cmd->action, "monitor")) {
 802 
 803             if (lrm_state_is_connected(lrm_state) == TRUE) {
 804                 rc = lrm_state_poke_connection(lrm_state);
 805                 if (rc < 0) {
 806                     pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 807                                      PCMK_EXEC_ERROR, pcmk_strerror(rc));
 808                 }
 809             } else {
 810                 rc = -1;
 811                 pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
 812                                  PCMK_EXEC_DONE, "Remote connection inactive");
 813             }
 814 
 815             if (rc == 0) {
 816                 crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
 817                           cmd->rsc_id);
 818                 ra_data->cur_cmd = cmd;
 819                 cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd);
 820                 return TRUE;
 821             }
 822             report_remote_ra_result(cmd);
 823 
 824         } else if (!strcmp(cmd->action, "stop")) {
 825 
 826             if (ra_data->migrate_status == expect_takeover) {
 827                 /* briefly wait on stop for the takeover event to occur. If the
 828                  * takeover event does not occur during the wait period, that's fine.
 829                  * It just means that the remote-node's lrm_status section is going to get
 830                  * cleared which will require all the resources running in the remote-node
 831                  * to be explicitly re-detected via probe actions.  If the takeover does occur
 832                  * successfully, then we can leave the status section intact. */
 833                 cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd);
 834                 ra_data->cur_cmd = cmd;
 835                 return TRUE;
 836             }
 837 
 838             handle_remote_ra_stop(lrm_state, cmd);
 839 
 840         } else if (!strcmp(cmd->action, "migrate_to")) {
 841             ra_data->migrate_status = expect_takeover;
 842             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 843             report_remote_ra_result(cmd);
 844         } else if (pcmk__str_any_of(cmd->action, CRMD_ACTION_RELOAD,
 845                                     CRMD_ACTION_RELOAD_AGENT, NULL))  {
 846             /* Currently the only reloadable parameter is reconnect_interval,
 847              * which is only used by the scheduler via the CIB, so reloads are a
 848              * no-op.
 849              *
 850              * @COMPAT DC <2.1.0: We only need to check for "reload" in case
 851              * we're in a rolling upgrade with a DC scheduling "reload" instead
 852              * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway,
 853              * so this would work for that purpose as well.
 854              */
 855             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 856             report_remote_ra_result(cmd);
 857         }
 858 
 859         free_cmd(cmd);
 860     }
 861 
 862     return TRUE;
 863 }
 864 
 865 static void
 866 remote_ra_data_init(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 867 {
 868     remote_ra_data_t *ra_data = NULL;
 869 
 870     if (lrm_state->remote_ra_data) {
 871         return;
 872     }
 873 
 874     ra_data = calloc(1, sizeof(remote_ra_data_t));
 875     ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
 876     lrm_state->remote_ra_data = ra_data;
 877 }
 878 
 879 void
 880 remote_ra_cleanup(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 881 {
 882     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 883 
 884     if (!ra_data) {
 885         return;
 886     }
 887 
 888     if (ra_data->cmds) {
 889         g_list_free_full(ra_data->cmds, free_cmd);
 890     }
 891 
 892     if (ra_data->recurring_cmds) {
 893         g_list_free_full(ra_data->recurring_cmds, free_cmd);
 894     }
 895     mainloop_destroy_trigger(ra_data->work);
 896     free(ra_data);
 897     lrm_state->remote_ra_data = NULL;
 898 }
 899 
 900 gboolean
 901 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
     /* [previous][next][first][last][top][bottom][index][help] */
 902 {
 903     if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
 904         return TRUE;
 905     }
 906     if (id && lrm_state_find(id) && !pcmk__str_eq(id, fsa_our_uname, pcmk__str_casei)) {
 907         return TRUE;
 908     }
 909 
 910     return FALSE;
 911 }
 912 
 913 lrmd_rsc_info_t *
 914 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
     /* [previous][next][first][last][top][bottom][index][help] */
 915 {
 916     lrmd_rsc_info_t *info = NULL;
 917 
 918     if ((lrm_state_find(rsc_id))) {
 919         info = calloc(1, sizeof(lrmd_rsc_info_t));
 920 
 921         info->id = strdup(rsc_id);
 922         info->type = strdup(REMOTE_LRMD_RA);
 923         info->standard = strdup(PCMK_RESOURCE_CLASS_OCF);
 924         info->provider = strdup("pacemaker");
 925     }
 926 
 927     return info;
 928 }
 929 
 930 static gboolean
 931 is_remote_ra_supported_action(const char *action)
     /* [previous][next][first][last][top][bottom][index][help] */
 932 {
 933     return pcmk__str_any_of(action,
 934                             CRMD_ACTION_START,
 935                             CRMD_ACTION_STOP,
 936                             CRMD_ACTION_STATUS,
 937                             CRMD_ACTION_MIGRATE,
 938                             CRMD_ACTION_MIGRATED,
 939                             CRMD_ACTION_RELOAD_AGENT,
 940                             CRMD_ACTION_RELOAD,
 941                             NULL);
 942 }
 943 
 944 static GList *
 945 fail_all_monitor_cmds(GList * list)
     /* [previous][next][first][last][top][bottom][index][help] */
 946 {
 947     GList *rm_list = NULL;
 948     remote_ra_cmd_t *cmd = NULL;
 949     GList *gIter = NULL;
 950 
 951     for (gIter = list; gIter != NULL; gIter = gIter->next) {
 952         cmd = gIter->data;
 953         if ((cmd->interval_ms > 0) && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
 954             rm_list = g_list_append(rm_list, cmd);
 955         }
 956     }
 957 
 958     for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
 959         cmd = gIter->data;
 960 
 961         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 962                          PCMK_EXEC_ERROR, "Lost connection to remote executor");
 963         crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
 964                   cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
 965         report_remote_ra_result(cmd);
 966 
 967         list = g_list_remove(list, cmd);
 968         free_cmd(cmd);
 969     }
 970 
 971     /* frees only the list data, not the cmds */
 972     g_list_free(rm_list);
 973     return list;
 974 }
 975 
 976 static GList *
 977 remove_cmd(GList * list, const char *action, guint interval_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
 978 {
 979     remote_ra_cmd_t *cmd = NULL;
 980     GList *gIter = NULL;
 981 
 982     for (gIter = list; gIter != NULL; gIter = gIter->next) {
 983         cmd = gIter->data;
 984         if ((cmd->interval_ms == interval_ms)
 985             && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
 986             break;
 987         }
 988         cmd = NULL;
 989     }
 990     if (cmd) {
 991         list = g_list_remove(list, cmd);
 992         free_cmd(cmd);
 993     }
 994     return list;
 995 }
 996 
 997 int
 998 remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
 999                  const char *action, guint interval_ms)
1000 {
1001     lrm_state_t *connection_rsc = NULL;
1002     remote_ra_data_t *ra_data = NULL;
1003 
1004     connection_rsc = lrm_state_find(rsc_id);
1005     if (!connection_rsc || !connection_rsc->remote_ra_data) {
1006         return -EINVAL;
1007     }
1008 
1009     ra_data = connection_rsc->remote_ra_data;
1010     ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1011     ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1012                                          interval_ms);
1013     if (ra_data->cur_cmd &&
1014         (ra_data->cur_cmd->interval_ms == interval_ms) &&
1015         (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1016 
1017         ra_data->cur_cmd->cancel = TRUE;
1018     }
1019 
1020     return 0;
1021 }
1022 
1023 static remote_ra_cmd_t *
1024 handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
     /* [previous][next][first][last][top][bottom][index][help] */
1025                    const char *userdata)
1026 {
1027     GList *gIter = NULL;
1028     remote_ra_cmd_t *cmd = NULL;
1029 
1030     /* there are 3 places a potential duplicate monitor operation
1031      * could exist.
1032      * 1. recurring_cmds list. where the op is waiting for its next interval
1033      * 2. cmds list, where the op is queued to get executed immediately
1034      * 3. cur_cmd, which means the monitor op is in flight right now.
1035      */
1036     if (interval_ms == 0) {
1037         return NULL;
1038     }
1039 
1040     if (ra_data->cur_cmd &&
1041         ra_data->cur_cmd->cancel == FALSE &&
1042         (ra_data->cur_cmd->interval_ms == interval_ms) &&
1043         pcmk__str_eq(ra_data->cur_cmd->action, "monitor", pcmk__str_casei)) {
1044 
1045         cmd = ra_data->cur_cmd;
1046         goto handle_dup;
1047     }
1048 
1049     for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
1050         cmd = gIter->data;
1051         if ((cmd->interval_ms == interval_ms)
1052             && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
1053             goto handle_dup;
1054         }
1055     }
1056 
1057     for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
1058         cmd = gIter->data;
1059         if ((cmd->interval_ms == interval_ms)
1060             && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
1061             goto handle_dup;
1062         }
1063     }
1064 
1065     return NULL;
1066 
1067 handle_dup:
1068 
1069     crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
1070               cmd->rsc_id, "monitor", interval_ms);
1071 
1072     /* update the userdata */
1073     if (userdata) {
1074        free(cmd->userdata);
1075        cmd->userdata = strdup(userdata);
1076     }
1077 
1078     /* if we've already reported success, generate a new call id */
1079     if (cmd->reported_success) {
1080         cmd->start_time = time(NULL);
1081         cmd->call_id = generate_callid();
1082         cmd->reported_success = 0;
1083     }
1084 
1085     /* if we have an interval_id set, that means we are in the process of
1086      * waiting for this cmd's next interval. instead of waiting, cancel
1087      * the timer and execute the action immediately */
1088     if (cmd->interval_id) {
1089         g_source_remove(cmd->interval_id);
1090         cmd->interval_id = 0;
1091         recurring_helper(cmd);
1092     }
1093 
1094     return cmd;
1095 }
1096 
1097 /*!
1098  * \internal
1099  * \brief Execute an action using the (internal) ocf:pacemaker:remote agent
1100  *
1101  * \param[in]  lrm_state       Executor state object for remote connection
1102  * \param[in]  rsc_id          Connection resource ID
1103  * \param[in]  action          Action to execute
1104  * \param[in]  userdata        String to copy and pass to execution callback
1105  * \param[in]  interval_ms     Action interval (in milliseconds)
1106  * \param[in]  timeout_ms      Action timeout (in milliseconds)
1107  * \param[in]  start_delay_ms  Delay (in milliseconds) before initiating action
1108  * \param[in]  params          Connection resource parameters
1109  * \param[out] call_id         Where to store call ID on success
1110  *
1111  * \return Standard Pacemaker return code
1112  * \note This takes ownership of \p params, which should not be used or freed
1113  *       after calling this function.
1114  */
1115 int
1116 controld_execute_remote_agent(lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
1117                               const char *action, const char *userdata,
1118                               guint interval_ms, int timeout_ms,
1119                               int start_delay_ms, lrmd_key_value_t *params,
1120                               int *call_id)
1121 {
1122     lrm_state_t *connection_rsc = NULL;
1123     remote_ra_cmd_t *cmd = NULL;
1124     remote_ra_data_t *ra_data = NULL;
1125 
1126     *call_id = 0;
1127 
1128     CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
1129               && (userdata != NULL) && (call_id != NULL),
1130               lrmd_key_value_freeall(params); return EINVAL);
1131 
1132     if (!is_remote_ra_supported_action(action)) {
1133         lrmd_key_value_freeall(params);
1134         return EOPNOTSUPP;
1135     }
1136 
1137     connection_rsc = lrm_state_find(rsc_id);
1138     if (connection_rsc == NULL) {
1139         lrmd_key_value_freeall(params);
1140         return ENOTCONN;
1141     }
1142 
1143     remote_ra_data_init(connection_rsc);
1144     ra_data = connection_rsc->remote_ra_data;
1145 
1146     cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1147     if (cmd) {
1148         *call_id = cmd->call_id;
1149         lrmd_key_value_freeall(params);
1150         return pcmk_rc_ok;
1151     }
1152 
1153     cmd = calloc(1, sizeof(remote_ra_cmd_t));
1154     if (cmd == NULL) {
1155         lrmd_key_value_freeall(params);
1156         return ENOMEM;
1157     }
1158 
1159     cmd->owner = strdup(lrm_state->node_name);
1160     cmd->rsc_id = strdup(rsc_id);
1161     cmd->action = strdup(action);
1162     cmd->userdata = strdup(userdata);
1163     if ((cmd->owner == NULL) || (cmd->rsc_id == NULL) || (cmd->action == NULL)
1164         || (cmd->userdata == NULL)) {
1165         free_cmd(cmd);
1166         lrmd_key_value_freeall(params);
1167         return ENOMEM;
1168     }
1169 
1170     cmd->interval_ms = interval_ms;
1171     cmd->timeout = timeout_ms;
1172     cmd->start_delay = start_delay_ms;
1173     cmd->params = params;
1174     cmd->start_time = time(NULL);
1175 
1176     cmd->call_id = generate_callid();
1177 
1178     if (cmd->start_delay) {
1179         cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
1180     }
1181 
1182     ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1183     mainloop_set_trigger(ra_data->work);
1184 
1185     *call_id = cmd->call_id;
1186     return pcmk_rc_ok;
1187 }
1188 
1189 /*!
1190  * \internal
1191  * \brief Immediately fail all monitors of a remote node, if proxied here
1192  *
1193  * \param[in] node_name  Name of pacemaker_remote node
1194  */
1195 void
1196 remote_ra_fail(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
1197 {
1198     lrm_state_t *lrm_state = lrm_state_find(node_name);
1199 
1200     if (lrm_state && lrm_state_is_connected(lrm_state)) {
1201         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1202 
1203         crm_info("Failing monitors on Pacemaker Remote node %s", node_name);
1204         ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1205         ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1206     }
1207 }
1208 
/* A guest node fencing implied by host fencing looks like:
 *
 *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
 *                on_node="lxc1" on_node_uuid="lxc1">
 *     <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
 *                 CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
 *     <downed>
 *       <node id="lxc1"/>
 *     </downed>
 *  </pseudo_event>
 *
 * The XPath below selects the node entries listed as downed by such a
 * pseudo-event, anywhere in the searched XML. (Going by the example above, it
 * presumably expands to //pseudo_event[@operation='stonith']/downed/node;
 * confirm against the XML_* constant definitions.)
 */
#define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
    "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
    "/" XML_CIB_TAG_NODE
1223 
1224 /*!
1225  * \internal
1226  * \brief Check a pseudo-action for Pacemaker Remote node side effects
1227  *
1228  * \param[in] xml  XML of pseudo-action to check
1229  */
1230 void
1231 remote_ra_process_pseudo(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
1232 {
1233     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
1234 
1235     if (numXpathResults(search) == 1) {
1236         xmlNode *result = getXpathResult(search, 0);
1237 
1238         /* Normally, we handle the necessary side effects of a guest node stop
1239          * action when reporting the remote agent's result. However, if the stop
1240          * is implied due to fencing, it will be a fencing pseudo-event, and
1241          * there won't be a result to report. Handle that case here.
1242          *
1243          * This will result in a duplicate call to remote_node_down() if the
1244          * guest stop was real instead of implied, but that shouldn't hurt.
1245          *
1246          * There is still one corner case that isn't handled: if a guest node
1247          * isn't running any resources when its host is fenced, it will appear
1248          * to be cleanly stopped, so there will be no pseudo-fence, and our
1249          * peer cache state will be incorrect unless and until the guest is
1250          * recovered.
1251          */
1252         if (result) {
1253             const char *remote = ID(result);
1254 
1255             if (remote) {
1256                 remote_node_down(remote, DOWN_ERASE_LRM);
1257             }
1258         }
1259     }
1260     freeXpathObject(search);
1261 }
1262 
1263 static void
1264 remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
     /* [previous][next][first][last][top][bottom][index][help] */
1265 {
1266     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1267     xmlNode *update, *state;
1268     int call_opt, call_id = 0;
1269     crm_node_t *node;
1270 
1271     call_opt = crmd_cib_smart_opt();
1272     node = crm_remote_peer_get(lrm_state->node_name);
1273     CRM_CHECK(node != NULL, return);
1274     update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
1275     state = create_node_state_update(node, node_update_none, update,
1276                                      __func__);
1277     crm_xml_add(state, XML_NODE_IS_MAINTENANCE, maintenance?"1":"0");
1278     fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
1279     if (call_id < 0) {
1280         crm_perror(LOG_WARNING, "%s CIB node state update failed", lrm_state->node_name);
1281     } else {
1282         /* TODO: still not 100% sure that async update will succeed ... */
1283         ra_data->is_maintenance = maintenance;
1284     }
1285     free_xml(update);
1286 }
1287 
/* XPath selecting the maintenance element (XML_GRAPH_TAG_MAINTENANCE) that is
 * a direct child of a pseudo-event whose task attribute equals
 * CRM_OP_MAINTENANCE_NODES, anywhere in the searched XML
 */
#define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
    "[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \
    XML_GRAPH_TAG_MAINTENANCE
1291 
1292 /*!
1293  * \internal
1294  * \brief Check a pseudo-action holding updates for maintenance state
1295  *
1296  * \param[in] xml  XML of pseudo-action to check
1297  */
1298 
1299 void
1300 remote_ra_process_maintenance_nodes(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
1301 {
1302     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);
1303 
1304     if (numXpathResults(search) == 1) {
1305         xmlNode *node;
1306         int cnt = 0, cnt_remote = 0;
1307 
1308         for (node =
1309                 first_named_child(getXpathResult(search, 0), XML_CIB_TAG_NODE);
1310             node != NULL; node = pcmk__xml_next(node)) {
1311             lrm_state_t *lrm_state = lrm_state_find(ID(node));
1312 
1313             cnt++;
1314             if (lrm_state && lrm_state->remote_ra_data &&
1315                 ((remote_ra_data_t *) lrm_state->remote_ra_data)->active) {
1316                 int is_maint;
1317 
1318                 cnt_remote++;
1319                 pcmk__scan_min_int(crm_element_value(node, XML_NODE_IS_MAINTENANCE),
1320                                    &is_maint, 0);
1321                 remote_ra_maintenance(lrm_state, is_maint);
1322             }
1323         }
1324         crm_trace("Action holds %d nodes (%d remotes found) "
1325                     "adjusting maintenance-mode", cnt, cnt_remote);
1326     }
1327     freeXpathObject(search);
1328 }
1329 
1330 gboolean
1331 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1332 {
1333     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1334 
1335     return ra_data->is_maintenance;
1336 }
1337 
1338 gboolean
1339 remote_ra_controlling_guest(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1340 {
1341     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1342 
1343     return ra_data->controlling_guest;
1344 }

/* [previous][next][first][last][top][bottom][index][help] */