daemons/controld/controld_remote_ra.c

DEFINITIONS

This source file includes the following definitions:
  1. free_cmd
  2. generate_callid
  3. recurring_helper
  4. start_delay_helper
  5. remote_node_up
  6. remote_node_down
  7. check_remote_node_state
  8. report_remote_ra_result
  9. update_remaining_timeout
  10. retry_start_cmd_cb
  11. connection_takeover_timeout_cb
  12. monitor_timeout_cb
  13. synthesize_lrmd_success
  14. remote_lrm_op_callback
  15. handle_remote_ra_stop
  16. handle_remote_ra_start
  17. handle_remote_ra_exec
  18. remote_ra_data_init
  19. remote_ra_cleanup
  20. is_remote_lrmd_ra
  21. remote_ra_get_rsc_info
  22. is_remote_ra_supported_action
  23. fail_all_monitor_cmds
  24. remove_cmd
  25. remote_ra_cancel
  26. handle_dup_monitor
  27. remote_ra_exec
  28. remote_ra_fail
  29. remote_ra_process_pseudo
  30. remote_ra_maintenance
  31. remote_ra_process_maintenance_nodes
  32. remote_ra_is_in_maintenance
  33. remote_ra_controlling_guest

/*
 * Copyright 2013-2020 the Pacemaker project contributors
 *
 * The version control history for this file may have further details.
 *
 * This source code is licensed under the GNU General Public License version 2
 * or later (GPLv2+) WITHOUT ANY WARRANTY.
 */

#include <crm_internal.h>

#include <crm/crm.h>
#include <crm/msg_xml.h>
#include <crm/common/xml_internal.h>
#include <crm/lrmd.h>
#include <crm/services.h>

#include <pacemaker-controld.h>

#define REMOTE_LRMD_RA "remote"

/* The max start timeout before cmd retry */
#define MAX_START_TIMEOUT_MS 10000

typedef struct remote_ra_cmd_s {
    /*! the local node the cmd is issued from */
    char *owner;
    /*! the remote node the cmd is executed on */
    char *rsc_id;
    /*! the action to execute */
    char *action;
    /*! some string the client wants us to give it back */
    char *userdata;
    char *exit_reason;          // descriptive text on error
    /*! start delay in ms */
    int start_delay;
    /*! timer id used for start delay. */
    int delay_id;
    /*! timeout in ms for cmd */
    int timeout;
    int remaining_timeout;
    /*! recurring interval in ms */
    guint interval_ms;
    /*! interval timer id */
    int interval_id;
    int reported_success;
    int monitor_timeout_id;
    int takeover_timeout_id;
    /*! action parameters */
    lrmd_key_value_t *params;
    /*! executed rc */
    int rc;
    int op_status;
    int call_id;
    time_t start_time;
    gboolean cancel;
} remote_ra_cmd_t;

enum remote_migration_status {
    expect_takeover = 1,
    takeover_complete,
};

typedef struct remote_ra_data_s {
    crm_trigger_t *work;
    remote_ra_cmd_t *cur_cmd;
    GList *cmds;
    GList *recurring_cmds;

    enum remote_migration_status migrate_status;

    gboolean active;

    /* Maintenance mode is difficult to determine from the controller's context,
     * so we have it signalled back with the transition from the scheduler.
     */
    gboolean is_maintenance;

    /* Similarly, whether we are controlling a guest node or a remote node is
     * hard to determine here. Fortunately there is a meta-attribute in the
     * transition already, and since the situation doesn't change over time,
     * we can note the information down at resource start for later use when
     * the attributes aren't at hand.
     */
    gboolean controlling_guest;
} remote_ra_data_t;

static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
static GList *fail_all_monitor_cmds(GList * list);

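/*!
 * \internal
 * \brief Free a remote RA command, canceling any timers it still holds
 *
 * \param[in] user_data  Command to free (remote_ra_cmd_t *)
 */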
static void
free_cmd(gpointer user_data)
{
    remote_ra_cmd_t *cmd = user_data;

    if (!cmd) {
        return;
    }
    if (cmd->delay_id) {
        g_source_remove(cmd->delay_id);
    }
    if (cmd->interval_id) {
        g_source_remove(cmd->interval_id);
    }
    if (cmd->monitor_timeout_id) {
        g_source_remove(cmd->monitor_timeout_id);
    }
    if (cmd->takeover_timeout_id) {
        g_source_remove(cmd->takeover_timeout_id);
    }
    free(cmd->owner);
    free(cmd->rsc_id);
    free(cmd->action);
    free(cmd->userdata);
    free(cmd->exit_reason);
    lrmd_key_value_freeall(cmd->params);
    free(cmd);
}

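/*!
 * \internal
 * \brief Generate a call ID for a remote RA command
 *
 * \return Next call ID (a positive integer, wrapping back to 1 on overflow)
 */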
static int
generate_callid(void)
{
    static int remote_ra_callid = 0;

    remote_ra_callid++;
    if (remote_ra_callid <= 0) {
        remote_ra_callid = 1;
    }

    return remote_ra_callid;
}

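/*!
 * \internal
 * \brief Timer callback that moves a recurring command back to the work queue
 *
 * \param[in] data  Recurring command (remote_ra_cmd_t *) whose interval expired
 *
 * \return FALSE (to tell glib not to re-run this timer)
 */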
static gboolean
recurring_helper(gpointer data)
{
    remote_ra_cmd_t *cmd = data;
    lrm_state_t *connection_rsc = NULL;

    cmd->interval_id = 0;
    connection_rsc = lrm_state_find(cmd->rsc_id);
    if (connection_rsc && connection_rsc->remote_ra_data) {
        remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;

        ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);

        ra_data->cmds = g_list_append(ra_data->cmds, cmd);
        mainloop_set_trigger(ra_data->work);
    }
    return FALSE;
}

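/*!
 * \internal
 * \brief Timer callback that kicks the work queue after a start delay expires
 *
 * \param[in] data  Delayed command (remote_ra_cmd_t *)
 *
 * \return FALSE (to tell glib not to re-run this timer)
 */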
static gboolean
start_delay_helper(gpointer data)
{
    remote_ra_cmd_t *cmd = data;
    lrm_state_t *connection_rsc = NULL;

    cmd->delay_id = 0;
    connection_rsc = lrm_state_find(cmd->rsc_id);
    if (connection_rsc && connection_rsc->remote_ra_data) {
        remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;

        mainloop_set_trigger(ra_data->work);
    }
    return FALSE;
}

/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node joining
 *
 * \param[in] node_name  Name of newly integrated pacemaker_remote node
 */
static void
remote_node_up(const char *node_name)
{
    int call_opt, call_id = 0;
    xmlNode *update, *state;
    crm_node_t *node;
    enum controld_section_e section = controld_section_all;

    CRM_CHECK(node_name != NULL, return);
    crm_info("Announcing pacemaker_remote node %s", node_name);

    /* Clear node's entire state (resource history and transient attributes)
     * other than shutdown locks. The transient attributes should and normally
     * will be cleared when the node leaves, but since remote node state has a
     * number of corner cases, clear them here as well, to be sure.
     */
    call_opt = crmd_cib_smart_opt();
    if (controld_shutdown_lock_enabled) {
        section = controld_section_all_unlocked;
    }
    controld_delete_node_state(node_name, section, call_opt);

    /* Clear node's probed attribute */
    update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);

    /* Ensure node is in the remote peer cache with member status */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    crm_update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);

    /* pacemaker_remote nodes don't participate in the membership layer,
     * so cluster nodes don't automatically get notified when they come and go.
     * We send a cluster message to the DC, and update the CIB node state entry,
     * so the DC will get it sooner (via message) or later (via CIB refresh),
     * and any other interested parties can query the CIB.
     */
    send_remote_state_message(node_name, TRUE);

    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    state = create_node_state_update(node, node_update_cluster, update,
                                     __func__);

    /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever
     * needs to be fenced, this flag will allow various actions to determine
     * whether the fencing has happened yet.
     */
    crm_xml_add(state, XML_NODE_IS_FENCED, "0");

    /* TODO: If the remote connection drops, and this (async) CIB update either
     * failed or has not yet completed, later actions could mistakenly think the
     * node has already been fenced (if the XML_NODE_IS_FENCED attribute was
     * previously set, because it won't have been cleared). This could prevent
     * actual fencing or allow recurring monitor failures to be cleared too
     * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
     */
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_WARNING, "%s CIB node state setup", node_name);
    }
    free_xml(update);
}

enum down_opts {
    DOWN_KEEP_LRM,
    DOWN_ERASE_LRM
};

/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node leaving
 *
 * \param[in] node_name  Name of lost node
 * \param[in] opts       Whether to keep or erase LRM history
 */
static void
remote_node_down(const char *node_name, const enum down_opts opts)
{
    xmlNode *update;
    int call_id = 0;
    int call_opt = crmd_cib_smart_opt();
    crm_node_t *node;

    /* Purge node from attrd's memory */
    update_attrd_remote_node_removed(node_name, NULL);

    /* Normally, only node attributes should be erased, and the resource history
     * should be kept until the node comes back up. However, after a successful
     * fence, we want to clear the history as well, so we don't think resources
     * are still running on the node.
     */
    if (opts == DOWN_ERASE_LRM) {
        controld_delete_node_state(node_name, controld_section_all, call_opt);
    } else {
        controld_delete_node_state(node_name, controld_section_attrs, call_opt);
    }

    /* Ensure node is in the remote peer cache with lost state */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    crm_update_peer_state(__func__, node, CRM_NODE_LOST, 0);

    /* Notify DC */
    send_remote_state_message(node_name, FALSE);

    /* Update CIB node state */
    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    create_node_state_update(node, node_update_cluster, update, __func__);
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_ERR, "%s CIB node state update", node_name);
    }
    free_xml(update);
}

/*!
 * \internal
 * \brief Handle effects of a remote RA command on node state
 *
 * \param[in] cmd  Completed remote RA command
 */
static void
check_remote_node_state(remote_ra_cmd_t *cmd)
{
    /* Only successful actions can change node state */
    if (cmd->rc != PCMK_OCF_OK) {
        return;
    }

    if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
        remote_node_up(cmd->rsc_id);

    } else if (pcmk__str_eq(cmd->action, "migrate_from", pcmk__str_casei)) {
        /* After a successful migration, we don't need to do remote_node_up()
         * because the DC already knows the node is up, and we don't want to
         * clear LRM history etc. We do need to add the remote node to this
         * host's remote peer cache, because (unless it happens to be DC)
         * it hasn't been tracking the remote node, and other code relies on
         * the cache to distinguish remote nodes from unseen cluster nodes.
         */
        crm_node_t *node = crm_remote_peer_get(cmd->rsc_id);

        CRM_CHECK(node != NULL, return);
        crm_update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);

    } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
        lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
        remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;

        if (ra_data) {
            if (ra_data->migrate_status != takeover_complete) {
                /* Stop means down if we didn't successfully migrate elsewhere */
                remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
            } else if (AM_I_DC == FALSE) {
                /* Only the connection host and DC track node state,
                 * so if the connection migrated elsewhere and we aren't DC,
                 * un-cache the node, so we don't have stale info
                 */
                crm_remote_peer_cache_remove(cmd->rsc_id);
            }
        }
    }

    /* We don't do anything for successful monitors, which is correct for
     * routine recurring monitors, and for monitors on nodes where the
     * connection isn't supposed to be (the cluster will stop the connection in
     * that case). However, if the initial probe finds the connection already
     * active on the node where we want it, we probably should do
     * remote_node_up(). Unfortunately, we can't distinguish that case here.
     * Given that connections have to be initiated by the cluster, the chance of
     * that should be close to zero.
     */
}

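/*!
 * \internal
 * \brief Report a remote RA command's result as an executor event
 *
 * \param[in] cmd  Completed remote RA command
 */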
static void
report_remote_ra_result(remote_ra_cmd_t * cmd)
{
    lrmd_event_data_t op = { 0, };

    check_remote_node_state(cmd);

    op.type = lrmd_event_exec_complete;
    op.rsc_id = cmd->rsc_id;
    op.op_type = cmd->action;
    op.user_data = cmd->userdata;
    op.exit_reason = cmd->exit_reason;
    op.timeout = cmd->timeout;
    op.interval_ms = cmd->interval_ms;
    op.rc = cmd->rc;
    op.op_status = cmd->op_status;
    op.t_run = (unsigned int) cmd->start_time;
    op.t_rcchange = (unsigned int) cmd->start_time;
    if (cmd->reported_success && cmd->rc != PCMK_OCF_OK) {
        op.t_rcchange = (unsigned int) time(NULL);
        /* This edge case will likely never occur, but if it does, the result
         * is that a failure will not be processed correctly. It is only
         * remotely possible because we can detect a connection resource's TCP
         * connection failing at any moment after start has completed. The
         * actual recurring operation is just a connectivity ping.
         *
         * Basically, we are not guaranteed that the first successful monitor
         * op and a subsequent failed monitor op will not share the same
         * timestamp. We have to make it look like the operations occurred at
         * separate times, though. */
        if (op.t_rcchange == op.t_run) {
            op.t_rcchange++;
        }
    }

    if (cmd->params) {
        lrmd_key_value_t *tmp;

        op.params = crm_str_table_new();
        for (tmp = cmd->params; tmp; tmp = tmp->next) {
            g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value));
        }

    }
    op.call_id = cmd->call_id;
    op.remote_nodename = cmd->owner;

    lrm_op_callback(&op);

    if (op.params) {
        g_hash_table_destroy(op.params);
    }
}

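/*!
 * \internal
 * \brief Recalculate how much of a command's timeout remains
 *
 * \param[in] cmd  Command whose remaining_timeout (in ms) should be updated
 */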
static void
update_remaining_timeout(remote_ra_cmd_t * cmd)
{
    cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
}

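/*!
 * \internal
 * \brief Timer callback that retries an in-flight start or migrate_from
 *
 * Retries only if the action's timeout has not yet expired; otherwise the
 * action is reported as failed.
 *
 * \param[in] data  LRM state (lrm_state_t *) of the connection resource
 *
 * \return FALSE (to tell glib not to re-run this timer)
 */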
static gboolean
retry_start_cmd_cb(gpointer data)
{
    lrm_state_t *lrm_state = data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd = NULL;
    int rc = -1;

    if (!ra_data || !ra_data->cur_cmd) {
        return FALSE;
    }
    cmd = ra_data->cur_cmd;
    if (!pcmk__strcase_any_of(cmd->action, "start", "migrate_from", NULL)) {
        return FALSE;
    }
    update_remaining_timeout(cmd);

    if (cmd->remaining_timeout > 0) {
        rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
    }

    if (rc != 0) {
        cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
        cmd->op_status = PCMK_LRM_OP_ERROR;
        report_remote_ra_result(cmd);

        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        ra_data->cur_cmd = NULL;
        free_cmd(cmd);
    } else {
        /* wait for connection event */
    }

    return FALSE;
}

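/*!
 * \internal
 * \brief Timer callback fired when an expected takeover never arrives
 *
 * Treats the pending stop command as an ordinary stop of the connection.
 *
 * \param[in] data  Pending stop command (remote_ra_cmd_t *)
 *
 * \return FALSE (to tell glib not to re-run this timer)
 */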
static gboolean
connection_takeover_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    crm_info("takeover event timed out for node %s", cmd->rsc_id);
    cmd->takeover_timeout_id = 0;

    lrm_state = lrm_state_find(cmd->rsc_id);

    handle_remote_ra_stop(lrm_state, cmd);
    free_cmd(cmd);

    return FALSE;
}

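/*!
 * \internal
 * \brief Timer callback fired when a remote poke response never arrives
 *
 * Reports the pending monitor as timed out and drops the remote connection.
 *
 * \param[in] data  Pending monitor command (remote_ra_cmd_t *)
 *
 * \return FALSE (to tell glib not to re-run this timer)
 */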
static gboolean
monitor_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    lrm_state = lrm_state_find(cmd->rsc_id);

    crm_info("Timed out waiting for remote poke response from %s%s",
             cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
    cmd->monitor_timeout_id = 0;
    cmd->op_status = PCMK_LRM_OP_TIMEOUT;
    cmd->rc = PCMK_OCF_UNKNOWN_ERROR;

    if (lrm_state && lrm_state->remote_ra_data) {
        remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

        if (ra_data->cur_cmd == cmd) {
            ra_data->cur_cmd = NULL;
        }
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
    }

    report_remote_ra_result(cmd);
    free_cmd(cmd);

    if (lrm_state) {
        lrm_state_disconnect(lrm_state);
    }
    return FALSE;
}

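/*!
 * \internal
 * \brief Process a fake, successful executor event for a resource action
 *
 * \param[in] lrm_state  LRM state to use (NULL means the local node's)
 * \param[in] rsc_id     ID of resource the event is for
 * \param[in] op_type    Name of action the event is for
 */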
static void
synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
{
    lrmd_event_data_t op = { 0, };

    if (lrm_state == NULL) {
        /* if lrm_state not given, assume local */
        lrm_state = lrm_state_find(fsa_our_uname);
    }
    CRM_ASSERT(lrm_state != NULL);

    op.type = lrmd_event_exec_complete;
    op.rsc_id = rsc_id;
    op.op_type = op_type;
    op.rc = PCMK_OCF_OK;
    op.op_status = PCMK_LRM_OP_DONE;
    op.t_run = (unsigned int) time(NULL);
    op.t_rcchange = op.t_run;
    op.call_id = generate_callid();
    process_lrm_event(lrm_state, &op, NULL, NULL);
}

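/*!
 * \internal
 * \brief Handle an event from a remote executor connection
 *
 * Matches connection-level events (connect, poke, disconnect, new client)
 * against the command currently in flight for that connection, and passes
 * ordinary execution results up to the usual executor callback.
 *
 * \param[in] op  Event to handle
 */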
void
remote_lrm_op_callback(lrmd_event_data_t * op)
{
    gboolean cmd_handled = FALSE;
    lrm_state_t *lrm_state = NULL;
    remote_ra_data_t *ra_data = NULL;
    remote_ra_cmd_t *cmd = NULL;

    crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
              "(%d) status=%s (%d)",
              (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
              lrmd_event_type2str(op->type), op->remote_nodename,
              services_ocf_exitcode_str(op->rc), op->rc,
              services_lrm_status_str(op->op_status), op->op_status);

    lrm_state = lrm_state_find(op->remote_nodename);
    if (!lrm_state || !lrm_state->remote_ra_data) {
        crm_debug("No state information found for remote connection event");
        return;
    }
    ra_data = lrm_state->remote_ra_data;

    if (op->type == lrmd_event_new_client) {
        // Another client has connected to the remote daemon

        if (ra_data->migrate_status == expect_takeover) {
            // Great, we knew this was coming
            ra_data->migrate_status = takeover_complete;

        } else {
            crm_err("Unexpected pacemaker_remote client takeover for %s. Disconnecting", op->remote_nodename);
            /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
            /* Do not free lrm_state->conn yet. */
            /* It'll be freed in the following stop action. */
            lrm_state_disconnect_only(lrm_state);
        }
        return;
    }

    /* filter all EXEC events up */
    if (op->type == lrmd_event_exec_complete) {
        if (ra_data->migrate_status == takeover_complete) {
            crm_debug("ignoring event, this connection is taken over by another node");
        } else {
            lrm_op_callback(op);
        }
        return;
    }

    if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {

        if (ra_data->active == FALSE) {
            crm_debug("Disconnection from Pacemaker Remote node %s complete",
                      lrm_state->node_name);

        } else if (!remote_ra_is_in_maintenance(lrm_state)) {
            crm_err("Lost connection to Pacemaker Remote node %s",
                    lrm_state->node_name);
            ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
            ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);

        } else {
            crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
                       lrm_state->node_name);
            /* Do roughly what a 'stop' on the remote-resource would do */
            handle_remote_ra_stop(lrm_state, NULL);
            remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
            /* now fake the reply of a successful 'stop' */
            synthesize_lrmd_success(NULL, lrm_state->node_name, "stop");
        }
        return;
    }

    if (!ra_data->cur_cmd) {
        crm_debug("no event to match");
        return;
    }

    cmd = ra_data->cur_cmd;

    /* Start actions and migrate_from actions complete after the connection
     * comes back to us. */
    if (op->type == lrmd_event_connect && pcmk__strcase_any_of(cmd->action, "start",
                                                               "migrate_from", NULL)) {
        if (op->connection_rc < 0) {
            update_remaining_timeout(cmd);

            if (op->connection_rc == -ENOKEY) {
                // Hard error, don't retry
                cmd->op_status = PCMK_LRM_OP_ERROR;
                cmd->rc = PCMK_OCF_INVALID_PARAM;
                cmd->exit_reason = strdup("Authentication key not readable");

            } else if (cmd->remaining_timeout > 3000) {
                crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
                g_timeout_add(1000, retry_start_cmd_cb, lrm_state);
                return;

            } else {
                crm_trace("can't reschedule start, remaining timeout too small %d",
                          cmd->remaining_timeout);
                cmd->op_status = PCMK_LRM_OP_TIMEOUT;
                cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
            }

        } else {
            lrm_state_reset_tables(lrm_state, TRUE);
            cmd->rc = PCMK_OCF_OK;
            cmd->op_status = PCMK_LRM_OP_DONE;
            ra_data->active = TRUE;
        }

        crm_debug("Remote connection event matched %s action", cmd->action);
        report_remote_ra_result(cmd);
        cmd_handled = TRUE;

    } else if (op->type == lrmd_event_poke && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {

        if (cmd->monitor_timeout_id) {
            g_source_remove(cmd->monitor_timeout_id);
            cmd->monitor_timeout_id = 0;
        }

        /* Only report success the first time; after that, only worry about
         * failures. For this function, getting the poke back is always a
         * success. Pokes only fail if the send fails, or the response times
         * out. */
        if (!cmd->reported_success) {
            cmd->rc = PCMK_OCF_OK;
            cmd->op_status = PCMK_LRM_OP_DONE;
            report_remote_ra_result(cmd);
            cmd->reported_success = 1;
        }

        crm_debug("Remote poke event matched %s action", cmd->action);

        /* success, keep rescheduling if interval is present. */
        if (cmd->interval_ms && (cmd->cancel == FALSE)) {
            ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
            cmd->interval_id = g_timeout_add(cmd->interval_ms,
                                             recurring_helper, cmd);
            cmd = NULL;         /* prevent free */
        }
        cmd_handled = TRUE;

    } else if (op->type == lrmd_event_disconnect && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
        if (ra_data->active == TRUE && (cmd->cancel == FALSE)) {
            cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
            cmd->op_status = PCMK_LRM_OP_ERROR;
            report_remote_ra_result(cmd);
            crm_err("Remote connection to %s unexpectedly dropped during monitor",
                    lrm_state->node_name);
        }
        cmd_handled = TRUE;

    } else if (op->type == lrmd_event_new_client && pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {

        handle_remote_ra_stop(lrm_state, cmd);
        cmd_handled = TRUE;

    } else {
        crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
    }

    if (cmd_handled) {
        ra_data->cur_cmd = NULL;
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        free_cmd(cmd);
    }
}

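/*!
 * \internal
 * \brief Handle a stop of the remote connection resource
 *
 * Disconnects from the remote node, discards queued commands, and (if \p cmd
 * is given) reports the stop as successful.
 *
 * \param[in] lrm_state  LRM state of the connection to stop
 * \param[in] cmd        Stop command to report, or NULL if stop is implicit
 */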
static void
handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
{
    remote_ra_data_t *ra_data = NULL;

    CRM_ASSERT(lrm_state);
    ra_data = lrm_state->remote_ra_data;

    if (ra_data->migrate_status != takeover_complete) {
        /* delete pending ops whenever the remote connection is intentionally stopped */
        g_hash_table_remove_all(lrm_state->pending_ops);
    } else {
        /* we no longer hold the history if this connection has been migrated;
         * however, we keep the metadata cache for future use */
        lrm_state_reset_tables(lrm_state, FALSE);
    }

    ra_data->active = FALSE;
    lrm_state_disconnect(lrm_state);

    if (ra_data->cmds) {
        g_list_free_full(ra_data->cmds, free_cmd);
    }
    if (ra_data->recurring_cmds) {
        g_list_free_full(ra_data->recurring_cmds, free_cmd);
    }
    ra_data->cmds = NULL;
    ra_data->recurring_cmds = NULL;
    ra_data->cur_cmd = NULL;

    if (cmd) {
        cmd->rc = PCMK_OCF_OK;
        cmd->op_status = PCMK_LRM_OP_DONE;

        report_remote_ra_result(cmd);
    }
}

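/*!
 * \internal
 * \brief Initiate an asynchronous connection for a start or migrate_from
 *
 * The connection attempt itself is capped at MAX_START_TIMEOUT_MS, so that a
 * failed attempt can be retried (see retry_start_cmd_cb()) within the
 * action's overall timeout.
 *
 * \param[in] lrm_state   LRM state of connection to start
 * \param[in] cmd         Start command whose parameters supply address/port
 * \param[in] timeout_ms  Timeout remaining for the action
 *
 * \return Result of lrm_state_remote_connect_async() (0 on success)
 */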
static int
handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
{
    const char *server = NULL;
    lrmd_key_value_t *tmp = NULL;
    int port = 0;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;

    for (tmp = cmd->params; tmp; tmp = tmp->next) {
        if (pcmk__strcase_any_of(tmp->key, XML_RSC_ATTR_REMOTE_RA_ADDR,
                                 XML_RSC_ATTR_REMOTE_RA_SERVER, NULL)) {
            server = tmp->value;
        } else if (pcmk__str_eq(tmp->key, XML_RSC_ATTR_REMOTE_RA_PORT, pcmk__str_casei)) {
            port = atoi(tmp->value);
        } else if (pcmk__str_eq(tmp->key, CRM_META "_" XML_RSC_ATTR_CONTAINER, pcmk__str_casei)) {
            ra_data->controlling_guest = TRUE;
        }
    }

    return lrm_state_remote_connect_async(lrm_state, server, port, timeout_used);
}

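/*!
 * \internal
 * \brief Mainloop trigger that drains the queue of remote RA commands
 *
 * Executes queued commands one at a time; commands that need to wait for an
 * asynchronous event (connect, poke, takeover) become cur_cmd and are
 * completed later by remote_lrm_op_callback() or a timer callback.
 *
 * \param[in] user_data  LRM state (lrm_state_t *) of the connection
 *
 * \return TRUE (to keep the trigger source active)
 */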
static gboolean
handle_remote_ra_exec(gpointer user_data)
{
    int rc = 0;
    lrm_state_t *lrm_state = user_data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd;
    GList *first = NULL;

    if (ra_data->cur_cmd) {
        /* still waiting on previous cmd */
        return TRUE;
    }

    while (ra_data->cmds) {
        first = ra_data->cmds;
        cmd = first->data;
        if (cmd->delay_id) {
            /* still waiting for start delay timer to trip */
            return TRUE;
        }

        ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
        g_list_free_1(first);

        if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) {
            ra_data->migrate_status = 0;
            rc = handle_remote_ra_start(lrm_state, cmd, cmd->timeout);
            if (rc == 0) {
                /* take care of this later when we get async connection result */
                crm_debug("Initiated async remote connection, %s action will complete after connect event",
                          cmd->action);
                ra_data->cur_cmd = cmd;
                return TRUE;
            } else {
                crm_debug("Could not initiate remote connection for %s action",
                          cmd->action);
                cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
                cmd->op_status = PCMK_LRM_OP_ERROR;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, "monitor")) {

            if (lrm_state_is_connected(lrm_state) == TRUE) {
                rc = lrm_state_poke_connection(lrm_state);
                if (rc < 0) {
                    cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
                    cmd->op_status = PCMK_LRM_OP_ERROR;
                }
            } else {
                rc = -1;
                cmd->op_status = PCMK_LRM_OP_DONE;
                cmd->rc = PCMK_OCF_NOT_RUNNING;
            }

            if (rc == 0) {
                crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
                          cmd->rsc_id);
                ra_data->cur_cmd = cmd;
                cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd);
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, "stop")) {

            if (ra_data->migrate_status == expect_takeover) {
                /* Briefly wait on stop for the takeover event to occur. If the
                 * takeover event does not occur during the wait period, that's
                 * fine; it just means that the remote node's lrm_status section
                 * is going to get cleared, which will require all the resources
                 * running on the remote node to be explicitly re-detected via
                 * probe actions. If the takeover does occur successfully, then
                 * we can leave the status section intact. */
                cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }

            handle_remote_ra_stop(lrm_state, cmd);

        } else if (!strcmp(cmd->action, "migrate_to")) {
            ra_data->migrate_status = expect_takeover;
            cmd->rc = PCMK_OCF_OK;
            cmd->op_status = PCMK_LRM_OP_DONE;
            report_remote_ra_result(cmd);
        } else if (!strcmp(cmd->action, "reload")) {
            /* reloads are a no-op right now; add logic here when they become important */
            cmd->rc = PCMK_OCF_OK;
            cmd->op_status = PCMK_LRM_OP_DONE;
            report_remote_ra_result(cmd);
        }

        free_cmd(cmd);
    }

    return TRUE;
}

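/*!
 * \internal
 * \brief Lazily allocate remote RA data and its work trigger for a connection
 *
 * \param[in] lrm_state  LRM state of the connection to initialize
 */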
static void
remote_ra_data_init(lrm_state_t * lrm_state)
{
    remote_ra_data_t *ra_data = NULL;

    if (lrm_state->remote_ra_data) {
        return;
    }

    ra_data = calloc(1, sizeof(remote_ra_data_t));
    ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
    lrm_state->remote_ra_data = ra_data;
}

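/*!
 * \internal
 * \brief Free a connection's remote RA data, including any queued commands
 *
 * \param[in] lrm_state  LRM state of the connection to clean up
 */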
void
remote_ra_cleanup(lrm_state_t * lrm_state)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

    if (!ra_data) {
        return;
    }

    if (ra_data->cmds) {
        g_list_free_full(ra_data->cmds, free_cmd);
    }

    if (ra_data->recurring_cmds) {
        g_list_free_full(ra_data->recurring_cmds, free_cmd);
    }
    mainloop_destroy_trigger(ra_data->work);
    free(ra_data);
    lrm_state->remote_ra_data = NULL;
}

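/*!
 * \internal
 * \brief Check whether an agent (or resource ID) is the remote connection RA
 *
 * \param[in] agent     Agent name (may be NULL)
 * \param[in] provider  Agent provider (may be NULL)
 * \param[in] id        Resource ID to check against known connections
 *                      (may be NULL)
 *
 * \return TRUE if this is the ocf:pacemaker:remote agent, or if \p id names a
 *         known remote connection resource; otherwise FALSE
 */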
gboolean
is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
{
    if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
        return TRUE;
    }
    if (id && lrm_state_find(id) && !pcmk__str_eq(id, fsa_our_uname, pcmk__str_casei)) {
        return TRUE;
    }

    return FALSE;
}

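/*!
 * \internal
 * \brief Build resource information for a remote connection resource
 *
 * \param[in] lrm_state  LRM state of the requester
 * \param[in] rsc_id     ID of connection resource to describe
 *
 * \return Newly allocated resource info (caller is responsible for freeing),
 *         or NULL if \p rsc_id is not a known connection resource
 */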
lrmd_rsc_info_t *
remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
{
    lrmd_rsc_info_t *info = NULL;

    if ((lrm_state_find(rsc_id))) {
        info = calloc(1, sizeof(lrmd_rsc_info_t));

        info->id = strdup(rsc_id);
        info->type = strdup(REMOTE_LRMD_RA);
        info->standard = strdup(PCMK_RESOURCE_CLASS_OCF);
        info->provider = strdup("pacemaker");
    }

    return info;
}

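/*!
 * \internal
 * \brief Check whether an action name is supported by the remote RA
 *
 * \param[in] action  Action name to check (may be NULL)
 *
 * \return TRUE for start, stop, reload, migrate_to, migrate_from, and
 *         monitor; otherwise FALSE
 */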
static gboolean
is_remote_ra_supported_action(const char *action)
{
    if (!action) {
        return FALSE;
    } else if (strcmp(action, "start") &&
               strcmp(action, "stop") &&
               strcmp(action, "reload") &&
               strcmp(action, "migrate_to") &&
               strcmp(action, "migrate_from") && strcmp(action, "monitor")) {
        return FALSE;
    }

    return TRUE;
}

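/*!
 * \internal
 * \brief Report all recurring monitors in a command list as failed
 *
 * \param[in] list  List of remote RA commands to scan
 *
 * \return Updated list, with the failed monitor commands removed and freed
 */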
static GList *
fail_all_monitor_cmds(GList * list)
{
    GList *rm_list = NULL;
    remote_ra_cmd_t *cmd = NULL;
    GListPtr gIter = NULL;

    for (gIter = list; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms > 0) && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
            rm_list = g_list_append(rm_list, cmd);
        }
    }

    for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;

        cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
        cmd->op_status = PCMK_LRM_OP_ERROR;
        crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
                  cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
        report_remote_ra_result(cmd);

        list = g_list_remove(list, cmd);
        free_cmd(cmd);
    }

    /* frees only the list data, not the cmds */
    g_list_free(rm_list);
    return list;
}

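/*!
 * \internal
 * \brief Remove (and free) the command matching an action and interval
 *
 * \param[in] list         List of remote RA commands to search
 * \param[in] action       Action name to match
 * \param[in] interval_ms  Recurring interval to match
 *
 * \return Updated list
 */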
static GList *
remove_cmd(GList * list, const char *action, guint interval_ms)
{
    remote_ra_cmd_t *cmd = NULL;
    GListPtr gIter = NULL;

    for (gIter = list; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
            break;
        }
        cmd = NULL;
    }
    if (cmd) {
        list = g_list_remove(list, cmd);
        free_cmd(cmd);
    }
    return list;
}

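/*!
 * \internal
 * \brief Cancel a queued or in-flight remote RA action
 *
 * Queued commands are removed immediately; an in-flight command is only
 * flagged as canceled and finishes on its own.
 *
 * \param[in] lrm_state    LRM state of the requester
 * \param[in] rsc_id       ID of connection resource the action belongs to
 * \param[in] action       Action name to cancel
 * \param[in] interval_ms  Recurring interval of action to cancel
 *
 * \return 0 on success, -EINVAL if the connection resource is unknown
 */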
int
remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
                 const char *action, guint interval_ms)
{
    lrm_state_t *connection_rsc = NULL;
    remote_ra_data_t *ra_data = NULL;

    connection_rsc = lrm_state_find(rsc_id);
    if (!connection_rsc || !connection_rsc->remote_ra_data) {
        return -EINVAL;
    }

    ra_data = connection_rsc->remote_ra_data;
    ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
    ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
                                         interval_ms);
    if (ra_data->cur_cmd &&
        (ra_data->cur_cmd->interval_ms == interval_ms) &&
        (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {

        ra_data->cur_cmd->cancel = TRUE;
    }

    return 0;
}

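/*!
 * \internal
 * \brief Merge a new monitor request into a duplicate already being handled
 *
 * \param[in] ra_data      Remote RA data with the existing command queues
 * \param[in] interval_ms  Recurring interval of the requested monitor
 * \param[in] userdata     Updated user data to attach to the existing command
 *
 * \return Existing duplicate monitor command, or NULL if there is none
 */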
static remote_ra_cmd_t *
handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
                   const char *userdata)
{
    GList *gIter = NULL;
    remote_ra_cmd_t *cmd = NULL;

    /* there are 3 places a potential duplicate monitor operation
     * could exist.
     * 1. recurring_cmds list, where the op is waiting for its next interval
     * 2. cmds list, where the op is queued to get executed immediately
     * 3. cur_cmd, which means the monitor op is in flight right now.
     */
    if (interval_ms == 0) {
        return NULL;
    }

    if (ra_data->cur_cmd &&
        ra_data->cur_cmd->cancel == FALSE &&
        (ra_data->cur_cmd->interval_ms == interval_ms) &&
        pcmk__str_eq(ra_data->cur_cmd->action, "monitor", pcmk__str_casei)) {

        cmd = ra_data->cur_cmd;
        goto handle_dup;
    }

    for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    return NULL;

handle_dup:

    crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
              cmd->rsc_id, "monitor", interval_ms);

    /* update the userdata */
    if (userdata) {
       free(cmd->userdata);
       cmd->userdata = strdup(userdata);
    }

    /* if we've already reported success, generate a new call id */
    if (cmd->reported_success) {
        cmd->start_time = time(NULL);
        cmd->call_id = generate_callid();
        cmd->reported_success = 0;
    }

    /* if we have an interval_id set, that means we are in the process of
     * waiting for this cmd's next interval. instead of waiting, cancel
     * the timer and execute the action immediately */
    if (cmd->interval_id) {
        g_source_remove(cmd->interval_id);
        cmd->interval_id = 0;
        recurring_helper(cmd);
    }

    return cmd;
}

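/*!
 * \internal
 * \brief Request execution of a remote RA action
 *
 * Duplicate recurring monitors are merged into the existing command;
 * otherwise the action is queued and the work trigger is set.
 *
 * \param[in] lrm_state    LRM state of the node requesting the action
 * \param[in] rsc_id       ID of the connection resource
 * \param[in] action       Action to execute
 * \param[in] userdata     String to pass back with the result
 * \param[in] interval_ms  Recurring interval (0 for one-shot actions)
 * \param[in] timeout      Action timeout (ms)
 * \param[in] start_delay  Delay before execution (ms)
 * \param[in] params       Action parameters (consumed by this call)
 *
 * \return Call ID of the queued action, or -EINVAL on error
 */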
int
remote_ra_exec(lrm_state_t *lrm_state, const char *rsc_id, const char *action,
               const char *userdata, guint interval_ms,
               int timeout,     /* ms */
               int start_delay, /* ms */
               lrmd_key_value_t * params)
{
    int rc = 0;
    lrm_state_t *connection_rsc = NULL;
    remote_ra_cmd_t *cmd = NULL;
    remote_ra_data_t *ra_data = NULL;

    if (is_remote_ra_supported_action(action) == FALSE) {
        rc = -EINVAL;
        goto exec_done;
    }

    connection_rsc = lrm_state_find(rsc_id);
    if (!connection_rsc) {
        rc = -EINVAL;
        goto exec_done;
    }

    remote_ra_data_init(connection_rsc);
    ra_data = connection_rsc->remote_ra_data;

    cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
    if (cmd) {
        rc = cmd->call_id;
        goto exec_done;
    }

    cmd = calloc(1, sizeof(remote_ra_cmd_t));
    cmd->owner = strdup(lrm_state->node_name);
    cmd->rsc_id = strdup(rsc_id);
    cmd->action = strdup(action);
    cmd->userdata = strdup(userdata);
    cmd->interval_ms = interval_ms;
    cmd->timeout = timeout;
    cmd->start_delay = start_delay;
    cmd->params = params;
    cmd->start_time = time(NULL);

    cmd->call_id = generate_callid();

    if (cmd->start_delay) {
        cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
    }

    ra_data->cmds = g_list_append(ra_data->cmds, cmd);
    mainloop_set_trigger(ra_data->work);

    return cmd->call_id;
  exec_done:

    lrmd_key_value_freeall(params);
    return rc;
}

/*!
 * \internal
 * \brief Immediately fail all monitors of a remote node, if proxied here
 *
 * \param[in] node_name  Name of pacemaker_remote node
 */
void
remote_ra_fail(const char *node_name)
{
    lrm_state_t *lrm_state = lrm_state_find(node_name);

    if (lrm_state && lrm_state_is_connected(lrm_state)) {
        remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

        crm_info("Failing monitors on pacemaker_remote node %s", node_name);
        ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
        ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
    }
}

/* A guest node fencing implied by host fencing looks like:
 *
 *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
 *                on_node="lxc1" on_node_uuid="lxc1">
 *     <attributes CRM_meta_master_lxc_ms="10" CRM_meta_on_node="lxc1"
 *                 CRM_meta_on_node_uuid="lxc1" CRM_meta_stonith_action="off"
 *                 crm_feature_set="3.0.12"/>
 *     <downed>
 *       <node id="lxc1"/>
 *     </downed>
 *  </pseudo_event>
 */
#define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
    "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
    "/" XML_CIB_TAG_NODE

/*!
 * \internal
 * \brief Check a pseudo-action for Pacemaker Remote node side effects
 *
 * \param[in] xml  XML of pseudo-action to check
 */
void
remote_ra_process_pseudo(xmlNode *xml)
{
    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);

    if (numXpathResults(search) == 1) {
        xmlNode *result = getXpathResult(search, 0);

        /* Normally, we handle the necessary side effects of a guest node stop
         * action when reporting the remote agent's result. However, if the stop
         * is implied due to fencing, it will be a fencing pseudo-event, and
         * there won't be a result to report. Handle that case here.
         *
         * This will result in a duplicate call to remote_node_down() if the
         * guest stop was real instead of implied, but that shouldn't hurt.
         *
         * There is still one corner case that isn't handled: if a guest node
         * isn't running any resources when its host is fenced, it will appear
         * to be cleanly stopped, so there will be no pseudo-fence, and our
         * peer cache state will be incorrect unless and until the guest is
         * recovered.
         */
        if (result) {
            const char *remote = ID(result);

            if (remote) {
                remote_node_down(remote, DOWN_ERASE_LRM);
            }
        }
    }
    freeXpathObject(search);
}

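/*!
 * \internal
 * \brief Record a remote node's maintenance state in the CIB
 *
 * \param[in] lrm_state    LRM state of the remote node's connection
 * \param[in] maintenance  TRUE if the node is entering maintenance mode
 */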
static void
remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    xmlNode *update, *state;
    int call_opt, call_id = 0;
    crm_node_t *node;

    call_opt = crmd_cib_smart_opt();
    node = crm_remote_peer_get(lrm_state->node_name);
    CRM_CHECK(node != NULL, return);
    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    state = create_node_state_update(node, node_update_none, update,
                                     __func__);
    crm_xml_add(state, XML_NODE_IS_MAINTENANCE, maintenance?"1":"0");
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_WARNING, "%s CIB node state update failed", lrm_state->node_name);
    } else {
        /* TODO: still not 100% sure that async update will succeed ... */
        ra_data->is_maintenance = maintenance;
    }
    free_xml(update);
}

#define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
    "[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \
    XML_GRAPH_TAG_MAINTENANCE

/*!
 * \internal
 * \brief Check a pseudo-action holding updates for maintenance state
 *
 * \param[in] xml  XML of pseudo-action to check
 */
void
remote_ra_process_maintenance_nodes(xmlNode *xml)
{
    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);

    if (numXpathResults(search) == 1) {
        xmlNode *node;
        int cnt = 0, cnt_remote = 0;

        for (node =
                first_named_child(getXpathResult(search, 0), XML_CIB_TAG_NODE);
            node != NULL; node = pcmk__xml_next(node)) {
            lrm_state_t *lrm_state = lrm_state_find(ID(node));

            cnt++;
            if (lrm_state && lrm_state->remote_ra_data &&
                ((remote_ra_data_t *) lrm_state->remote_ra_data)->active) {
                cnt_remote++;
                remote_ra_maintenance(lrm_state,
                                        crm_atoi(crm_element_value(node,
                                            XML_NODE_IS_MAINTENANCE), "0"));

            }
        }
        crm_trace("Action holds %d nodes (%d remotes found), "
                  "adjusting maintenance-mode", cnt, cnt_remote);
    }
    freeXpathObject(search);
}

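/*!
 * \internal
 * \brief Check whether a remote node is in maintenance mode
 *
 * \param[in] lrm_state  LRM state of the remote node's connection
 *
 * \return TRUE if the node is in maintenance mode, otherwise FALSE
 */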
gboolean
remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

    return ra_data->is_maintenance;
}

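/*!
 * \internal
 * \brief Check whether this connection controls a guest node
 *
 * \param[in] lrm_state  LRM state of the connection
 *
 * \return TRUE if the connection resource has a container meta-attribute
 *         (i.e. the remote is a guest node), otherwise FALSE
 */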
gboolean
remote_ra_controlling_guest(lrm_state_t * lrm_state)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

    return ra_data->controlling_guest;
}
