root/crmd/remote_lrmd_ra.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes following definitions.
  1. free_cmd
  2. generate_callid
  3. recurring_helper
  4. start_delay_helper
  5. remote_node_up
  6. remote_node_down
  7. check_remote_node_state
  8. report_remote_ra_result
  9. update_remaining_timeout
  10. retry_start_cmd_cb
  11. connection_takeover_timeout_cb
  12. monitor_timeout_cb
  13. synthesize_lrmd_success
  14. remote_lrm_op_callback
  15. handle_remote_ra_stop
  16. handle_remote_ra_start
  17. handle_remote_ra_exec
  18. remote_ra_data_init
  19. remote_ra_cleanup
  20. is_remote_lrmd_ra
  21. remote_ra_get_rsc_info
  22. is_remote_ra_supported_action
  23. fail_all_monitor_cmds
  24. remove_cmd
  25. remote_ra_cancel
  26. handle_dup_monitor
  27. remote_ra_exec
  28. remote_ra_fail
  29. remote_ra_process_pseudo
  30. remote_ra_maintenance
  31. remote_ra_process_maintenance_nodes
  32. remote_ra_is_in_maintenance

   1 /* 
   2  * Copyright (C) 2013 David Vossel <davidvossel@gmail.com>
   3  * 
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  * 
   9  * This software is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  * 
  14  * You should have received a copy of the GNU General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 
  19 #include <crm_internal.h>
  20 #include <crm/crm.h>
  21 #include <crm/msg_xml.h>
  22 
  23 #include <crmd.h>
  24 #include <crmd_fsa.h>
  25 #include <crmd_messages.h>
  26 #include <crmd_callbacks.h>
  27 #include <crmd_lrm.h>
  28 #include <crm/lrmd.h>
  29 #include <crm/services.h>
  30 
  31 #define REMOTE_LRMD_RA "remote"
  32 
  33 /* The max start timeout before cmd retry */
  34 #define MAX_START_TIMEOUT_MS 10000
  35 
/*!
 * \internal
 * \brief One queued or in-flight action against a remote connection resource
 *
 * Instances are queued on a remote_ra_data_t and released with free_cmd(),
 * which also cancels any timers still referenced by the *_id fields below.
 */
typedef struct remote_ra_cmd_s {
    /*! the local node the cmd is issued from */
    char *owner;
    /*! the remote node the cmd is executed on */
    char *rsc_id;
    /*! the action to execute */
    char *action;
    /*! some string the client wants us to give it back */
    char *userdata;
    char *exit_reason;          // descriptive text on error
    /*! start delay in ms */
    int start_delay;
    /*! timer id used for start delay. */
    int delay_id;
    /*! timeout in ms for cmd */
    int timeout;
    /*! ms left of 'timeout'; refreshed by update_remaining_timeout() */
    int remaining_timeout;
    /*! recurring interval in ms */
    int interval;
    /*! interval timer id */
    int interval_id;
    /*! nonzero once a successful monitor result has been reported */
    int reported_success;
    /*! timer id bounding the wait for an async poke (monitor) response */
    int monitor_timeout_id;
    /*! timer id bounding the wait for a connection takeover during stop */
    int takeover_timeout_id;
    /*! action parameters */
    lrmd_key_value_t *params;
    /*! executed rc (OCF exit code) */
    int rc;
    /*! LRM operation status (PCMK_LRM_OP_*) */
    int op_status;
    /*! call ID assigned to this action (see generate_callid()) */
    int call_id;
    /*! when execution of the action began (seconds since epoch) */
    time_t start_time;
    /*! TRUE if the action has been cancelled */
    gboolean cancel;
} remote_ra_cmd_t;
  69 
/* Stage of a remote connection migration, as tracked on the source host */
enum remote_migration_status {
    expect_takeover = 1,    /* migrate_to succeeded; another node should connect */
    takeover_complete,      /* another client connected; takeover has happened */
};
  74 
/*!
 * \internal
 * \brief Per-connection bookkeeping for a remote (pacemaker_remote) resource
 */
typedef struct remote_ra_data_s {
    crm_trigger_t *work;        /* mainloop trigger that drives command execution */
    remote_ra_cmd_t *cur_cmd;   /* command currently awaiting an async event, if any */
    GList *cmds;                /* commands waiting to be executed */
    GList *recurring_cmds;      /* recurring monitors waiting on their interval timer */

    enum remote_migration_status migrate_status;

    gboolean active;            /* TRUE while the remote connection is established */
    gboolean is_maintenance; /* kind of complex to determine from crmd-context
                              * so we have it signalled back with the
                              * transition from pengine
                              */
} remote_ra_data_t;
  89 
  90 static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
  91 static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
  92 static GList *fail_all_monitor_cmds(GList * list);
  93 
  94 static void
  95 free_cmd(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  96 {
  97     remote_ra_cmd_t *cmd = user_data;
  98 
  99     if (!cmd) {
 100         return;
 101     }
 102     if (cmd->delay_id) {
 103         g_source_remove(cmd->delay_id);
 104     }
 105     if (cmd->interval_id) {
 106         g_source_remove(cmd->interval_id);
 107     }
 108     if (cmd->monitor_timeout_id) {
 109         g_source_remove(cmd->monitor_timeout_id);
 110     }
 111     if (cmd->takeover_timeout_id) {
 112         g_source_remove(cmd->takeover_timeout_id);
 113     }
 114     free(cmd->owner);
 115     free(cmd->rsc_id);
 116     free(cmd->action);
 117     free(cmd->userdata);
 118     free(cmd->exit_reason);
 119     lrmd_key_value_freeall(cmd->params);
 120     free(cmd);
 121 }
 122 
 123 static int
 124 generate_callid(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 125 {
 126     static int remote_ra_callid = 0;
 127 
 128     remote_ra_callid++;
 129     if (remote_ra_callid <= 0) {
 130         remote_ra_callid = 1;
 131     }
 132 
 133     return remote_ra_callid;
 134 }
 135 
 136 static gboolean
 137 recurring_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 138 {
 139     remote_ra_cmd_t *cmd = data;
 140     lrm_state_t *connection_rsc = NULL;
 141 
 142     cmd->interval_id = 0;
 143     connection_rsc = lrm_state_find(cmd->rsc_id);
 144     if (connection_rsc && connection_rsc->remote_ra_data) {
 145         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 146 
 147         ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
 148 
 149         ra_data->cmds = g_list_append(ra_data->cmds, cmd);
 150         mainloop_set_trigger(ra_data->work);
 151     }
 152     return FALSE;
 153 }
 154 
 155 static gboolean
 156 start_delay_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 157 {
 158     remote_ra_cmd_t *cmd = data;
 159     lrm_state_t *connection_rsc = NULL;
 160 
 161     cmd->delay_id = 0;
 162     connection_rsc = lrm_state_find(cmd->rsc_id);
 163     if (connection_rsc && connection_rsc->remote_ra_data) {
 164         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 165 
 166         mainloop_set_trigger(ra_data->work);
 167     }
 168     return FALSE;
 169 }
 170 
/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node joining
 *
 * Clears the node's stale status, marks it a member in the remote peer
 * cache, notifies the DC, and pushes a node-state entry into the CIB.
 *
 * \param[in] node_name  Name of newly integrated pacemaker_remote node
 */
static void
remote_node_up(const char *node_name)
{
    int call_opt, call_id = 0;
    xmlNode *update, *state;
    crm_node_t *node;

    CRM_CHECK(node_name != NULL, return);
    crm_info("Announcing pacemaker_remote node %s", node_name);

    /* Clear node's operation history. The node's transient attributes should
     * and normally will be cleared when the node leaves, but since remote node
     * state has a number of corner cases, clear them here as well, to be sure.
     */
    call_opt = crmd_cib_smart_opt();
    erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
    erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);

    /* Clear node's probed attribute */
    update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);

    /* Ensure node is in the remote peer cache with member status */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, 0);

    /* pacemaker_remote nodes don't participate in the membership layer,
     * so cluster nodes don't automatically get notified when they come and go.
     * We send a cluster message to the DC, and update the CIB node state entry,
     * so the DC will get it sooner (via message) or later (via CIB refresh),
     * and any other interested parties can query the CIB.
     */
    send_remote_state_message(node_name, TRUE);

    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    state = create_node_state_update(node, node_update_cluster, update,
                                     __FUNCTION__);

    /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever
     * needs to be fenced, this flag will allow various actions to determine
     * whether the fencing has happened yet.
     */
    crm_xml_add(state, XML_NODE_IS_FENCED, "0");

    /* TODO: If the remote connection drops, and this (async) CIB update either
     * failed or has not yet completed, later actions could mistakenly think the
     * node has already been fenced (if the XML_NODE_IS_FENCED attribute was
     * previously set, because it won't have been cleared). This could prevent
     * actual fencing or allow recurring monitor failures to be cleared too
     * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
     */
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    /* NOTE(review): call_id is initialized to 0 and only checked here, so
     * fsa_cib_update presumably is a macro that writes the CIB call ID back
     * into call_id — confirm; otherwise this check can never trigger. */
    if (call_id < 0) {
        crm_perror(LOG_WARNING, "%s CIB node state setup", node_name);
    }
    free_xml(update);
}
 234 
/* Whether remote_node_down() should also erase the node's resource (LRM)
 * operation history from the CIB */
enum down_opts {
    DOWN_KEEP_LRM,      /* keep operation history (normal node loss) */
    DOWN_ERASE_LRM      /* erase it (e.g. after a successful fence) */
};

/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node leaving
 *
 * Purges the node's attributes, marks it lost in the remote peer cache,
 * notifies the DC, and updates the node-state entry in the CIB.
 *
 * \param[in] node_name  Name of lost node
 * \param[in] opts       Whether to keep or erase LRM history
 */
static void
remote_node_down(const char *node_name, const enum down_opts opts)
{
    xmlNode *update;
    int call_id = 0;
    int call_opt = crmd_cib_smart_opt();
    crm_node_t *node;

    /* Purge node from attrd's memory */
    update_attrd_remote_node_removed(node_name, NULL);

    /* Purge node's transient attributes */
    erase_status_tag(node_name, XML_TAG_TRANSIENT_NODEATTRS, call_opt);

    /* Normally, the LRM operation history should be kept until the node comes
     * back up. However, after a successful fence, we want to clear it, so we
     * don't think resources are still running on the node.
     */
    if (opts == DOWN_ERASE_LRM) {
        erase_status_tag(node_name, XML_CIB_TAG_LRM, call_opt);
    }

    /* Ensure node is in the remote peer cache with lost state */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    crm_update_peer_state(__FUNCTION__, node, CRM_NODE_LOST, 0);

    /* Notify DC */
    send_remote_state_message(node_name, FALSE);

    /* Update CIB node state */
    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    create_node_state_update(node, node_update_cluster, update, __FUNCTION__);
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    /* NOTE(review): as in remote_node_up(), call_id is presumably written
     * back by the fsa_cib_update macro — confirm. */
    if (call_id < 0) {
        crm_perror(LOG_ERR, "%s CIB node state update", node_name);
    }
    free_xml(update);
}
 286 
/*!
 * \internal
 * \brief Handle effects of a remote RA command on node state
 *
 * Successful start/migrate_from/stop actions change the remote node's
 * membership bookkeeping (peer cache, CIB node state) as a side effect.
 *
 * \param[in] cmd  Completed remote RA command
 */
static void
check_remote_node_state(remote_ra_cmd_t *cmd)
{
    /* Only successful actions can change node state */
    if (cmd->rc != PCMK_OCF_OK) {
        return;
    }

    if (safe_str_eq(cmd->action, "start")) {
        /* Node is newly integrated: announce it everywhere */
        remote_node_up(cmd->rsc_id);

    } else if (safe_str_eq(cmd->action, "migrate_from")) {
        /* After a successful migration, we don't need to do remote_node_up()
         * because the DC already knows the node is up, and we don't want to
         * clear LRM history etc. We do need to add the remote node to this
         * host's remote peer cache, because (unless it happens to be DC)
         * it hasn't been tracking the remote node, and other code relies on
         * the cache to distinguish remote nodes from unseen cluster nodes.
         */
        crm_node_t *node = crm_remote_peer_get(cmd->rsc_id);

        CRM_CHECK(node != NULL, return);
        crm_update_peer_state(__FUNCTION__, node, CRM_NODE_MEMBER, 0);

    } else if (safe_str_eq(cmd->action, "stop")) {
        lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
        remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;

        if (ra_data) {
            if (ra_data->migrate_status != takeover_complete) {
                /* Stop means down if we didn't successfully migrate elsewhere */
                remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
            } else if (AM_I_DC == FALSE) {
                /* Only the connection host and DC track node state,
                 * so if the connection migrated elsewhere and we aren't DC,
                 * un-cache the node, so we don't have stale info
                 */
                crm_remote_peer_cache_remove(cmd->rsc_id);
            }
        }
    }

    /* We don't do anything for successful monitors, which is correct for
     * routine recurring monitors, and for monitors on nodes where the
     * connection isn't supposed to be (the cluster will stop the connection in
     * that case). However, if the initial probe finds the connection already
     * active on the node where we want it, we probably should do
     * remote_node_up(). Unfortunately, we can't distinguish that case here.
     * Given that connections have to be initiated by the cluster, the chance of
     * that should be close to zero.
     */
}
 345 
/*!
 * \internal
 * \brief Report a completed remote RA action back to the controller
 *
 * Updates remote node state for successful actions (via
 * check_remote_node_state()), then builds a stack-local lrmd_event_data_t
 * from \p cmd and delivers it through lrm_op_callback(). The command itself
 * is only read here, never freed.
 *
 * \param[in] cmd  Completed command to report
 */
static void
report_remote_ra_result(remote_ra_cmd_t * cmd)
{
    lrmd_event_data_t op = { 0, };

    check_remote_node_state(cmd);

    op.type = lrmd_event_exec_complete;
    op.rsc_id = cmd->rsc_id;
    op.op_type = cmd->action;
    op.user_data = cmd->userdata;
    op.exit_reason = cmd->exit_reason;
    op.timeout = cmd->timeout;
    op.interval = cmd->interval;
    op.rc = cmd->rc;
    op.op_status = cmd->op_status;
    op.t_run = cmd->start_time;
    op.t_rcchange = cmd->start_time;
    /* A failure after an earlier reported success is a genuine rc change */
    if (cmd->reported_success && cmd->rc != PCMK_OCF_OK) {
        op.t_rcchange = time(NULL);
        /* This edge case will likely never ever occur, but if it does the
         * result is that a failure will not be processed correctly. This is only
         * remotely possible because we are able to detect a connection resource's tcp
         * connection has failed at any moment after start has completed. The actual
         * recurring operation is just a connectivity ping.
         *
         * basically, we are not guaranteed that the first successful monitor op and
         * a subsequent failed monitor op will not occur in the same timestamp. We have to
         * make it look like the operations occurred at separate times though. */
        if (op.t_rcchange == op.t_run) {
            op.t_rcchange++;
        }
    }

    if (cmd->params) {
        lrmd_key_value_t *tmp;

        /* Copy action parameters into a hash table for the event.
         * NOTE(review): crm_str_table_new() presumably registers free
         * functions so g_hash_table_destroy() below releases the strdup'd
         * keys/values — confirm, otherwise this would leak. */
        op.params = crm_str_table_new();
        for (tmp = cmd->params; tmp; tmp = tmp->next) {
            g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value));
        }

    }
    op.call_id = cmd->call_id;
    op.remote_nodename = cmd->owner;

    lrm_op_callback(&op);

    if (op.params) {
        g_hash_table_destroy(op.params);
    }
}
 398 
 399 static void
 400 update_remaining_timeout(remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 401 {
 402     cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
 403 }
 404 
 405 static gboolean
 406 retry_start_cmd_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 407 {
 408     lrm_state_t *lrm_state = data;
 409     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 410     remote_ra_cmd_t *cmd = NULL;
 411     int rc = -1;
 412 
 413     if (!ra_data || !ra_data->cur_cmd) {
 414         return FALSE;
 415     }
 416     cmd = ra_data->cur_cmd;
 417     if (safe_str_neq(cmd->action, "start") && safe_str_neq(cmd->action, "migrate_from")) {
 418         return FALSE;
 419     }
 420     update_remaining_timeout(cmd);
 421 
 422     if (cmd->remaining_timeout > 0) {
 423         rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
 424     }
 425 
 426     if (rc != 0) {
 427         cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
 428         cmd->op_status = PCMK_LRM_OP_ERROR;
 429         report_remote_ra_result(cmd);
 430 
 431         if (ra_data->cmds) {
 432             mainloop_set_trigger(ra_data->work);
 433         }
 434         ra_data->cur_cmd = NULL;
 435         free_cmd(cmd);
 436     } else {
 437         /* wait for connection event */
 438     }
 439 
 440     return FALSE;
 441 }
 442 
 443 
 444 static gboolean
 445 connection_takeover_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 446 {
 447     lrm_state_t *lrm_state = NULL;
 448     remote_ra_cmd_t *cmd = data;
 449 
 450     crm_info("takeover event timed out for node %s", cmd->rsc_id);
 451     cmd->takeover_timeout_id = 0;
 452 
 453     lrm_state = lrm_state_find(cmd->rsc_id);
 454 
 455     handle_remote_ra_stop(lrm_state, cmd);
 456     free_cmd(cmd);
 457 
 458     return FALSE;
 459 }
 460 
 461 static gboolean
 462 monitor_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 463 {
 464     lrm_state_t *lrm_state = NULL;
 465     remote_ra_cmd_t *cmd = data;
 466 
 467     lrm_state = lrm_state_find(cmd->rsc_id);
 468 
 469     crm_info("Poke async response timed out for node %s (%p)", cmd->rsc_id, lrm_state);
 470     cmd->monitor_timeout_id = 0;
 471     cmd->op_status = PCMK_LRM_OP_TIMEOUT;
 472     cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
 473 
 474     if (lrm_state && lrm_state->remote_ra_data) {
 475         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 476 
 477         if (ra_data->cur_cmd == cmd) {
 478             ra_data->cur_cmd = NULL;
 479         }
 480         if (ra_data->cmds) {
 481             mainloop_set_trigger(ra_data->work);
 482         }
 483     }
 484 
 485     report_remote_ra_result(cmd);
 486     free_cmd(cmd);
 487 
 488     if(lrm_state) {
 489         lrm_state_disconnect(lrm_state);
 490     }
 491     return FALSE;
 492 }
 493 
 494 static void
 495 synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
     /* [previous][next][first][last][top][bottom][index][help] */
 496 {
 497     lrmd_event_data_t op = { 0, };
 498 
 499     if (lrm_state == NULL) {
 500         /* if lrm_state not given assume local */
 501         lrm_state = lrm_state_find(fsa_our_uname);
 502     }
 503     CRM_ASSERT(lrm_state != NULL);
 504 
 505     op.type = lrmd_event_exec_complete;
 506     op.rsc_id = rsc_id;
 507     op.op_type = op_type;
 508     op.rc = PCMK_OCF_OK;
 509     op.op_status = PCMK_LRM_OP_DONE;
 510     op.t_run = time(NULL);
 511     op.t_rcchange = op.t_run;
 512     op.call_id = generate_callid();
 513     process_lrm_event(lrm_state, &op, NULL);
 514 }
 515 
 516 void
 517 remote_lrm_op_callback(lrmd_event_data_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
 518 {
 519     gboolean cmd_handled = FALSE;
 520     lrm_state_t *lrm_state = NULL;
 521     remote_ra_data_t *ra_data = NULL;
 522     remote_ra_cmd_t *cmd = NULL;
 523 
 524     crm_debug("remote connection event - event_type:%s node:%s action:%s rc:%s op_status:%s",
 525               lrmd_event_type2str(op->type),
 526               op->remote_nodename,
 527               op->op_type ? op->op_type : "none",
 528               services_ocf_exitcode_str(op->rc), services_lrm_status_str(op->op_status));
 529 
 530     lrm_state = lrm_state_find(op->remote_nodename);
 531     if (!lrm_state || !lrm_state->remote_ra_data) {
 532         crm_debug("lrm_state info not found for remote lrmd connection event");
 533         return;
 534     }
 535     ra_data = lrm_state->remote_ra_data;
 536 
 537     /* Another client has connected to the remote daemon,
 538      * determine if this is expected. */
 539     if (op->type == lrmd_event_new_client) {
 540         /* great, we new this was coming */
 541         if (ra_data->migrate_status == expect_takeover) {
 542             ra_data->migrate_status = takeover_complete;
 543         } else {
 544             crm_err("Unexpected pacemaker_remote client takeover for %s. Disconnecting", op->remote_nodename);
 545             /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
 546             /* Do not free lrm_state->conn yet. */
 547             /* It'll be freed in the following stop action. */
 548             lrm_state_disconnect_only(lrm_state);
 549         }
 550         return;
 551     }
 552 
 553     /* filter all EXEC events up */
 554     if (op->type == lrmd_event_exec_complete) {
 555         if (ra_data->migrate_status == takeover_complete) {
 556             crm_debug("ignoring event, this connection is taken over by another node");
 557         } else {
 558             lrm_op_callback(op);
 559         }
 560         return;
 561     }
 562 
 563     if ((op->type == lrmd_event_disconnect) &&
 564         (ra_data->cur_cmd == NULL) &&
 565         (ra_data->active == TRUE)) {
 566 
 567         if (!remote_ra_is_in_maintenance(lrm_state)) {
 568             crm_err("Unexpected disconnect on remote-node %s", lrm_state->node_name);
 569             ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
 570             ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
 571         } else {
 572             crm_notice("Disconnect on unmanaged remote-node %s", lrm_state->node_name);
 573             /* Do roughly what a 'stop' on the remote-resource would do */
 574             handle_remote_ra_stop(lrm_state, NULL);
 575             remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
 576             /* now fake the reply of a successful 'stop' */
 577             synthesize_lrmd_success(NULL, lrm_state->node_name, "stop");
 578         }
 579         return;
 580     }
 581 
 582     if (!ra_data->cur_cmd) {
 583         crm_debug("no event to match");
 584         return;
 585     }
 586 
 587     cmd = ra_data->cur_cmd;
 588 
 589     /* Start actions and migrate from actions complete after connection
 590      * comes back to us. */
 591     if (op->type == lrmd_event_connect && (safe_str_eq(cmd->action, "start") ||
 592                                            safe_str_eq(cmd->action, "migrate_from"))) {
 593 
 594         if (op->connection_rc < 0) {
 595             update_remaining_timeout(cmd);
 596 
 597             if (op->connection_rc == -ENOKEY) {
 598                 // Hard error, don't retry
 599                 cmd->op_status = PCMK_LRM_OP_ERROR;
 600                 cmd->rc = PCMK_OCF_INVALID_PARAM;
 601                 cmd->exit_reason = strdup("Authentication key not readable");
 602 
 603             } else if (cmd->remaining_timeout > 3000) {
 604                 crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
 605                 g_timeout_add(1000, retry_start_cmd_cb, lrm_state);
 606                 return;
 607 
 608             } else {
 609                 crm_trace("can't reschedule start, remaining timeout too small %d",
 610                           cmd->remaining_timeout);
 611                 cmd->op_status = PCMK_LRM_OP_TIMEOUT;
 612                 cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
 613             }
 614 
 615         } else {
 616             lrm_state_reset_tables(lrm_state, TRUE);
 617             cmd->rc = PCMK_OCF_OK;
 618             cmd->op_status = PCMK_LRM_OP_DONE;
 619             ra_data->active = TRUE;
 620         }
 621 
 622         crm_debug("remote lrmd connect event matched %s action. ", cmd->action);
 623         report_remote_ra_result(cmd);
 624         cmd_handled = TRUE;
 625 
 626     } else if (op->type == lrmd_event_poke && safe_str_eq(cmd->action, "monitor")) {
 627 
 628         if (cmd->monitor_timeout_id) {
 629             g_source_remove(cmd->monitor_timeout_id);
 630             cmd->monitor_timeout_id = 0;
 631         }
 632 
 633         /* Only report success the first time, after that only worry about failures.
 634          * For this function, if we get the poke pack, it is always a success. Pokes
 635          * only fail if the send fails, or the response times out. */
 636         if (!cmd->reported_success) {
 637             cmd->rc = PCMK_OCF_OK;
 638             cmd->op_status = PCMK_LRM_OP_DONE;
 639             report_remote_ra_result(cmd);
 640             cmd->reported_success = 1;
 641         }
 642 
 643         crm_debug("remote lrmd poke event matched %s action. ", cmd->action);
 644 
 645         /* success, keep rescheduling if interval is present. */
 646         if (cmd->interval && (cmd->cancel == FALSE)) {
 647             ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
 648             cmd->interval_id = g_timeout_add(cmd->interval, recurring_helper, cmd);
 649             cmd = NULL;         /* prevent free */
 650         }
 651         cmd_handled = TRUE;
 652 
 653     } else if (op->type == lrmd_event_disconnect && safe_str_eq(cmd->action, "monitor")) {
 654         if (ra_data->active == TRUE && (cmd->cancel == FALSE)) {
 655             cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
 656             cmd->op_status = PCMK_LRM_OP_ERROR;
 657             report_remote_ra_result(cmd);
 658             crm_err("remote-node %s unexpectedly disconneced during monitor operation", lrm_state->node_name);
 659         }
 660         cmd_handled = TRUE;
 661 
 662     } else if (op->type == lrmd_event_new_client && safe_str_eq(cmd->action, "stop")) {
 663 
 664         handle_remote_ra_stop(lrm_state, cmd);
 665         cmd_handled = TRUE;
 666 
 667     } else {
 668         crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
 669     }
 670 
 671     if (cmd_handled) {
 672         ra_data->cur_cmd = NULL;
 673         if (ra_data->cmds) {
 674             mainloop_set_trigger(ra_data->work);
 675         }
 676         free_cmd(cmd);
 677     }
 678 }
 679 
/*!
 * \internal
 * \brief Handle a stop of the remote connection resource
 *
 * Disconnects from the remote node, drops all queued and recurring commands,
 * and (if \p cmd is given) reports a successful stop result.
 *
 * \param[in] lrm_state  Connection state (must not be NULL)
 * \param[in] cmd        Stop command to report on, or NULL when synthesizing
 *                       a stop (e.g. disconnect of an unmanaged node)
 */
static void
handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
{
    remote_ra_data_t *ra_data = NULL;

    CRM_ASSERT(lrm_state);
    ra_data = lrm_state->remote_ra_data;

    if (ra_data->migrate_status != takeover_complete) {
        /* delete pending ops when ever the remote connection is intentionally stopped */
        g_hash_table_remove_all(lrm_state->pending_ops);
    } else {
        /* we no longer hold the history if this connection has been migrated,
         * however, we keep metadata cache for future use */
        lrm_state_reset_tables(lrm_state, FALSE);
    }

    ra_data->active = FALSE;
    lrm_state_disconnect(lrm_state);

    /* Discard everything still queued against this connection */
    if (ra_data->cmds) {
        g_list_free_full(ra_data->cmds, free_cmd);
    }
    if (ra_data->recurring_cmds) {
        g_list_free_full(ra_data->recurring_cmds, free_cmd);
    }
    ra_data->cmds = NULL;
    ra_data->recurring_cmds = NULL;
    ra_data->cur_cmd = NULL;

    /* cmd may be NULL when the stop is synthesized rather than requested */
    if (cmd) {
        cmd->rc = PCMK_OCF_OK;
        cmd->op_status = PCMK_LRM_OP_DONE;

        report_remote_ra_result(cmd);
    }
}
 717 
 718 static int
 719 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
 720 {
 721     const char *server = NULL;
 722     lrmd_key_value_t *tmp = NULL;
 723     int port = 0;
 724     int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
 725 
 726     for (tmp = cmd->params; tmp; tmp = tmp->next) {
 727         if (safe_str_eq(tmp->key, "addr") || safe_str_eq(tmp->key, "server")) {
 728             server = tmp->value;
 729         }
 730         if (safe_str_eq(tmp->key, "port")) {
 731             port = atoi(tmp->value);
 732         }
 733     }
 734 
 735     return lrm_state_remote_connect_async(lrm_state, server, port, timeout_used);
 736 }
 737 
 738 static gboolean
 739 handle_remote_ra_exec(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 740 {
 741     int rc = 0;
 742     lrm_state_t *lrm_state = user_data;
 743     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 744     remote_ra_cmd_t *cmd;
 745     GList *first = NULL;
 746 
 747     if (ra_data->cur_cmd) {
 748         /* still waiting on previous cmd */
 749         return TRUE;
 750     }
 751 
 752     while (ra_data->cmds) {
 753         first = ra_data->cmds;
 754         cmd = first->data;
 755         if (cmd->delay_id) {
 756             /* still waiting for start delay timer to trip */
 757             return TRUE;
 758         }
 759 
 760         ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
 761         g_list_free_1(first);
 762 
 763         if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) {
 764             ra_data->migrate_status = 0;
 765             rc = handle_remote_ra_start(lrm_state, cmd, cmd->timeout);
 766             if (rc == 0) {
 767                 /* take care of this later when we get async connection result */
 768                 crm_debug("began remote lrmd connect, waiting for connect event.");
 769                 ra_data->cur_cmd = cmd;
 770                 return TRUE;
 771             } else {
 772                 crm_debug("connect failed, not expecting to match any connection event later");
 773                 cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
 774                 cmd->op_status = PCMK_LRM_OP_ERROR;
 775             }
 776             report_remote_ra_result(cmd);
 777 
 778         } else if (!strcmp(cmd->action, "monitor")) {
 779 
 780             if (lrm_state_is_connected(lrm_state) == TRUE) {
 781                 rc = lrm_state_poke_connection(lrm_state);
 782                 if (rc < 0) {
 783                     cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
 784                     cmd->op_status = PCMK_LRM_OP_ERROR;
 785                 }
 786             } else {
 787                 rc = -1;
 788                 cmd->op_status = PCMK_LRM_OP_DONE;
 789                 cmd->rc = PCMK_OCF_NOT_RUNNING;
 790             }
 791 
 792             if (rc == 0) {
 793                 crm_debug("poked remote lrmd at node %s, waiting for async response.", cmd->rsc_id);
 794                 ra_data->cur_cmd = cmd;
 795                 cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd);
 796                 return TRUE;
 797             }
 798             report_remote_ra_result(cmd);
 799 
 800         } else if (!strcmp(cmd->action, "stop")) {
 801 
 802             if (ra_data->migrate_status == expect_takeover) {
 803                 /* briefly wait on stop for the takeover event to occur. If the
 804                  * takeover event does not occur during the wait period, that's fine.
 805                  * It just means that the remote-node's lrm_status section is going to get
 806                  * cleared which will require all the resources running in the remote-node
 807                  * to be explicitly re-detected via probe actions.  If the takeover does occur
 808                  * successfully, then we can leave the status section intact. */
 809                 cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd);
 810                 ra_data->cur_cmd = cmd;
 811                 return TRUE;
 812             }
 813 
 814             handle_remote_ra_stop(lrm_state, cmd);
 815 
 816         } else if (!strcmp(cmd->action, "migrate_to")) {
 817             ra_data->migrate_status = expect_takeover;
 818             cmd->rc = PCMK_OCF_OK;
 819             cmd->op_status = PCMK_LRM_OP_DONE;
 820             report_remote_ra_result(cmd);
 821         } else if (!strcmp(cmd->action, "reload")) {
 822             /* reloads are a no-op right now, add logic here when they become important */
 823             cmd->rc = PCMK_OCF_OK;
 824             cmd->op_status = PCMK_LRM_OP_DONE;
 825             report_remote_ra_result(cmd);
 826         }
 827 
 828         free_cmd(cmd);
 829     }
 830 
 831     return TRUE;
 832 }
 833 
 834 static void
 835 remote_ra_data_init(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 836 {
 837     remote_ra_data_t *ra_data = NULL;
 838 
 839     if (lrm_state->remote_ra_data) {
 840         return;
 841     }
 842 
 843     ra_data = calloc(1, sizeof(remote_ra_data_t));
 844     ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
 845     lrm_state->remote_ra_data = ra_data;
 846 }
 847 
 848 void
 849 remote_ra_cleanup(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 850 {
 851     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 852 
 853     if (!ra_data) {
 854         return;
 855     }
 856 
 857     if (ra_data->cmds) {
 858         g_list_free_full(ra_data->cmds, free_cmd);
 859     }
 860 
 861     if (ra_data->recurring_cmds) {
 862         g_list_free_full(ra_data->recurring_cmds, free_cmd);
 863     }
 864     mainloop_destroy_trigger(ra_data->work);
 865     free(ra_data);
 866     lrm_state->remote_ra_data = NULL;
 867 }
 868 
 869 gboolean
 870 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
     /* [previous][next][first][last][top][bottom][index][help] */
 871 {
 872     if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
 873         return TRUE;
 874     }
 875     if (id && lrm_state_find(id) && safe_str_neq(id, fsa_our_uname)) {
 876         return TRUE;
 877     }
 878 
 879     return FALSE;
 880 }
 881 
 882 lrmd_rsc_info_t *
 883 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
     /* [previous][next][first][last][top][bottom][index][help] */
 884 {
 885     lrmd_rsc_info_t *info = NULL;
 886 
 887     if ((lrm_state_find(rsc_id))) {
 888         info = calloc(1, sizeof(lrmd_rsc_info_t));
 889 
 890         info->id = strdup(rsc_id);
 891         info->type = strdup(REMOTE_LRMD_RA);
 892         info->class = strdup(PCMK_RESOURCE_CLASS_OCF);
 893         info->provider = strdup("pacemaker");
 894     }
 895 
 896     return info;
 897 }
 898 
 899 static gboolean
 900 is_remote_ra_supported_action(const char *action)
     /* [previous][next][first][last][top][bottom][index][help] */
 901 {
 902     if (!action) {
 903         return FALSE;
 904     } else if (strcmp(action, "start") &&
 905                strcmp(action, "stop") &&
 906                strcmp(action, "reload") &&
 907                strcmp(action, "migrate_to") &&
 908                strcmp(action, "migrate_from") && strcmp(action, "monitor")) {
 909         return FALSE;
 910     }
 911 
 912     return TRUE;
 913 }
 914 
 915 static GList *
 916 fail_all_monitor_cmds(GList * list)
     /* [previous][next][first][last][top][bottom][index][help] */
 917 {
 918     GList *rm_list = NULL;
 919     remote_ra_cmd_t *cmd = NULL;
 920     GListPtr gIter = NULL;
 921 
 922     for (gIter = list; gIter != NULL; gIter = gIter->next) {
 923         cmd = gIter->data;
 924         if (cmd->interval > 0 && safe_str_eq(cmd->action, "monitor")) {
 925             rm_list = g_list_append(rm_list, cmd);
 926         }
 927     }
 928 
 929     for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
 930         cmd = gIter->data;
 931 
 932         cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
 933         cmd->op_status = PCMK_LRM_OP_ERROR;
 934         crm_trace("Pre-emptively failing %s %s (interval=%d, %s)", cmd->action, cmd->rsc_id, cmd->interval, cmd->userdata);
 935         report_remote_ra_result(cmd);
 936 
 937         list = g_list_remove(list, cmd);
 938         free_cmd(cmd);
 939     }
 940 
 941     /* frees only the list data, not the cmds */
 942     g_list_free(rm_list);
 943     return list;
 944 }
 945 
 946 static GList *
 947 remove_cmd(GList * list, const char *action, int interval)
     /* [previous][next][first][last][top][bottom][index][help] */
 948 {
 949     remote_ra_cmd_t *cmd = NULL;
 950     GListPtr gIter = NULL;
 951 
 952     for (gIter = list; gIter != NULL; gIter = gIter->next) {
 953         cmd = gIter->data;
 954         if (cmd->interval == interval && safe_str_eq(cmd->action, action)) {
 955             break;
 956         }
 957         cmd = NULL;
 958     }
 959     if (cmd) {
 960         list = g_list_remove(list, cmd);
 961         free_cmd(cmd);
 962     }
 963     return list;
 964 }
 965 
 966 int
 967 remote_ra_cancel(lrm_state_t * lrm_state, const char *rsc_id, const char *action, int interval)
     /* [previous][next][first][last][top][bottom][index][help] */
 968 {
 969     lrm_state_t *connection_rsc = NULL;
 970     remote_ra_data_t *ra_data = NULL;
 971 
 972     connection_rsc = lrm_state_find(rsc_id);
 973     if (!connection_rsc || !connection_rsc->remote_ra_data) {
 974         return -EINVAL;
 975     }
 976 
 977     ra_data = connection_rsc->remote_ra_data;
 978     ra_data->cmds = remove_cmd(ra_data->cmds, action, interval);
 979     ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action, interval);
 980     if (ra_data->cur_cmd &&
 981         (ra_data->cur_cmd->interval == interval) &&
 982         (safe_str_eq(ra_data->cur_cmd->action, action))) {
 983 
 984         ra_data->cur_cmd->cancel = TRUE;
 985     }
 986 
 987     return 0;
 988 }
 989 
/*!
 * \internal
 * \brief Merge a newly requested recurring monitor into a matching known one
 *
 * \param[in] ra_data   Remote connection data to search
 * \param[in] interval  Monitor interval (ms) to match; 0 is never merged
 * \param[in] userdata  New userdata to store on the merged command (may be NULL)
 *
 * \return Existing command the request was merged into, or NULL if none matched
 */
static remote_ra_cmd_t *
handle_dup_monitor(remote_ra_data_t *ra_data, int interval, const char *userdata)
{
    GList *gIter = NULL;
    remote_ra_cmd_t *cmd = NULL;

    /* there are 3 places a potential duplicate monitor operation
     * could exist.
     * 1. recurring_cmds list. where the op is waiting for its next interval
     * 2. cmds list, where the op is queued to get executed immediately
     * 3. cur_cmd, which means the monitor op is in flight right now.
     */
    if (interval == 0) {
        /* one-shot probes are never merged */
        return NULL;
    }

    /* check the in-flight command first (skip it if it is being cancelled) */
    if (ra_data->cur_cmd &&
        ra_data->cur_cmd->cancel == FALSE &&
        ra_data->cur_cmd->interval == interval &&
        safe_str_eq(ra_data->cur_cmd->action, "monitor")) {

        cmd = ra_data->cur_cmd;
        goto handle_dup;
    }

    /* then monitors waiting for their next interval */
    for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if (cmd->interval == interval && safe_str_eq(cmd->action, "monitor")) {
            goto handle_dup;
        }
    }

    /* then monitors queued for immediate execution */
    for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if (cmd->interval == interval && safe_str_eq(cmd->action, "monitor")) {
            goto handle_dup;
        }
    }

    return NULL;

handle_dup:

    crm_trace("merging duplicate monitor cmd %s_monitor_%d", cmd->rsc_id, interval);

    /* update the userdata */
    if (userdata) {
       free(cmd->userdata);
       cmd->userdata = strdup(userdata);
    }

    /* if we've already reported success, generate a new call id */
    if (cmd->reported_success) {
        cmd->start_time = time(NULL);
        cmd->call_id = generate_callid();
        cmd->reported_success = 0;
    }

    /* if we have an interval_id set, that means we are in the process of
     * waiting for this cmd's next interval. instead of waiting, cancel
     * the timer and execute the action immediately */
    if (cmd->interval_id) {
        g_source_remove(cmd->interval_id);
        cmd->interval_id = 0;
        recurring_helper(cmd);
    }

    return cmd;
}
1059 
1060 int
1061 remote_ra_exec(lrm_state_t * lrm_state, const char *rsc_id, const char *action, const char *userdata, int interval,     /* ms */
     /* [previous][next][first][last][top][bottom][index][help] */
1062                int timeout,     /* ms */
1063                int start_delay, /* ms */
1064                lrmd_key_value_t * params)
1065 {
1066     int rc = 0;
1067     lrm_state_t *connection_rsc = NULL;
1068     remote_ra_cmd_t *cmd = NULL;
1069     remote_ra_data_t *ra_data = NULL;
1070 
1071     if (is_remote_ra_supported_action(action) == FALSE) {
1072         rc = -EINVAL;
1073         goto exec_done;
1074     }
1075 
1076     connection_rsc = lrm_state_find(rsc_id);
1077     if (!connection_rsc) {
1078         rc = -EINVAL;
1079         goto exec_done;
1080     }
1081 
1082     remote_ra_data_init(connection_rsc);
1083     ra_data = connection_rsc->remote_ra_data;
1084 
1085     cmd = handle_dup_monitor(ra_data, interval, userdata);
1086     if (cmd) {
1087        return cmd->call_id;
1088     }
1089 
1090     cmd = calloc(1, sizeof(remote_ra_cmd_t));
1091     cmd->owner = strdup(lrm_state->node_name);
1092     cmd->rsc_id = strdup(rsc_id);
1093     cmd->action = strdup(action);
1094     cmd->userdata = strdup(userdata);
1095     cmd->interval = interval;
1096     cmd->timeout = timeout;
1097     cmd->start_delay = start_delay;
1098     cmd->params = params;
1099     cmd->start_time = time(NULL);
1100 
1101     cmd->call_id = generate_callid();
1102 
1103     if (cmd->start_delay) {
1104         cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
1105     }
1106 
1107     ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1108     mainloop_set_trigger(ra_data->work);
1109 
1110     return cmd->call_id;
1111   exec_done:
1112 
1113     lrmd_key_value_freeall(params);
1114     return rc;
1115 }
1116 
1117 /*!
1118  * \internal
1119  * \brief Immediately fail all monitors of a remote node, if proxied here
1120  *
1121  * \param[in] node_name  Name of pacemaker_remote node
1122  */
1123 void
1124 remote_ra_fail(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
1125 {
1126     lrm_state_t *lrm_state = lrm_state_find(node_name);
1127 
1128     if (lrm_state && lrm_state_is_connected(lrm_state)) {
1129         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1130 
1131         crm_info("Failing monitors on pacemaker_remote node %s", node_name);
1132         ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1133         ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1134     }
1135 }
1136 
1137 /* A guest node fencing implied by host fencing looks like:
1138  *
1139  *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
1140  *                on_node="lxc1" on_node_uuid="lxc1">
1141  *     <attributes CRM_meta_master_lxc_ms="10" CRM_meta_on_node="lxc1"
1142  *                 CRM_meta_on_node_uuid="lxc1" CRM_meta_stonith_action="off"
1143  *                 crm_feature_set="3.0.12"/>
1144  *     <downed>
1145  *       <node id="lxc1"/>
1146  *     </downed>
1147  *  </pseudo_event>
1148  */
1149 #define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
1150     "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
1151     "/" XML_CIB_TAG_NODE
1152 
1153 /*!
1154  * \internal
1155  * \brief Check a pseudo-action for Pacemaker Remote node side effects
1156  *
1157  * \param[in] xml  XML of pseudo-action to check
1158  */
1159 void
1160 remote_ra_process_pseudo(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
1161 {
1162     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
1163 
1164     if (numXpathResults(search) == 1) {
1165         xmlNode *result = getXpathResult(search, 0);
1166 
1167         /* Normally, we handle the necessary side effects of a guest node stop
1168          * action when reporting the remote agent's result. However, if the stop
1169          * is implied due to fencing, it will be a fencing pseudo-event, and
1170          * there won't be a result to report. Handle that case here.
1171          *
1172          * This will result in a duplicate call to remote_node_down() if the
1173          * guest stop was real instead of implied, but that shouldn't hurt.
1174          *
1175          * There is still one corner case that isn't handled: if a guest node
1176          * isn't running any resources when its host is fenced, it will appear
1177          * to be cleanly stopped, so there will be no pseudo-fence, and our
1178          * peer cache state will be incorrect unless and until the guest is
1179          * recovered.
1180          */
1181         if (result) {
1182             const char *remote = ID(result);
1183 
1184             if (remote) {
1185                 remote_node_down(remote, DOWN_ERASE_LRM);
1186             }
1187         }
1188     }
1189     freeXpathObject(search);
1190 }
1191 
/*!
 * \internal
 * \brief Record a remote node's maintenance flag in its CIB node state
 *
 * \param[in] lrm_state    Connection state for the remote node
 * \param[in] maintenance  Whether the node is entering maintenance mode
 */
static void
remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    xmlNode *update, *state;
    int call_opt, call_id = 0;
    crm_node_t *node;

    call_opt = crmd_cib_smart_opt();
    node = crm_remote_peer_get(lrm_state->node_name);
    CRM_CHECK(node != NULL, return);
    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    state = create_node_state_update(node, node_update_none, update,
                                     __FUNCTION__);
    crm_xml_add(state, XML_NODE_IS_MAINTENANCE, maintenance?"1":"0");
    /* NOTE(review): fsa_cib_update apparently assigns the CIB call ID into
     * call_id (it is passed by name, and checked below) -- macro semantics,
     * confirm in crmd headers */
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_WARNING, "%s CIB node state update failed", lrm_state->node_name);
    } else {
        /* TODO: still not 100% sure that async update will succeed ... */
        ra_data->is_maintenance = maintenance;
    }
    free_xml(update);
}
1216 
1217 #define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
1218     "[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \
1219     XML_GRAPH_TAG_MAINTENANCE
1220 
1221 /*!
1222  * \internal
1223  * \brief Check a pseudo-action holding updates for maintenance state
1224  *
1225  * \param[in] xml  XML of pseudo-action to check
1226  */
1227 
void
remote_ra_process_maintenance_nodes(xmlNode *xml)
{
    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);

    if (numXpathResults(search) == 1) {
        xmlNode *node;
        int cnt = 0, cnt_remote = 0;

        /* NOTE(review): __xml_next() advances to the next sibling of any
         * name, not just XML_CIB_TAG_NODE; this relies on the maintenance
         * section containing only node entries -- confirm against the
         * transition-graph format */
        for (node =
                first_named_child(getXpathResult(search, 0), XML_CIB_TAG_NODE);
            node; node = __xml_next(node)) {
            lrm_state_t *lrm_state = lrm_state_find(ID(node));

            cnt++;
            /* only act on remote nodes whose connection is active and
             * proxied through this node */
            if (lrm_state && lrm_state->remote_ra_data &&
                ((remote_ra_data_t *) lrm_state->remote_ra_data)->active) {
                cnt_remote++;
                remote_ra_maintenance(lrm_state,
                                        crm_atoi(crm_element_value(node,
                                            XML_NODE_IS_MAINTENANCE), "0"));

            }
        }
        crm_trace("Action holds %d nodes (%d remotes found) "
                    "adjusting maintenance-mode", cnt, cnt_remote);
    }
    freeXpathObject(search);
}
1257 
1258 gboolean
1259 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1260 {
1261     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1262 
1263     return ra_data->is_maintenance;
1264 }

/* [previous][next][first][last][top][bottom][index][help] */