root/daemons/controld/controld_remote_ra.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. free_cmd
  2. generate_callid
  3. recurring_helper
  4. start_delay_helper
  5. remote_node_up
  6. remote_node_down
  7. check_remote_node_state
  8. report_remote_ra_result
  9. update_remaining_timeout
  10. retry_start_cmd_cb
  11. connection_takeover_timeout_cb
  12. monitor_timeout_cb
  13. synthesize_lrmd_success
  14. remote_lrm_op_callback
  15. handle_remote_ra_stop
  16. handle_remote_ra_start
  17. handle_remote_ra_exec
  18. remote_ra_data_init
  19. remote_ra_cleanup
  20. is_remote_lrmd_ra
  21. remote_ra_get_rsc_info
  22. is_remote_ra_supported_action
  23. fail_all_monitor_cmds
  24. remove_cmd
  25. remote_ra_cancel
  26. handle_dup_monitor
  27. remote_ra_exec
  28. remote_ra_fail
  29. remote_ra_process_pseudo
  30. remote_ra_maintenance
  31. remote_ra_process_maintenance_nodes
  32. remote_ra_is_in_maintenance
  33. remote_ra_controlling_guest

   1 /*
   2  * Copyright 2013-2021 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <crm/crm.h>
  13 #include <crm/msg_xml.h>
  14 #include <crm/common/xml_internal.h>
  15 #include <crm/lrmd.h>
  16 #include <crm/services.h>
  17 
  18 #include <pacemaker-controld.h>
  19 
  20 #define REMOTE_LRMD_RA "remote"
  21 
  22 /* The max start timeout before cmd retry */
  23 #define MAX_START_TIMEOUT_MS 10000
  24 
/* A single action (start, stop, monitor, migrate_*) queued for or being
 * executed against a remote connection resource.
 */
typedef struct remote_ra_cmd_s {
    /*! the local node the cmd is issued from */
    char *owner;
    /*! the remote node the cmd is executed on */
    char *rsc_id;
    /*! the action to execute */
    char *action;
    /*! some string the client wants us to give it back */
    char *userdata;
    char *exit_reason;          // descriptive text on error
    /*! start delay in ms */
    int start_delay;
    /*! timer id used for start delay. */
    int delay_id;
    /*! timeout in ms for cmd */
    int timeout;
    /*! remaining timeout in ms, recomputed as the action progresses */
    int remaining_timeout;
    /*! recurring interval in ms */
    guint interval_ms;
    /*! interval timer id */
    int interval_id;
    /*! nonzero once success has been reported for a recurring monitor */
    int reported_success;
    /*! timer id waiting for a monitor poke response */
    int monitor_timeout_id;
    /*! timer id waiting for a connection takeover during stop */
    int takeover_timeout_id;
    /*! action parameters */
    lrmd_key_value_t *params;
    /*! executed rc */
    int rc;
    /*! execution status (PCMK_LRM_OP_*) */
    int op_status;
    /*! call id assigned via generate_callid() */
    int call_id;
    /*! when the action started (seconds), for timeout bookkeeping */
    time_t start_time;
    /*! TRUE if this (recurring) action has been cancelled */
    gboolean cancel;
} remote_ra_cmd_t;
  58 
/* Stage of a remote connection migration, as tracked on the source host */
enum remote_migration_status {
    expect_takeover = 1,    // takeover by another node is expected
    takeover_complete,      // another client has taken over the connection
};
  63 
/* Per-connection bookkeeping attached to a remote resource's lrm_state_t */
typedef struct remote_ra_data_s {
    crm_trigger_t *work;        // mainloop trigger that drives command execution
    remote_ra_cmd_t *cur_cmd;   // command currently in flight, if any
    GList *cmds;                // commands waiting to execute
    GList *recurring_cmds;      // recurring monitors waiting on their interval

    enum remote_migration_status migrate_status;

    gboolean active;            // TRUE while the remote connection is established

    /* Maintenance mode is difficult to determine from the controller's context,
     * so we have it signalled back with the transition from the scheduler.
     */
    gboolean is_maintenance;

    /* Similar for whether we are controlling a guest node or remote node.
     * Fortunately there is a meta-attribute in the transition already and
     * as the situation doesn't change over time we can use the
     * resource start for noting down the information for later use when
     * the attributes aren't at hand.
     */
    gboolean controlling_guest;
} remote_ra_data_t;
  87 
  88 static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
  89 static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
  90 static GList *fail_all_monitor_cmds(GList * list);
  91 
  92 static void
  93 free_cmd(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
  94 {
  95     remote_ra_cmd_t *cmd = user_data;
  96 
  97     if (!cmd) {
  98         return;
  99     }
 100     if (cmd->delay_id) {
 101         g_source_remove(cmd->delay_id);
 102     }
 103     if (cmd->interval_id) {
 104         g_source_remove(cmd->interval_id);
 105     }
 106     if (cmd->monitor_timeout_id) {
 107         g_source_remove(cmd->monitor_timeout_id);
 108     }
 109     if (cmd->takeover_timeout_id) {
 110         g_source_remove(cmd->takeover_timeout_id);
 111     }
 112     free(cmd->owner);
 113     free(cmd->rsc_id);
 114     free(cmd->action);
 115     free(cmd->userdata);
 116     free(cmd->exit_reason);
 117     lrmd_key_value_freeall(cmd->params);
 118     free(cmd);
 119 }
 120 
 121 static int
 122 generate_callid(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 123 {
 124     static int remote_ra_callid = 0;
 125 
 126     remote_ra_callid++;
 127     if (remote_ra_callid <= 0) {
 128         remote_ra_callid = 1;
 129     }
 130 
 131     return remote_ra_callid;
 132 }
 133 
 134 static gboolean
 135 recurring_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 136 {
 137     remote_ra_cmd_t *cmd = data;
 138     lrm_state_t *connection_rsc = NULL;
 139 
 140     cmd->interval_id = 0;
 141     connection_rsc = lrm_state_find(cmd->rsc_id);
 142     if (connection_rsc && connection_rsc->remote_ra_data) {
 143         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 144 
 145         ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
 146 
 147         ra_data->cmds = g_list_append(ra_data->cmds, cmd);
 148         mainloop_set_trigger(ra_data->work);
 149     }
 150     return FALSE;
 151 }
 152 
 153 static gboolean
 154 start_delay_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 155 {
 156     remote_ra_cmd_t *cmd = data;
 157     lrm_state_t *connection_rsc = NULL;
 158 
 159     cmd->delay_id = 0;
 160     connection_rsc = lrm_state_find(cmd->rsc_id);
 161     if (connection_rsc && connection_rsc->remote_ra_data) {
 162         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 163 
 164         mainloop_set_trigger(ra_data->work);
 165     }
 166     return FALSE;
 167 }
 168 
/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node joining
 *
 * Purges stale node state (attrd memory, CIB state sections, probed
 * attribute), caches the node as a member, notifies the DC, and pushes a
 * fresh node-state entry (with the fenced flag cleared) to the CIB.
 *
 * \param[in] node_name  Name of newly integrated pacemaker_remote node
 */
static void
remote_node_up(const char *node_name)
{
    int call_opt, call_id = 0;
    xmlNode *update, *state;
    crm_node_t *node;
    enum controld_section_e section = controld_section_all;

    CRM_CHECK(node_name != NULL, return);
    crm_info("Announcing pacemaker_remote node %s", node_name);

    /* Clear node's entire state (resource history and transient attributes)
     * other than shutdown locks. The transient attributes should and normally
     * will be cleared when the node leaves, but since remote node state has a
     * number of corner cases, clear them here as well, to be sure.
     */
    call_opt = crmd_cib_smart_opt();
    if (controld_shutdown_lock_enabled) {
        section = controld_section_all_unlocked;
    }
    /* Purge node from attrd's memory */
    update_attrd_remote_node_removed(node_name, NULL);

    controld_delete_node_state(node_name, section, call_opt);

    /* Clear node's probed attribute */
    update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);

    /* Ensure node is in the remote peer cache with member status */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);

    /* pacemaker_remote nodes don't participate in the membership layer,
     * so cluster nodes don't automatically get notified when they come and go.
     * We send a cluster message to the DC, and update the CIB node state entry,
     * so the DC will get it sooner (via message) or later (via CIB refresh),
     * and any other interested parties can query the CIB.
     */
    send_remote_state_message(node_name, TRUE);

    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    state = create_node_state_update(node, node_update_cluster, update,
                                     __func__);

    /* Clear the XML_NODE_IS_FENCED flag in the node state. If the node ever
     * needs to be fenced, this flag will allow various actions to determine
     * whether the fencing has happened yet.
     */
    crm_xml_add(state, XML_NODE_IS_FENCED, "0");

    /* TODO: If the remote connection drops, and this (async) CIB update either
     * failed or has not yet completed, later actions could mistakenly think the
     * node has already been fenced (if the XML_NODE_IS_FENCED attribute was
     * previously set, because it won't have been cleared). This could prevent
     * actual fencing or allow recurring monitor failures to be cleared too
     * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
     */
    /* NOTE(review): call_id is checked after the call, so fsa_cib_update is
     * presumably a macro that assigns the CIB call result to it — confirm. */
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_WARNING, "%s CIB node state setup", node_name);
    }
    free_xml(update);
}
 239 
/* Whether remote_node_down() should also erase the node's resource history */
enum down_opts {
    DOWN_KEEP_LRM,   // keep resource history (normal stop/disconnect)
    DOWN_ERASE_LRM   // erase resource history (after successful fencing)
};
 244 
/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node leaving
 *
 * Purges attrd memory and the appropriate CIB state sections, caches the
 * node as lost, notifies the DC, and pushes the updated node state to the
 * CIB.
 *
 * \param[in] node_name  Name of lost node
 * \param[in] opts       Whether to keep or erase LRM history
 */
static void
remote_node_down(const char *node_name, const enum down_opts opts)
{
    xmlNode *update;
    int call_id = 0;
    int call_opt = crmd_cib_smart_opt();
    crm_node_t *node;

    /* Purge node from attrd's memory */
    update_attrd_remote_node_removed(node_name, NULL);

    /* Normally, only node attributes should be erased, and the resource history
     * should be kept until the node comes back up. However, after a successful
     * fence, we want to clear the history as well, so we don't think resources
     * are still running on the node.
     */
    if (opts == DOWN_ERASE_LRM) {
        controld_delete_node_state(node_name, controld_section_all, call_opt);
    } else {
        controld_delete_node_state(node_name, controld_section_attrs, call_opt);
    }

    /* Ensure node is in the remote peer cache with lost state */
    node = crm_remote_peer_get(node_name);
    CRM_CHECK(node != NULL, return);
    pcmk__update_peer_state(__func__, node, CRM_NODE_LOST, 0);

    /* Notify DC */
    send_remote_state_message(node_name, FALSE);

    /* Update CIB node state */
    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    create_node_state_update(node, node_update_cluster, update, __func__);
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_ERR, "%s CIB node state update", node_name);
    }
    free_xml(update);
}
 291 
/*!
 * \internal
 * \brief Handle effects of a remote RA command on node state
 *
 * Maps successful start/migrate_from/stop actions onto peer-cache and
 * cluster-wide node-state updates. Failed actions change nothing here.
 *
 * \param[in] cmd  Completed remote RA command
 */
static void
check_remote_node_state(remote_ra_cmd_t *cmd)
{
    /* Only successful actions can change node state */
    if (cmd->rc != PCMK_OCF_OK) {
        return;
    }

    if (pcmk__str_eq(cmd->action, "start", pcmk__str_casei)) {
        remote_node_up(cmd->rsc_id);

    } else if (pcmk__str_eq(cmd->action, "migrate_from", pcmk__str_casei)) {
        /* After a successful migration, we don't need to do remote_node_up()
         * because the DC already knows the node is up, and we don't want to
         * clear LRM history etc. We do need to add the remote node to this
         * host's remote peer cache, because (unless it happens to be DC)
         * it hasn't been tracking the remote node, and other code relies on
         * the cache to distinguish remote nodes from unseen cluster nodes.
         */
        crm_node_t *node = crm_remote_peer_get(cmd->rsc_id);

        CRM_CHECK(node != NULL, return);
        pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);

    } else if (pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {
        lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
        remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;

        if (ra_data) {
            if (ra_data->migrate_status != takeover_complete) {
                /* Stop means down if we didn't successfully migrate elsewhere */
                remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
            } else if (AM_I_DC == FALSE) {
                /* Only the connection host and DC track node state,
                 * so if the connection migrated elsewhere and we aren't DC,
                 * un-cache the node, so we don't have stale info
                 */
                crm_remote_peer_cache_remove(cmd->rsc_id);
            }
        }
    }

    /* We don't do anything for successful monitors, which is correct for
     * routine recurring monitors, and for monitors on nodes where the
     * connection isn't supposed to be (the cluster will stop the connection in
     * that case). However, if the initial probe finds the connection already
     * active on the node where we want it, we probably should do
     * remote_node_up(). Unfortunately, we can't distinguish that case here.
     * Given that connections have to be initiated by the cluster, the chance of
     * that should be close to zero.
     */
}
 350 
/*!
 * \internal
 * \brief Report a remote RA action result through the normal executor path
 *
 * Applies any node-state side effects of the completed command, then builds
 * a synthetic lrmd_event_data_t on the stack and passes it to
 * lrm_op_callback(). The command retains ownership of all strings; only the
 * params hash table copy is allocated (and freed) here.
 *
 * \param[in] cmd  Completed remote RA command
 */
static void
report_remote_ra_result(remote_ra_cmd_t * cmd)
{
    lrmd_event_data_t op = { 0, };

    check_remote_node_state(cmd);

    op.type = lrmd_event_exec_complete;
    op.rsc_id = cmd->rsc_id;
    op.op_type = cmd->action;
    op.user_data = cmd->userdata;
    op.exit_reason = cmd->exit_reason;
    op.timeout = cmd->timeout;
    op.interval_ms = cmd->interval_ms;
    op.rc = cmd->rc;
    op.op_status = cmd->op_status;
    op.t_run = (unsigned int) cmd->start_time;
    op.t_rcchange = (unsigned int) cmd->start_time;
    if (cmd->reported_success && cmd->rc != PCMK_OCF_OK) {
        // A failure after an earlier success: stamp the rc change as "now"
        op.t_rcchange = (unsigned int) time(NULL);
        /* This edge case will likely never ever occur, but if it does the
         * result is that a failure will not be processed correctly. This is only
         * remotely possible because we are able to detect a connection resource's tcp
         * connection has failed at any moment after start has completed. The actual
         * recurring operation is just a connectivity ping.
         *
         * basically, we are not guaranteed that the first successful monitor op and
         * a subsequent failed monitor op will not occur in the same timestamp. We have to
         * make it look like the operations occurred at separate times though. */
        if (op.t_rcchange == op.t_run) {
            op.t_rcchange++;
        }
    }

    if (cmd->params) {
        lrmd_key_value_t *tmp;

        // Deep-copy params so lrm_op_callback() gets its own hash table
        op.params = pcmk__strkey_table(free, free);
        for (tmp = cmd->params; tmp; tmp = tmp->next) {
            g_hash_table_insert(op.params, strdup(tmp->key), strdup(tmp->value));
        }

    }
    op.call_id = cmd->call_id;
    op.remote_nodename = cmd->owner;

    lrm_op_callback(&op);

    if (op.params) {
        g_hash_table_destroy(op.params);
    }
}
 403 
 404 static void
 405 update_remaining_timeout(remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 406 {
 407     cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
 408 }
 409 
/*!
 * \internal
 * \brief Timer callback that retries a failed remote connection attempt
 *
 * Re-runs handle_remote_ra_start() for the in-flight start/migrate_from
 * command if any of its timeout remains; otherwise (or if the new attempt
 * cannot be initiated) reports the action as failed and releases it.
 *
 * \param[in] data  Executor state (lrm_state_t *) for the connection
 *
 * \return FALSE (one-shot timer)
 */
static gboolean
retry_start_cmd_cb(gpointer data)
{
    lrm_state_t *lrm_state = data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd = NULL;
    int rc = -1;

    if (!ra_data || !ra_data->cur_cmd) {
        // Nothing in flight anymore; stale timer
        return FALSE;
    }
    cmd = ra_data->cur_cmd;
    if (!pcmk__strcase_any_of(cmd->action, "start", "migrate_from", NULL)) {
        return FALSE;
    }
    update_remaining_timeout(cmd);

    if (cmd->remaining_timeout > 0) {
        rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
    }

    if (rc != 0) {
        // Retry could not be (re)initiated: fail the action and move on
        cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
        cmd->op_status = PCMK_LRM_OP_ERROR;
        report_remote_ra_result(cmd);

        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        ra_data->cur_cmd = NULL;
        free_cmd(cmd);
    } else {
        /* wait for connection event */
    }

    return FALSE;
}
 447 
 448 
/*!
 * \internal
 * \brief Timer callback for a stop that waited in vain for a takeover event
 *
 * If the expected connection takeover never arrives, finish the stop as a
 * normal disconnect and report the command's result.
 *
 * \param[in] data  Stop command (remote_ra_cmd_t *)
 *
 * \return FALSE (one-shot timer)
 */
static gboolean
connection_takeover_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    crm_info("takeover event timed out for node %s", cmd->rsc_id);
    cmd->takeover_timeout_id = 0;

    lrm_state = lrm_state_find(cmd->rsc_id);

    /* NOTE(review): lrm_state_find() can return NULL, and
     * handle_remote_ra_stop() asserts a non-NULL state — presumably the
     * state still exists while a stop is pending; confirm. */
    handle_remote_ra_stop(lrm_state, cmd);
    free_cmd(cmd);

    return FALSE;
}
 465 
/*!
 * \internal
 * \brief Timer callback for a monitor poke that never got a response
 *
 * Marks the monitor as timed out, clears it as the current command,
 * reports the failure, frees the command, and drops the (apparently
 * dead) connection.
 *
 * \param[in] data  Monitor command (remote_ra_cmd_t *)
 *
 * \return FALSE (one-shot timer)
 */
static gboolean
monitor_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    lrm_state = lrm_state_find(cmd->rsc_id);

    crm_info("Timed out waiting for remote poke response from %s%s",
             cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
    cmd->monitor_timeout_id = 0;
    cmd->op_status = PCMK_LRM_OP_TIMEOUT;
    cmd->rc = PCMK_OCF_UNKNOWN_ERROR;

    if (lrm_state && lrm_state->remote_ra_data) {
        remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

        if (ra_data->cur_cmd == cmd) {
            // This command is about to be freed; stop tracking it
            ra_data->cur_cmd = NULL;
        }
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
    }

    report_remote_ra_result(cmd);
    free_cmd(cmd);

    if(lrm_state) {
        lrm_state_disconnect(lrm_state);
    }
    return FALSE;
}
 499 
 500 static void
 501 synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
     /* [previous][next][first][last][top][bottom][index][help] */
 502 {
 503     lrmd_event_data_t op = { 0, };
 504 
 505     if (lrm_state == NULL) {
 506         /* if lrm_state not given assume local */
 507         lrm_state = lrm_state_find(fsa_our_uname);
 508     }
 509     CRM_ASSERT(lrm_state != NULL);
 510 
 511     op.type = lrmd_event_exec_complete;
 512     op.rsc_id = rsc_id;
 513     op.op_type = op_type;
 514     op.rc = PCMK_OCF_OK;
 515     op.op_status = PCMK_LRM_OP_DONE;
 516     op.t_run = (unsigned int) time(NULL);
 517     op.t_rcchange = op.t_run;
 518     op.call_id = generate_callid();
 519     process_lrm_event(lrm_state, &op, NULL, NULL);
 520 }
 521 
/*!
 * \internal
 * \brief Process an event from a remote executor connection
 *
 * Dispatches on the event type: handles client takeover notifications,
 * forwards completed resource actions up the normal callback path, reacts
 * to unexpected connection loss, and matches connect/poke/disconnect/
 * new-client events against the command currently being executed (if any).
 *
 * \param[in] op  Event data from the remote connection
 */
void
remote_lrm_op_callback(lrmd_event_data_t * op)
{
    gboolean cmd_handled = FALSE;
    lrm_state_t *lrm_state = NULL;
    remote_ra_data_t *ra_data = NULL;
    remote_ra_cmd_t *cmd = NULL;

    crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
              "(%d) status=%s (%d)",
              (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
              lrmd_event_type2str(op->type), op->remote_nodename,
              services_ocf_exitcode_str(op->rc), op->rc,
              services_lrm_status_str(op->op_status), op->op_status);

    lrm_state = lrm_state_find(op->remote_nodename);
    if (!lrm_state || !lrm_state->remote_ra_data) {
        crm_debug("No state information found for remote connection event");
        return;
    }
    ra_data = lrm_state->remote_ra_data;

    if (op->type == lrmd_event_new_client) {
        // Another client has connected to the remote daemon

        if (ra_data->migrate_status == expect_takeover) {
            // Great, we knew this was coming
            ra_data->migrate_status = takeover_complete;

        } else {
            crm_err("Unexpected pacemaker_remote client takeover for %s. Disconnecting", op->remote_nodename);
            /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
            /* Do not free lrm_state->conn yet. */
            /* It'll be freed in the following stop action. */
            lrm_state_disconnect_only(lrm_state);
        }
        return;
    }

    /* filter all EXEC events up */
    if (op->type == lrmd_event_exec_complete) {
        if (ra_data->migrate_status == takeover_complete) {
            crm_debug("ignoring event, this connection is taken over by another node");
        } else {
            lrm_op_callback(op);
        }
        return;
    }

    /* Disconnect with no command in flight: either an expected shutdown,
     * a lost connection, or an unmanaged (maintenance) node going away.
     */
    if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {

        if (ra_data->active == FALSE) {
            crm_debug("Disconnection from Pacemaker Remote node %s complete",
                      lrm_state->node_name);

        } else if (!remote_ra_is_in_maintenance(lrm_state)) {
            crm_err("Lost connection to Pacemaker Remote node %s",
                    lrm_state->node_name);
            ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
            ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);

        } else {
            crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
                       lrm_state->node_name);
            /* Do roughly what a 'stop' on the remote-resource would do */
            handle_remote_ra_stop(lrm_state, NULL);
            remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
            /* now fake the reply of a successful 'stop' */
            synthesize_lrmd_success(NULL, lrm_state->node_name, "stop");
        }
        return;
    }

    if (!ra_data->cur_cmd) {
        crm_debug("no event to match");
        return;
    }

    cmd = ra_data->cur_cmd;

    /* Start actions and migrate from actions complete after connection
     * comes back to us. */
    if (op->type == lrmd_event_connect && pcmk__strcase_any_of(cmd->action, "start",
                                                               "migrate_from", NULL)) {
        if (op->connection_rc < 0) {
            update_remaining_timeout(cmd);

            if (op->connection_rc == -ENOKEY) {
                // Hard error, don't retry
                cmd->op_status = PCMK_LRM_OP_ERROR;
                cmd->rc = PCMK_OCF_INVALID_PARAM;
                cmd->exit_reason = strdup("Authentication key not readable");

            } else if (cmd->remaining_timeout > 3000) {
                crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
                g_timeout_add(1000, retry_start_cmd_cb, lrm_state);
                return;

            } else {
                crm_trace("can't reschedule start, remaining timeout too small %d",
                          cmd->remaining_timeout);
                cmd->op_status = PCMK_LRM_OP_TIMEOUT;
                cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
            }

        } else {
            // Connected successfully: wipe stale tables and mark active
            lrm_state_reset_tables(lrm_state, TRUE);
            cmd->rc = PCMK_OCF_OK;
            cmd->op_status = PCMK_LRM_OP_DONE;
            ra_data->active = TRUE;
        }

        crm_debug("Remote connection event matched %s action", cmd->action);
        report_remote_ra_result(cmd);
        cmd_handled = TRUE;

    } else if (op->type == lrmd_event_poke && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {

        if (cmd->monitor_timeout_id) {
            g_source_remove(cmd->monitor_timeout_id);
            cmd->monitor_timeout_id = 0;
        }

        /* Only report success the first time, after that only worry about failures.
         * For this function, if we get the poke pack, it is always a success. Pokes
         * only fail if the send fails, or the response times out. */
        if (!cmd->reported_success) {
            cmd->rc = PCMK_OCF_OK;
            cmd->op_status = PCMK_LRM_OP_DONE;
            report_remote_ra_result(cmd);
            cmd->reported_success = 1;
        }

        crm_debug("Remote poke event matched %s action", cmd->action);

        /* success, keep rescheduling if interval is present. */
        if (cmd->interval_ms && (cmd->cancel == FALSE)) {
            ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
            cmd->interval_id = g_timeout_add(cmd->interval_ms,
                                             recurring_helper, cmd);
            cmd = NULL;         /* prevent free */
        }
        cmd_handled = TRUE;

    } else if (op->type == lrmd_event_disconnect && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
        if (ra_data->active == TRUE && (cmd->cancel == FALSE)) {
            cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
            cmd->op_status = PCMK_LRM_OP_ERROR;
            report_remote_ra_result(cmd);
            crm_err("Remote connection to %s unexpectedly dropped during monitor",
                    lrm_state->node_name);
        }
        cmd_handled = TRUE;

    } else if (op->type == lrmd_event_new_client && pcmk__str_eq(cmd->action, "stop", pcmk__str_casei)) {

        // Takeover arrived while stopping: complete the stop now
        handle_remote_ra_stop(lrm_state, cmd);
        cmd_handled = TRUE;

    } else {
        crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
    }

    if (cmd_handled) {
        ra_data->cur_cmd = NULL;
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        free_cmd(cmd);
    }
}
 693 
/*!
 * \internal
 * \brief Finish stopping a remote connection resource
 *
 * Clears pending operations (or, after a takeover, the history tables),
 * marks the connection inactive, disconnects, and discards all queued
 * commands. If \p cmd is given, it is reported back as a successful stop.
 *
 * \param[in] lrm_state  Executor state for this connection (must not be NULL)
 * \param[in] cmd        Stop command to report, or NULL (unmanaged disconnect)
 */
static void
handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
{
    remote_ra_data_t *ra_data = NULL;

    CRM_ASSERT(lrm_state);
    ra_data = lrm_state->remote_ra_data;

    if (ra_data->migrate_status != takeover_complete) {
        /* delete pending ops when ever the remote connection is intentionally stopped */
        g_hash_table_remove_all(lrm_state->pending_ops);
    } else {
        /* we no longer hold the history if this connection has been migrated,
         * however, we keep metadata cache for future use */
        lrm_state_reset_tables(lrm_state, FALSE);
    }

    ra_data->active = FALSE;
    lrm_state_disconnect(lrm_state);

    if (ra_data->cmds) {
        g_list_free_full(ra_data->cmds, free_cmd);
    }
    if (ra_data->recurring_cmds) {
        g_list_free_full(ra_data->recurring_cmds, free_cmd);
    }
    ra_data->cmds = NULL;
    ra_data->recurring_cmds = NULL;
    ra_data->cur_cmd = NULL;

    if (cmd) {
        cmd->rc = PCMK_OCF_OK;
        cmd->op_status = PCMK_LRM_OP_DONE;

        report_remote_ra_result(cmd);
    }
}
 731 
 732 static int
 733 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
 734 {
 735     const char *server = NULL;
 736     lrmd_key_value_t *tmp = NULL;
 737     int port = 0;
 738     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 739     int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
 740 
 741     for (tmp = cmd->params; tmp; tmp = tmp->next) {
 742         if (pcmk__strcase_any_of(tmp->key, XML_RSC_ATTR_REMOTE_RA_ADDR,
 743                                  XML_RSC_ATTR_REMOTE_RA_SERVER, NULL)) {
 744             server = tmp->value;
 745         } else if (pcmk__str_eq(tmp->key, XML_RSC_ATTR_REMOTE_RA_PORT, pcmk__str_casei)) {
 746             port = atoi(tmp->value);
 747         } else if (pcmk__str_eq(tmp->key, CRM_META "_" XML_RSC_ATTR_CONTAINER, pcmk__str_casei)) {
 748             ra_data->controlling_guest = TRUE;
 749         }
 750     }
 751 
 752     return lrm_state_remote_connect_async(lrm_state, server, port, timeout_used);
 753 }
 754 
/*!
 * \internal
 * \brief Mainloop trigger callback that drains the remote RA command queue
 *
 * Pops commands off ra_data->cmds one at a time and dispatches them by action
 * name. Actions that complete asynchronously (start/migrate_from connection
 * attempts, monitor pokes, stops waiting for a takeover) park themselves in
 * ra_data->cur_cmd and return early; their completion callbacks re-trigger
 * this function. Synchronous results are reported immediately.
 *
 * \param[in] user_data  lrm_state_t object for the remote connection resource
 *
 * \return Always TRUE (keep the mainloop trigger registered)
 */
static gboolean
handle_remote_ra_exec(gpointer user_data)
{
    int rc = 0;
    lrm_state_t *lrm_state = user_data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd;
    GList *first = NULL;

    if (ra_data->cur_cmd) {
        /* still waiting on previous cmd */
        return TRUE;
    }

    while (ra_data->cmds) {
        first = ra_data->cmds;
        cmd = first->data;
        if (cmd->delay_id) {
            /* still waiting for start delay timer to trip */
            return TRUE;
        }

        /* Detach the head link from the queue; the cmd itself is freed at
         * the bottom of the loop (or later, if parked in cur_cmd) */
        ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
        g_list_free_1(first);

        if (!strcmp(cmd->action, "start") || !strcmp(cmd->action, "migrate_from")) {
            ra_data->migrate_status = 0;
            rc = handle_remote_ra_start(lrm_state, cmd, cmd->timeout);
            if (rc == 0) {
                /* take care of this later when we get async connection result */
                crm_debug("Initiated async remote connection, %s action will complete after connect event",
                          cmd->action);
                ra_data->cur_cmd = cmd;
                return TRUE;
            } else {
                crm_debug("Could not initiate remote connection for %s action",
                          cmd->action);
                cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
                cmd->op_status = PCMK_LRM_OP_ERROR;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, "monitor")) {

            if (lrm_state_is_connected(lrm_state) == TRUE) {
                /* Connected: poke the remote and wait for the async reply */
                rc = lrm_state_poke_connection(lrm_state);
                if (rc < 0) {
                    cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
                    cmd->op_status = PCMK_LRM_OP_ERROR;
                }
            } else {
                /* Not connected: report "not running" without poking */
                rc = -1;
                cmd->op_status = PCMK_LRM_OP_DONE;
                cmd->rc = PCMK_OCF_NOT_RUNNING;
            }

            if (rc == 0) {
                crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
                          cmd->rsc_id);
                ra_data->cur_cmd = cmd;
                cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd);
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, "stop")) {

            if (ra_data->migrate_status == expect_takeover) {
                /* briefly wait on stop for the takeover event to occur. If the
                 * takeover event does not occur during the wait period, that's fine.
                 * It just means that the remote-node's lrm_status section is going to get
                 * cleared which will require all the resources running in the remote-node
                 * to be explicitly re-detected via probe actions.  If the takeover does occur
                 * successfully, then we can leave the status section intact. */
                cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }

            handle_remote_ra_stop(lrm_state, cmd);

        } else if (!strcmp(cmd->action, "migrate_to")) {
            /* Nothing to do locally; just note that a takeover is expected */
            ra_data->migrate_status = expect_takeover;
            cmd->rc = PCMK_OCF_OK;
            cmd->op_status = PCMK_LRM_OP_DONE;
            report_remote_ra_result(cmd);
        } else if (pcmk__str_any_of(cmd->action, CRMD_ACTION_RELOAD,
                                    CRMD_ACTION_RELOAD_AGENT, NULL))  {
            /* Currently the only reloadable parameter is reconnect_interval,
             * which is only used by the scheduler via the CIB, so reloads are a
             * no-op.
             *
             * @COMPAT DC <2.1.0: We only need to check for "reload" in case
             * we're in a rolling upgrade with a DC scheduling "reload" instead
             * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway,
             * so this would work for that purpose as well.
             */
            cmd->rc = PCMK_OCF_OK;
            cmd->op_status = PCMK_LRM_OP_DONE;
            report_remote_ra_result(cmd);
        }

        free_cmd(cmd);
    }

    return TRUE;
}
 862 
 863 static void
 864 remote_ra_data_init(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 865 {
 866     remote_ra_data_t *ra_data = NULL;
 867 
 868     if (lrm_state->remote_ra_data) {
 869         return;
 870     }
 871 
 872     ra_data = calloc(1, sizeof(remote_ra_data_t));
 873     ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
 874     lrm_state->remote_ra_data = ra_data;
 875 }
 876 
 877 void
 878 remote_ra_cleanup(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 879 {
 880     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 881 
 882     if (!ra_data) {
 883         return;
 884     }
 885 
 886     if (ra_data->cmds) {
 887         g_list_free_full(ra_data->cmds, free_cmd);
 888     }
 889 
 890     if (ra_data->recurring_cmds) {
 891         g_list_free_full(ra_data->recurring_cmds, free_cmd);
 892     }
 893     mainloop_destroy_trigger(ra_data->work);
 894     free(ra_data);
 895     lrm_state->remote_ra_data = NULL;
 896 }
 897 
 898 gboolean
 899 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
     /* [previous][next][first][last][top][bottom][index][help] */
 900 {
 901     if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
 902         return TRUE;
 903     }
 904     if (id && lrm_state_find(id) && !pcmk__str_eq(id, fsa_our_uname, pcmk__str_casei)) {
 905         return TRUE;
 906     }
 907 
 908     return FALSE;
 909 }
 910 
 911 lrmd_rsc_info_t *
 912 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
     /* [previous][next][first][last][top][bottom][index][help] */
 913 {
 914     lrmd_rsc_info_t *info = NULL;
 915 
 916     if ((lrm_state_find(rsc_id))) {
 917         info = calloc(1, sizeof(lrmd_rsc_info_t));
 918 
 919         info->id = strdup(rsc_id);
 920         info->type = strdup(REMOTE_LRMD_RA);
 921         info->standard = strdup(PCMK_RESOURCE_CLASS_OCF);
 922         info->provider = strdup("pacemaker");
 923     }
 924 
 925     return info;
 926 }
 927 
 928 static gboolean
 929 is_remote_ra_supported_action(const char *action)
     /* [previous][next][first][last][top][bottom][index][help] */
 930 {
 931     return pcmk__str_any_of(action,
 932                             CRMD_ACTION_START,
 933                             CRMD_ACTION_STOP,
 934                             CRMD_ACTION_STATUS,
 935                             CRMD_ACTION_MIGRATE,
 936                             CRMD_ACTION_MIGRATED,
 937                             CRMD_ACTION_RELOAD_AGENT,
 938                             CRMD_ACTION_RELOAD,
 939                             NULL);
 940 }
 941 
/*!
 * \internal
 * \brief Report failure for, and remove, all recurring monitors in a list
 *
 * \param[in,out] list  Command list to scan (queued or recurring commands)
 *
 * \return Updated head of \p list with all recurring monitors removed
 */
static GList *
fail_all_monitor_cmds(GList * list)
{
    GList *rm_list = NULL;
    remote_ra_cmd_t *cmd = NULL;
    GList *gIter = NULL;

    /* First pass: collect matching commands, so we don't mutate the list
     * while iterating over it */
    for (gIter = list; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms > 0) && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
            rm_list = g_list_append(rm_list, cmd);
        }
    }

    /* Second pass: report each as failed, then remove and free it */
    for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;

        cmd->rc = PCMK_OCF_UNKNOWN_ERROR;
        cmd->op_status = PCMK_LRM_OP_ERROR;
        crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
                  cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
        report_remote_ra_result(cmd);

        list = g_list_remove(list, cmd);
        free_cmd(cmd);
    }

    /* frees only the list data, not the cmds */
    g_list_free(rm_list);
    return list;
}
 973 
 974 static GList *
 975 remove_cmd(GList * list, const char *action, guint interval_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
 976 {
 977     remote_ra_cmd_t *cmd = NULL;
 978     GList *gIter = NULL;
 979 
 980     for (gIter = list; gIter != NULL; gIter = gIter->next) {
 981         cmd = gIter->data;
 982         if ((cmd->interval_ms == interval_ms)
 983             && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
 984             break;
 985         }
 986         cmd = NULL;
 987     }
 988     if (cmd) {
 989         list = g_list_remove(list, cmd);
 990         free_cmd(cmd);
 991     }
 992     return list;
 993 }
 994 
 995 int
 996 remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
 997                  const char *action, guint interval_ms)
 998 {
 999     lrm_state_t *connection_rsc = NULL;
1000     remote_ra_data_t *ra_data = NULL;
1001 
1002     connection_rsc = lrm_state_find(rsc_id);
1003     if (!connection_rsc || !connection_rsc->remote_ra_data) {
1004         return -EINVAL;
1005     }
1006 
1007     ra_data = connection_rsc->remote_ra_data;
1008     ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1009     ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1010                                          interval_ms);
1011     if (ra_data->cur_cmd &&
1012         (ra_data->cur_cmd->interval_ms == interval_ms) &&
1013         (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1014 
1015         ra_data->cur_cmd->cancel = TRUE;
1016     }
1017 
1018     return 0;
1019 }
1020 
/*!
 * \internal
 * \brief Merge a new recurring monitor request into an existing duplicate
 *
 * \param[in,out] ra_data      Remote RA bookkeeping to search
 * \param[in]     interval_ms  Monitor interval being requested
 * \param[in]     userdata     New transition key (may be NULL; replaces the
 *                             existing command's userdata when non-NULL)
 *
 * \return Existing duplicate monitor command if one was merged, else NULL
 */
static remote_ra_cmd_t *
handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
                   const char *userdata)
{
    GList *gIter = NULL;
    remote_ra_cmd_t *cmd = NULL;

    /* there are 3 places a potential duplicate monitor operation
     * could exist.
     * 1. recurring_cmds list. where the op is waiting for its next interval
     * 2. cmds list, where the op is queued to get executed immediately
     * 3. cur_cmd, which means the monitor op is in flight right now.
     */
    if (interval_ms == 0) {
        /* one-shot probes are never merged */
        return NULL;
    }

    if (ra_data->cur_cmd &&
        ra_data->cur_cmd->cancel == FALSE &&
        (ra_data->cur_cmd->interval_ms == interval_ms) &&
        pcmk__str_eq(ra_data->cur_cmd->action, "monitor", pcmk__str_casei)) {

        cmd = ra_data->cur_cmd;
        goto handle_dup;
    }

    for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, "monitor", pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    return NULL;

handle_dup:

    crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
              cmd->rsc_id, "monitor", interval_ms);

    /* update the userdata */
    if (userdata) {
       free(cmd->userdata);
       cmd->userdata = strdup(userdata);
    }

    /* if we've already reported success, generate a new call id */
    if (cmd->reported_success) {
        cmd->start_time = time(NULL);
        cmd->call_id = generate_callid();
        cmd->reported_success = 0;
    }

    /* if we have an interval_id set, that means we are in the process of
     * waiting for this cmd's next interval. instead of waiting, cancel
     * the timer and execute the action immediately */
    if (cmd->interval_id) {
        g_source_remove(cmd->interval_id);
        cmd->interval_id = 0;
        recurring_helper(cmd);
    }

    return cmd;
}
1094 
1095 int
1096 remote_ra_exec(lrm_state_t *lrm_state, const char *rsc_id, const char *action,
     /* [previous][next][first][last][top][bottom][index][help] */
1097                const char *userdata, guint interval_ms,
1098                int timeout,     /* ms */
1099                int start_delay, /* ms */
1100                lrmd_key_value_t * params)
1101 {
1102     int rc = 0;
1103     lrm_state_t *connection_rsc = NULL;
1104     remote_ra_cmd_t *cmd = NULL;
1105     remote_ra_data_t *ra_data = NULL;
1106 
1107     if (is_remote_ra_supported_action(action) == FALSE) {
1108         rc = -EINVAL;
1109         goto exec_done;
1110     }
1111 
1112     connection_rsc = lrm_state_find(rsc_id);
1113     if (!connection_rsc) {
1114         rc = -EINVAL;
1115         goto exec_done;
1116     }
1117 
1118     remote_ra_data_init(connection_rsc);
1119     ra_data = connection_rsc->remote_ra_data;
1120 
1121     cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1122     if (cmd) {
1123         rc = cmd->call_id;
1124         goto exec_done;
1125     }
1126 
1127     cmd = calloc(1, sizeof(remote_ra_cmd_t));
1128     cmd->owner = strdup(lrm_state->node_name);
1129     cmd->rsc_id = strdup(rsc_id);
1130     cmd->action = strdup(action);
1131     cmd->userdata = strdup(userdata);
1132     cmd->interval_ms = interval_ms;
1133     cmd->timeout = timeout;
1134     cmd->start_delay = start_delay;
1135     cmd->params = params;
1136     cmd->start_time = time(NULL);
1137 
1138     cmd->call_id = generate_callid();
1139 
1140     if (cmd->start_delay) {
1141         cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
1142     }
1143 
1144     ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1145     mainloop_set_trigger(ra_data->work);
1146 
1147     return cmd->call_id;
1148   exec_done:
1149 
1150     lrmd_key_value_freeall(params);
1151     return rc;
1152 }
1153 
1154 /*!
1155  * \internal
1156  * \brief Immediately fail all monitors of a remote node, if proxied here
1157  *
1158  * \param[in] node_name  Name of pacemaker_remote node
1159  */
1160 void
1161 remote_ra_fail(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
1162 {
1163     lrm_state_t *lrm_state = lrm_state_find(node_name);
1164 
1165     if (lrm_state && lrm_state_is_connected(lrm_state)) {
1166         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1167 
1168         crm_info("Failing monitors on pacemaker_remote node %s", node_name);
1169         ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1170         ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1171     }
1172 }
1173 
1174 /* A guest node fencing implied by host fencing looks like:
1175  *
1176  *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
1177  *                on_node="lxc1" on_node_uuid="lxc1">
1178  *     <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
1179  *                 CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
1180  *     <downed>
1181  *       <node id="lxc1"/>
1182  *     </downed>
1183  *  </pseudo_event>
1184  */
1185 #define XPATH_PSEUDO_FENCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
1186     "[@" XML_LRM_ATTR_TASK "='stonith']/" XML_GRAPH_TAG_DOWNED \
1187     "/" XML_CIB_TAG_NODE
1188 
1189 /*!
1190  * \internal
1191  * \brief Check a pseudo-action for Pacemaker Remote node side effects
1192  *
1193  * \param[in] xml  XML of pseudo-action to check
1194  */
1195 void
1196 remote_ra_process_pseudo(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
1197 {
1198     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
1199 
1200     if (numXpathResults(search) == 1) {
1201         xmlNode *result = getXpathResult(search, 0);
1202 
1203         /* Normally, we handle the necessary side effects of a guest node stop
1204          * action when reporting the remote agent's result. However, if the stop
1205          * is implied due to fencing, it will be a fencing pseudo-event, and
1206          * there won't be a result to report. Handle that case here.
1207          *
1208          * This will result in a duplicate call to remote_node_down() if the
1209          * guest stop was real instead of implied, but that shouldn't hurt.
1210          *
1211          * There is still one corner case that isn't handled: if a guest node
1212          * isn't running any resources when its host is fenced, it will appear
1213          * to be cleanly stopped, so there will be no pseudo-fence, and our
1214          * peer cache state will be incorrect unless and until the guest is
1215          * recovered.
1216          */
1217         if (result) {
1218             const char *remote = ID(result);
1219 
1220             if (remote) {
1221                 remote_node_down(remote, DOWN_ERASE_LRM);
1222             }
1223         }
1224     }
1225     freeXpathObject(search);
1226 }
1227 
/*!
 * \internal
 * \brief Record a remote node's maintenance-mode state in the CIB
 *
 * Sends an asynchronous node-state update setting the node's maintenance
 * attribute, and caches the new state locally if the update was submitted.
 *
 * \param[in] lrm_state    State object for the remote connection resource
 * \param[in] maintenance  TRUE to enter maintenance mode, FALSE to leave it
 */
static void
remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    xmlNode *update, *state;
    int call_opt, call_id = 0;
    crm_node_t *node;

    call_opt = crmd_cib_smart_opt();
    node = crm_remote_peer_get(lrm_state->node_name);
    CRM_CHECK(node != NULL, return);
    update = create_xml_node(NULL, XML_CIB_TAG_STATUS);
    state = create_node_state_update(node, node_update_none, update,
                                     __func__);
    crm_xml_add(state, XML_NODE_IS_MAINTENANCE, maintenance?"1":"0");
    /* NOTE(review): fsa_cib_update appears to be a macro that assigns the
     * CIB call result into call_id — the check below relies on that */
    fsa_cib_update(XML_CIB_TAG_STATUS, update, call_opt, call_id, NULL);
    if (call_id < 0) {
        crm_perror(LOG_WARNING, "%s CIB node state update failed", lrm_state->node_name);
    } else {
        /* TODO: still not 100% sure that async update will succeed ... */
        ra_data->is_maintenance = maintenance;
    }
    free_xml(update);
}
1252 
1253 #define XPATH_PSEUDO_MAINTENANCE "//" XML_GRAPH_TAG_PSEUDO_EVENT \
1254     "[@" XML_LRM_ATTR_TASK "='" CRM_OP_MAINTENANCE_NODES "']/" \
1255     XML_GRAPH_TAG_MAINTENANCE
1256 
1257 /*!
1258  * \internal
1259  * \brief Check a pseudo-action holding updates for maintenance state
1260  *
1261  * \param[in] xml  XML of pseudo-action to check
1262  */
1263 
void
remote_ra_process_maintenance_nodes(xmlNode *xml)
{
    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);

    if (numXpathResults(search) == 1) {
        xmlNode *node;
        int cnt = 0, cnt_remote = 0;

        /* Walk the node entries listed under the maintenance element.
         * NOTE(review): pcmk__xml_next() advances to the next sibling element
         * of any name, so this assumes the element contains only <node>
         * children — confirm against the graph XML producer */
        for (node =
                first_named_child(getXpathResult(search, 0), XML_CIB_TAG_NODE);
            node != NULL; node = pcmk__xml_next(node)) {
            lrm_state_t *lrm_state = lrm_state_find(ID(node));

            cnt++;
            /* Only act on remote nodes whose connection we manage and which
             * are currently active */
            if (lrm_state && lrm_state->remote_ra_data &&
                ((remote_ra_data_t *) lrm_state->remote_ra_data)->active) {
                int is_maint;

                cnt_remote++;
                pcmk__scan_min_int(crm_element_value(node, XML_NODE_IS_MAINTENANCE),
                                   &is_maint, 0);
                remote_ra_maintenance(lrm_state, is_maint);
            }
        }
        crm_trace("Action holds %d nodes (%d remotes found) "
                    "adjusting maintenance-mode", cnt, cnt_remote);
    }
    freeXpathObject(search);
}
1294 
1295 gboolean
1296 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1297 {
1298     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1299 
1300     return ra_data->is_maintenance;
1301 }
1302 
1303 gboolean
1304 remote_ra_controlling_guest(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1305 {
1306     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1307 
1308     return ra_data->controlling_guest;
1309 }

/* [previous][next][first][last][top][bottom][index][help] */