root/daemons/controld/controld_remote_ra.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. free_cmd
  2. generate_callid
  3. recurring_helper
  4. start_delay_helper
  5. should_purge_attributes
  6. section_to_delete
  7. purge_remote_node_attrs
  8. remote_node_up
  9. remote_node_down
  10. check_remote_node_state
  11. report_remote_ra_result
  12. remaining_timeout_sec
  13. retry_start_cmd_cb
  14. connection_takeover_timeout_cb
  15. monitor_timeout_cb
  16. synthesize_lrmd_success
  17. remote_lrm_op_callback
  18. handle_remote_ra_stop
  19. handle_remote_ra_start
  20. handle_remote_ra_exec
  21. remote_ra_data_init
  22. remote_ra_cleanup
  23. is_remote_lrmd_ra
  24. remote_ra_get_rsc_info
  25. is_remote_ra_supported_action
  26. fail_all_monitor_cmds
  27. remove_cmd
  28. remote_ra_cancel
  29. handle_dup_monitor
  30. controld_execute_remote_agent
  31. remote_ra_fail
  32. remote_ra_process_pseudo
  33. remote_ra_maintenance
  34. remote_ra_process_maintenance_nodes
  35. remote_ra_is_in_maintenance
  36. remote_ra_controlling_guest

   1 /*
   2  * Copyright 2013-2025 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
#include <crm_internal.h>

#include <limits.h>                     // INT_MAX

#include <crm/crm.h>
#include <crm/common/xml.h>
#include <crm/common/xml_internal.h>
#include <crm/lrmd.h>
#include <crm/lrmd_internal.h>
#include <crm/services.h>

#include <libxml/xpath.h>               // xmlXPathObject, etc.

#include <pacemaker-controld.h>
  22 
  23 #define REMOTE_LRMD_RA "remote"
  24 
  25 /* The max start timeout before cmd retry */
  26 #define MAX_START_TIMEOUT_MS 10000
  27 
/* Set the given status flags on a remote command, logging the change */
#define cmd_set_flags(cmd, flags_to_set) do { \
    (cmd)->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \
                                       "Remote command", (cmd)->rsc_id, (cmd)->status, \
                                       (flags_to_set), #flags_to_set); \
        } while (0)

/* Clear the given status flags on a remote command, logging the change */
#define cmd_clear_flags(cmd, flags_to_clear) do { \
    (cmd)->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \
                                         "Remote command", (cmd)->rsc_id, (cmd)->status, \
                                         (flags_to_clear), #flags_to_clear); \
        } while (0)

// Flags for remote_ra_cmd_t:status
enum remote_cmd_status {
    // Command's success has already been reported to the client
    cmd_reported_success    = (1 << 0),
    // Command has been cancelled (recurring monitors won't be rescheduled)
    cmd_cancel              = (1 << 1),
};
  44 
// An action to be executed against a Pacemaker Remote connection resource
typedef struct remote_ra_cmd_s {
    /*! the local node the cmd is issued from */
    char *owner;
    /*! the remote node the cmd is executed on */
    char *rsc_id;
    /*! the action to execute */
    char *action;
    /*! some string the client wants us to give it back */
    char *userdata;
    /*! start delay in ms */
    int start_delay;
    /*! timer id used for start delay. */
    int delay_id;
    /*! timeout in ms for cmd */
    int timeout;
    /*! recurring interval in ms */
    guint interval_ms;
    /*! interval timer id */
    int interval_id;
    /*! timer id for timing out a monitor's poke response */
    int monitor_timeout_id;
    /*! timer id for timing out an expected connection takeover */
    int takeover_timeout_id;
    /*! action parameters */
    lrmd_key_value_t *params;
    /*! result of executing the action */
    pcmk__action_result_t result;
    /*! call ID assigned to this action */
    int call_id;
    /*! when the action started executing (epoch seconds) */
    time_t start_time;
    /*! group of enum remote_cmd_status flags */
    uint32_t status;
} remote_ra_cmd_t;
  73 
/* Set the given status flags on a remote connection, logging the change */
#define lrm_remote_set_flags(lrm_state, flags_to_set) do { \
    lrm_state_t *lrm = (lrm_state); \
    remote_ra_data_t *ra = lrm->remote_ra_data; \
    ra->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
                                    lrm->node_name, ra->status, \
                                    (flags_to_set), #flags_to_set); \
        } while (0)

/* Clear the given status flags on a remote connection, logging the change */
#define lrm_remote_clear_flags(lrm_state, flags_to_clear) do { \
    lrm_state_t *lrm = (lrm_state); \
    remote_ra_data_t *ra = lrm->remote_ra_data; \
    ra->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
                                      lrm->node_name, ra->status, \
                                      (flags_to_clear), #flags_to_clear); \
        } while (0)

// Flags for remote_ra_data_t:status
enum remote_status {
    // Another node is expected to take over this connection
    expect_takeover     = (1 << 0),
    // Another node has taken over this connection
    takeover_complete   = (1 << 1),
    // Connection to the remote executor is established
    remote_active       = (1 << 2),
    /* Maintenance mode is difficult to determine from the controller's context,
     * so we have it signalled back with the transition from the scheduler.
     */
    remote_in_maint     = (1 << 3),
    /* Similar for whether we are controlling a guest node or remote node.
     * Fortunately there is a meta-attribute in the transition already and
     * as the situation doesn't change over time we can use the
     * resource start for noting down the information for later use when
     * the attributes aren't at hand.
     */
    controlling_guest   = (1 << 4),
};
 106 
// Per-connection bookkeeping for a Pacemaker Remote connection resource
typedef struct remote_ra_data_s {
    /*! mainloop trigger that processes queued commands */
    crm_trigger_t *work;
    /*! command currently being executed, if any */
    remote_ra_cmd_t *cur_cmd;
    /*! commands queued for execution */
    GList *cmds;
    /*! recurring monitors waiting for their next interval to elapse */
    GList *recurring_cmds;
    /*! group of enum remote_status flags */
    uint32_t status;
} remote_ra_data_t;

static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
static GList *fail_all_monitor_cmds(GList * list);
 118 
 119 static void
 120 free_cmd(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 121 {
 122     remote_ra_cmd_t *cmd = user_data;
 123 
 124     if (!cmd) {
 125         return;
 126     }
 127     if (cmd->delay_id) {
 128         g_source_remove(cmd->delay_id);
 129     }
 130     if (cmd->interval_id) {
 131         g_source_remove(cmd->interval_id);
 132     }
 133     if (cmd->monitor_timeout_id) {
 134         g_source_remove(cmd->monitor_timeout_id);
 135     }
 136     if (cmd->takeover_timeout_id) {
 137         g_source_remove(cmd->takeover_timeout_id);
 138     }
 139     free(cmd->owner);
 140     free(cmd->rsc_id);
 141     free(cmd->action);
 142     free(cmd->userdata);
 143     pcmk__reset_result(&(cmd->result));
 144     lrmd_key_value_freeall(cmd->params);
 145     free(cmd);
 146 }
 147 
 148 static int
 149 generate_callid(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 150 {
 151     static int remote_ra_callid = 0;
 152 
 153     remote_ra_callid++;
 154     if (remote_ra_callid <= 0) {
 155         remote_ra_callid = 1;
 156     }
 157 
 158     return remote_ra_callid;
 159 }
 160 
 161 static gboolean
 162 recurring_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 163 {
 164     remote_ra_cmd_t *cmd = data;
 165     lrm_state_t *connection_rsc = NULL;
 166 
 167     cmd->interval_id = 0;
 168     connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
 169     if (connection_rsc && connection_rsc->remote_ra_data) {
 170         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 171 
 172         ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
 173 
 174         ra_data->cmds = g_list_append(ra_data->cmds, cmd);
 175         mainloop_set_trigger(ra_data->work);
 176     }
 177     return FALSE;
 178 }
 179 
 180 static gboolean
 181 start_delay_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 182 {
 183     remote_ra_cmd_t *cmd = data;
 184     lrm_state_t *connection_rsc = NULL;
 185 
 186     cmd->delay_id = 0;
 187     connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
 188     if (connection_rsc && connection_rsc->remote_ra_data) {
 189         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 190 
 191         mainloop_set_trigger(ra_data->work);
 192     }
 193     return FALSE;
 194 }
 195 
 196 static bool
 197 should_purge_attributes(pcmk__node_status_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 198 {
 199     pcmk__node_status_t *conn_node = NULL;
 200     lrm_state_t *connection_rsc = NULL;
 201 
 202     if ((node->conn_host == NULL) || (node->name == NULL)) {
 203         return true;
 204     }
 205 
 206     /* Get the node that was hosting the remote connection resource from the
 207      * peer cache.  That's the one we really care about here.
 208      */
 209     conn_node = pcmk__get_node(0, node->conn_host, NULL,
 210                                pcmk__node_search_cluster_member);
 211     if (conn_node == NULL) {
 212         return true;
 213     }
 214 
 215     /* Check the uptime of connection_rsc.  If it hasn't been running long
 216      * enough, set purge=true.  "Long enough" means it started running earlier
 217      * than the timestamp when we noticed it went away in the first place.
 218      */
 219     connection_rsc = controld_get_executor_state(node->name, false);
 220 
 221     if (connection_rsc != NULL) {
 222         lrmd_t *lrm = connection_rsc->conn;
 223         time_t uptime = lrmd__uptime(lrm);
 224         time_t now = time(NULL);
 225 
 226         /* Add 20s of fuzziness to give corosync a while to notice the remote
 227          * host is gone.  On various error conditions (failure to get uptime,
 228          * peer_lost isn't set) we default to purging.
 229          */
 230         if (uptime > 0 &&
 231             conn_node->peer_lost > 0 &&
 232             uptime + 20 >= now - conn_node->peer_lost) {
 233             return false;
 234         }
 235     }
 236 
 237     return true;
 238 }
 239 
 240 static enum controld_section_e
 241 section_to_delete(bool purge)
     /* [previous][next][first][last][top][bottom][index][help] */
 242 {
 243     if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
 244         if (purge) {
 245             return controld_section_all_unlocked;
 246         } else {
 247             return controld_section_lrm_unlocked;
 248         }
 249     } else {
 250         if (purge) {
 251             return controld_section_all;
 252         } else {
 253             return controld_section_lrm;
 254         }
 255     }
 256 }
 257 
 258 static void
 259 purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 260 {
 261     bool purge = should_purge_attributes(node);
 262     enum controld_section_e section = section_to_delete(purge);
 263 
 264     /* Purge node from attrd's memory */
 265     if (purge) {
 266         update_attrd_remote_node_removed(node->name, NULL);
 267     }
 268 
 269     controld_delete_node_state(node->name, section, call_opt);
 270 }
 271 
/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node joining
 *
 * Purges stale node attributes/state, marks the node as a member in the peer
 * cache, applies any configured start state, and notifies the DC and CIB.
 *
 * \param[in] node_name  Name of newly integrated pacemaker_remote node
 */
static void
remote_node_up(const char *node_name)
{
    int call_opt;
    xmlNode *update, *state;
    pcmk__node_status_t *node = NULL;
    lrm_state_t *connection_rsc = NULL;

    CRM_CHECK(node_name != NULL, return);
    crm_info("Announcing Pacemaker Remote node %s", node_name);

    call_opt = crmd_cib_smart_opt();

    /* Delete node's CRM_OP_PROBED attribute. Deleting any attribute ensures
     * that the attribute manager learns the node is remote. Deletion of this
     * specfic attribute is a holdover from when it had special meaning.
     *
     * @COMPAT Find another way to tell attrd that the node is remote, without
     * risking deletion or overwrite of an arbitrary attribute. Then work on
     * deprecating CRM_OP_PROBED.
     */
    update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);

    /* Ensure node is in the remote peer cache with member status */
    node = pcmk__cluster_lookup_remote_node(node_name);
    CRM_CHECK((node != NULL) && (node->name != NULL), return);

    purge_remote_node_attrs(call_opt, node);
    pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);

    /* Apply any start state that we were given from the environment on the
     * remote node.
     */
    connection_rsc = controld_get_executor_state(node->name, false);

    if (connection_rsc != NULL) {
        lrmd_t *lrm = connection_rsc->conn;
        const char *start_state = lrmd__node_start_state(lrm);

        if (start_state) {
            set_join_state(start_state, node->name, node->xml_id, true);
        }
    }

    /* pacemaker_remote nodes don't participate in the membership layer,
     * so cluster nodes don't automatically get notified when they come and go.
     * We send a cluster message to the DC, and update the CIB node state entry,
     * so the DC will get it sooner (via message) or later (via CIB refresh),
     * and any other interested parties can query the CIB.
     */
    broadcast_remote_state_message(node_name, true);

    update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
    state = create_node_state_update(node, controld_node_update_cluster, update,
                                     __func__);

    /* Clear the PCMK__XA_NODE_FENCED flag in the node state. If the node ever
     * needs to be fenced, this flag will allow various actions to determine
     * whether the fencing has happened yet.
     */
    crm_xml_add(state, PCMK__XA_NODE_FENCED, "0");

    /* TODO: If the remote connection drops, and this (async) CIB update either
     * failed or has not yet completed, later actions could mistakenly think the
     * node has already been fenced (if the PCMK__XA_NODE_FENCED attribute was
     * previously set, because it won't have been cleared). This could prevent
     * actual fencing or allow recurring monitor failures to be cleared too
     * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
     */
    controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
    pcmk__xml_free(update);
}
 350 
// What to do with a lost node's executor (resource) history
enum down_opts {
    DOWN_KEEP_LRM,      // Keep resource history (normal case)
    DOWN_ERASE_LRM      // Erase resource history (after successful fencing)
};

/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node leaving
 *
 * \param[in] node_name  Name of lost node
 * \param[in] opts       Whether to keep or erase LRM history
 */
static void
remote_node_down(const char *node_name, const enum down_opts opts)
{
    xmlNode *update;
    int call_opt = crmd_cib_smart_opt();
    pcmk__node_status_t *node = NULL;

    /* Purge node from attrd's memory */
    update_attrd_remote_node_removed(node_name, NULL);

    /* Normally, only node attributes should be erased, and the resource history
     * should be kept until the node comes back up. However, after a successful
     * fence, we want to clear the history as well, so we don't think resources
     * are still running on the node.
     */
    if (opts == DOWN_ERASE_LRM) {
        controld_delete_node_state(node_name, controld_section_all, call_opt);
    } else {
        controld_delete_node_state(node_name, controld_section_attrs, call_opt);
    }

    /* Ensure node is in the remote peer cache with lost state */
    node = pcmk__cluster_lookup_remote_node(node_name);
    CRM_CHECK(node != NULL, return);
    pcmk__update_peer_state(__func__, node, PCMK__VALUE_LOST, 0);

    /* Notify DC */
    broadcast_remote_state_message(node_name, false);

    /* Update CIB node state */
    update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
    create_node_state_update(node, controld_node_update_cluster, update,
                             __func__);
    controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
    pcmk__xml_free(update);
}
 399 
/*!
 * \internal
 * \brief Handle effects of a remote RA command on node state
 *
 * Successful starts mark the remote node up, successful stops mark it down
 * (unless its connection was taken over), and successful migrations add it to
 * the local peer cache.
 *
 * \param[in] cmd  Completed remote RA command
 */
static void
check_remote_node_state(const remote_ra_cmd_t *cmd)
{
    /* Only successful actions can change node state */
    if (!pcmk__result_ok(&(cmd->result))) {
        return;
    }

    if (pcmk__str_eq(cmd->action, PCMK_ACTION_START, pcmk__str_casei)) {
        remote_node_up(cmd->rsc_id);

    } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_MIGRATE_FROM,
                            pcmk__str_casei)) {
        /* After a successful migration, we don't need to do remote_node_up()
         * because the DC already knows the node is up, and we don't want to
         * clear LRM history etc. We do need to add the remote node to this
         * host's remote peer cache, because (unless it happens to be DC)
         * it hasn't been tracking the remote node, and other code relies on
         * the cache to distinguish remote nodes from unseen cluster nodes.
         */
        pcmk__node_status_t *node =
            pcmk__cluster_lookup_remote_node(cmd->rsc_id);

        CRM_CHECK(node != NULL, return);
        pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);

    } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_casei)) {
        lrm_state_t *lrm_state = controld_get_executor_state(cmd->rsc_id,
                                                             false);
        remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;

        if (ra_data) {
            if (!pcmk_is_set(ra_data->status, takeover_complete)) {
                /* Stop means down if we didn't successfully migrate elsewhere */
                remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
            } else if (AM_I_DC == FALSE) {
                /* Only the connection host and DC track node state,
                 * so if the connection migrated elsewhere and we aren't DC,
                 * un-cache the node, so we don't have stale info
                 */
                pcmk__cluster_forget_remote_node(cmd->rsc_id);
            }
        }
    }

    /* We don't do anything for successful monitors, which is correct for
     * routine recurring monitors, and for monitors on nodes where the
     * connection isn't supposed to be (the cluster will stop the connection in
     * that case). However, if the initial probe finds the connection already
     * active on the node where we want it, we probably should do
     * remote_node_up(). Unfortunately, we can't distinguish that case here.
     * Given that connections have to be initiated by the cluster, the chance of
     * that should be close to zero.
     */
}
 461 
/*!
 * \internal
 * \brief Report a remote RA command's result
 *
 * Updates remote node state as appropriate for the result, then builds an
 * executor event from the command and hands it to lrm_op_callback().
 *
 * \param[in,out] cmd  Completed remote RA command
 */
static void
report_remote_ra_result(remote_ra_cmd_t * cmd)
{
    lrmd_event_data_t op = { 0, };

    check_remote_node_state(cmd);

    op.type = lrmd_event_exec_complete;
    op.rsc_id = cmd->rsc_id;
    op.op_type = cmd->action;
    op.user_data = cmd->userdata;
    op.timeout = cmd->timeout;
    op.interval_ms = cmd->interval_ms;
    op.t_run = cmd->start_time;
    op.t_rcchange = cmd->start_time;

    lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
                     cmd->result.exit_reason);

    // A previously successful recurring monitor has now failed
    if (pcmk_is_set(cmd->status, cmd_reported_success) && !pcmk__result_ok(&(cmd->result))) {
        op.t_rcchange = time(NULL);
        /* This edge case will likely never ever occur, but if it does the
         * result is that a failure will not be processed correctly. This is only
         * remotely possible because we are able to detect a connection resource's tcp
         * connection has failed at any moment after start has completed. The actual
         * recurring operation is just a connectivity ping.
         *
         * basically, we are not guaranteed that the first successful monitor op and
         * a subsequent failed monitor op will not occur in the same timestamp. We have to
         * make it look like the operations occurred at separate times though. */
        if (op.t_rcchange == op.t_run) {
            op.t_rcchange++;
        }
    }

    // Copy the command's parameters into the event
    if (cmd->params) {
        lrmd_key_value_t *tmp;

        op.params = pcmk__strkey_table(free, free);
        for (tmp = cmd->params; tmp; tmp = tmp->next) {
            pcmk__insert_dup(op.params, tmp->key, tmp->value);
        }

    }
    op.call_id = cmd->call_id;
    op.remote_nodename = cmd->owner;

    lrm_op_callback(&op);

    if (op.params) {
        g_hash_table_destroy(op.params);
    }
    lrmd__reset_result(&op);
}
 516 
 517 /*!
 518  * \internal
 519  * \brief Return a remote command's remaining timeout in seconds
 520  *
 521  * \param[in] cmd  Remote command to check
 522  *
 523  * \return Command's remaining timeout in seconds
 524  */
 525 static int
 526 remaining_timeout_sec(const remote_ra_cmd_t *cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 527 {
 528     return pcmk__timeout_ms2s(cmd->timeout) - (time(NULL) - cmd->start_time);
 529 }
 530 
/*!
 * \internal
 * \brief Timer callback to retry a remote connection start/migrate action
 *
 * If enough of the command's timeout remains, re-attempt the connection;
 * otherwise fail the command with a timeout result.
 *
 * \param[in,out] data  LRM state for the remote connection (lrm_state_t *)
 *
 * \return FALSE (never reschedule this timer)
 */
static gboolean
retry_start_cmd_cb(gpointer data)
{
    lrm_state_t *lrm_state = data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd = NULL;
    int rc = ETIME;
    int remaining = 0;

    // Nothing to do if the pending command went away or isn't an "up" action
    if (!ra_data || !ra_data->cur_cmd) {
        return FALSE;
    }
    cmd = ra_data->cur_cmd;
    if (!pcmk__is_up_action(cmd->action)) {
        return FALSE;
    }

    remaining = remaining_timeout_sec(cmd);
    if (remaining > 0) {
        rc = handle_remote_ra_start(lrm_state, cmd, remaining * 1000);
    } else {
        pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                         PCMK_EXEC_TIMEOUT,
                         "Not enough time remains to retry remote connection");
    }

    if (rc != pcmk_rc_ok) {
        // Retry could not be initiated: report failure and run the next command
        report_remote_ra_result(cmd);

        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        ra_data->cur_cmd = NULL;
        free_cmd(cmd);
    } else {
        /* wait for connection event */
    }

    return FALSE;
}
 571 
 572 
 573 static gboolean
 574 connection_takeover_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 575 {
 576     lrm_state_t *lrm_state = NULL;
 577     remote_ra_cmd_t *cmd = data;
 578 
 579     crm_info("takeover event timed out for node %s", cmd->rsc_id);
 580     cmd->takeover_timeout_id = 0;
 581 
 582     lrm_state = controld_get_executor_state(cmd->rsc_id, false);
 583 
 584     handle_remote_ra_stop(lrm_state, cmd);
 585     free_cmd(cmd);
 586 
 587     return FALSE;
 588 }
 589 
/*!
 * \internal
 * \brief Timer callback for a remote poke (monitor) that got no response
 *
 * Fails the pending monitor with a timeout result, wakes the work queue so
 * the next command can run, reports the result, and drops the connection.
 *
 * \param[in,out] data  Monitor command that timed out (remote_ra_cmd_t *)
 *
 * \return FALSE (never reschedule this timer)
 */
static gboolean
monitor_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    lrm_state = controld_get_executor_state(cmd->rsc_id, false);

    crm_info("Timed out waiting for remote poke response from %s%s",
             cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
    cmd->monitor_timeout_id = 0;
    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
                     "Remote executor did not respond");

    if (lrm_state && lrm_state->remote_ra_data) {
        remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

        // This command is done executing, so let the next queued command run
        if (ra_data->cur_cmd == cmd) {
            ra_data->cur_cmd = NULL;
        }
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
    }

    report_remote_ra_result(cmd);
    free_cmd(cmd);

    if(lrm_state) {
        // @TODO Should we move this before reporting the result above?
        lrm_state_disconnect(lrm_state);
    }
    return FALSE;
}
 624 
/*!
 * \internal
 * \brief Synthesize and process a successful executor event for an action
 *
 * \param[in,out] lrm_state  Executor state to process the event with
 *                           (NULL to use the local executor state)
 * \param[in]     rsc_id     ID of resource the event is for
 * \param[in]     op_type    Name of action the event is for
 */
static void
synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
{
    lrmd_event_data_t op = { 0, };

    if (lrm_state == NULL) {
        /* if lrm_state not given assume local */
        lrm_state = controld_get_executor_state(NULL, false);
    }
    pcmk__assert(lrm_state != NULL);

    op.type = lrmd_event_exec_complete;
    op.rsc_id = rsc_id;
    op.op_type = op_type;
    op.t_run = time(NULL);
    op.t_rcchange = op.t_run;
    op.call_id = generate_callid();
    lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
    process_lrm_event(lrm_state, &op, NULL, NULL);
}
 645 
 646 void
 647 remote_lrm_op_callback(lrmd_event_data_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
 648 {
 649     gboolean cmd_handled = FALSE;
 650     lrm_state_t *lrm_state = NULL;
 651     remote_ra_data_t *ra_data = NULL;
 652     remote_ra_cmd_t *cmd = NULL;
 653 
 654     CRM_CHECK((op != NULL) && (op->remote_nodename != NULL), return);
 655 
 656     crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
 657               "(%d) status=%s (%d)",
 658               (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
 659               lrmd_event_type2str(op->type), op->remote_nodename,
 660               crm_exit_str((crm_exit_t) op->rc), op->rc,
 661               pcmk_exec_status_str(op->op_status), op->op_status);
 662 
 663     lrm_state = controld_get_executor_state(op->remote_nodename, false);
 664     if (!lrm_state || !lrm_state->remote_ra_data) {
 665         crm_debug("No state information found for remote connection event");
 666         return;
 667     }
 668     ra_data = lrm_state->remote_ra_data;
 669 
 670     if (op->type == lrmd_event_new_client) {
 671         // Another client has connected to the remote daemon
 672 
 673         if (pcmk_is_set(ra_data->status, expect_takeover)) {
 674             // Great, we knew this was coming
 675             lrm_remote_clear_flags(lrm_state, expect_takeover);
 676             lrm_remote_set_flags(lrm_state, takeover_complete);
 677 
 678         } else {
 679             crm_err("Disconnecting from Pacemaker Remote node %s due to "
 680                     "unexpected client takeover", op->remote_nodename);
 681             /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
 682             /* Do not free lrm_state->conn yet. */
 683             /* It'll be freed in the following stop action. */
 684             lrm_state_disconnect_only(lrm_state);
 685         }
 686         return;
 687     }
 688 
 689     /* filter all EXEC events up */
 690     if (op->type == lrmd_event_exec_complete) {
 691         if (pcmk_is_set(ra_data->status, takeover_complete)) {
 692             crm_debug("ignoring event, this connection is taken over by another node");
 693         } else {
 694             lrm_op_callback(op);
 695         }
 696         return;
 697     }
 698 
 699     if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {
 700 
 701         if (!pcmk_is_set(ra_data->status, remote_active)) {
 702             crm_debug("Disconnection from Pacemaker Remote node %s complete",
 703                       lrm_state->node_name);
 704 
 705         } else if (!remote_ra_is_in_maintenance(lrm_state)) {
 706             crm_err("Lost connection to Pacemaker Remote node %s",
 707                     lrm_state->node_name);
 708             ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
 709             ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
 710 
 711         } else {
 712             crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
 713                        lrm_state->node_name);
 714             /* Do roughly what a 'stop' on the remote-resource would do */
 715             handle_remote_ra_stop(lrm_state, NULL);
 716             remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
 717             /* now fake the reply of a successful 'stop' */
 718             synthesize_lrmd_success(NULL, lrm_state->node_name,
 719                                     PCMK_ACTION_STOP);
 720         }
 721         return;
 722     }
 723 
 724     if (!ra_data->cur_cmd) {
 725         crm_debug("no event to match");
 726         return;
 727     }
 728 
 729     cmd = ra_data->cur_cmd;
 730 
 731     /* Start actions and migrate from actions complete after connection
 732      * comes back to us. */
 733     if ((op->type == lrmd_event_connect) && pcmk__is_up_action(cmd->action)) {
 734         if (op->connection_rc < 0) {
 735             int remaining = remaining_timeout_sec(cmd);
 736 
 737             if ((op->connection_rc == -ENOKEY)
 738                 || (op->connection_rc == -EKEYREJECTED)) {
 739                 // Hard error, don't retry
 740                 pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
 741                                  PCMK_EXEC_ERROR,
 742                                  pcmk_strerror(op->connection_rc));
 743 
 744             } else if (remaining > 3) {
 745                 crm_trace("Rescheduling start (%ds remains before timeout)",
 746                           remaining);
 747                 pcmk__create_timer(1000, retry_start_cmd_cb, lrm_state);
 748                 return;
 749 
 750             } else {
 751                 crm_trace("Not enough time before timeout (%ds) "
 752                           "to reschedule start", remaining);
 753                 pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 754                                     PCMK_EXEC_TIMEOUT,
 755                                     "%s without enough time to retry",
 756                                     pcmk_strerror(op->connection_rc));
 757             }
 758 
 759         } else {
 760             lrm_state_reset_tables(lrm_state, TRUE);
 761             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 762             lrm_remote_set_flags(lrm_state, remote_active);
 763         }
 764 
 765         crm_debug("Remote connection event matched %s action", cmd->action);
 766         report_remote_ra_result(cmd);
 767         cmd_handled = TRUE;
 768 
 769     } else if ((op->type == lrmd_event_poke)
 770                && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
 771                                pcmk__str_casei)) {
 772 
 773         if (cmd->monitor_timeout_id) {
 774             g_source_remove(cmd->monitor_timeout_id);
 775             cmd->monitor_timeout_id = 0;
 776         }
 777 
 778         /* Only report success the first time, after that only worry about failures.
 779          * For this function, if we get the poke pack, it is always a success. Pokes
 780          * only fail if the send fails, or the response times out. */
 781         if (!pcmk_is_set(cmd->status, cmd_reported_success)) {
 782             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 783             report_remote_ra_result(cmd);
 784             cmd_set_flags(cmd, cmd_reported_success);
 785         }
 786 
 787         crm_debug("Remote poke event matched %s action", cmd->action);
 788 
 789         /* success, keep rescheduling if interval is present. */
 790         if (cmd->interval_ms && !pcmk_is_set(cmd->status, cmd_cancel)) {
 791             ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
 792             cmd->interval_id = pcmk__create_timer(cmd->interval_ms,
 793                                                   recurring_helper, cmd);
 794             cmd = NULL;         /* prevent free */
 795         }
 796         cmd_handled = TRUE;
 797 
 798     } else if ((op->type == lrmd_event_disconnect)
 799                && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
 800                                pcmk__str_casei)) {
 801         if (pcmk_is_set(ra_data->status, remote_active) &&
 802             !pcmk_is_set(cmd->status, cmd_cancel)) {
 803             pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 804                              PCMK_EXEC_ERROR,
 805                              "Remote connection unexpectedly dropped "
 806                              "during monitor");
 807             report_remote_ra_result(cmd);
 808             crm_err("Remote connection to %s unexpectedly dropped during monitor",
 809                     lrm_state->node_name);
 810         }
 811         cmd_handled = TRUE;
 812 
 813     } else {
 814         crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
 815     }
 816 
 817     if (cmd_handled) {
 818         ra_data->cur_cmd = NULL;
 819         if (ra_data->cmds) {
 820             mainloop_set_trigger(ra_data->work);
 821         }
 822         free_cmd(cmd);
 823     }
 824 }
 825 
/*!
 * \internal
 * \brief Handle a stop command for a remote connection resource
 *
 * Tear down connection state: drop or trim the node's operation history
 * (depending on whether a takeover completed), mark the connection inactive,
 * disconnect from the remote executor, and discard all queued commands.
 * If a command object is given, report a successful stop result for it.
 *
 * \param[in,out] lrm_state  Executor state for the connection being stopped
 * \param[in,out] cmd        Stop command to report a result for (may be NULL,
 *                           e.g. when called from the takeover timeout path)
 */
static void
handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
{
    remote_ra_data_t *ra_data = NULL;

    pcmk__assert(lrm_state != NULL);
    ra_data = lrm_state->remote_ra_data;

    if (!pcmk_is_set(ra_data->status, takeover_complete)) {
        /* delete pending ops whenever the remote connection is intentionally
         * stopped (no takeover occurred, so we still own the history)
         */
        g_hash_table_remove_all(lrm_state->active_ops);
    } else {
        /* we no longer hold the history if this connection has been migrated,
         * however, we keep metadata cache for future use */
        lrm_state_reset_tables(lrm_state, FALSE);
    }

    // Mark inactive before disconnecting so event handlers see the new state
    lrm_remote_clear_flags(lrm_state, remote_active);
    lrm_state_disconnect(lrm_state);

    // Discard anything still queued or recurring for this connection
    if (ra_data->cmds) {
        g_list_free_full(ra_data->cmds, free_cmd);
    }
    if (ra_data->recurring_cmds) {
        g_list_free_full(ra_data->recurring_cmds, free_cmd);
    }
    ra_data->cmds = NULL;
    ra_data->recurring_cmds = NULL;
    ra_data->cur_cmd = NULL;

    if (cmd) {
        // Stop of an inactive/active connection is always reported as success
        pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
        report_remote_ra_result(cmd);
    }
}
 861 
 862 // \return Standard Pacemaker return code
 863 static int
 864 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
 865 {
 866     const char *server = NULL;
 867     lrmd_key_value_t *tmp = NULL;
 868     int port = 0;
 869     int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
 870     int rc = pcmk_rc_ok;
 871 
 872     for (tmp = cmd->params; tmp; tmp = tmp->next) {
 873         if (pcmk__strcase_any_of(tmp->key,
 874                                  PCMK_REMOTE_RA_ADDR, PCMK_REMOTE_RA_SERVER,
 875                                  NULL)) {
 876             server = tmp->value;
 877 
 878         } else if (pcmk__str_eq(tmp->key, PCMK_REMOTE_RA_PORT,
 879                                 pcmk__str_none)) {
 880             port = atoi(tmp->value);
 881 
 882         } else if (pcmk__str_eq(tmp->key, CRM_META "_" PCMK__META_CONTAINER,
 883                                 pcmk__str_none)) {
 884             lrm_remote_set_flags(lrm_state, controlling_guest);
 885         }
 886     }
 887 
 888     rc = controld_connect_remote_executor(lrm_state, server, port,
 889                                           timeout_used);
 890     if (rc != pcmk_rc_ok) {
 891         pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 892                             PCMK_EXEC_ERROR,
 893                             "Could not connect to Pacemaker Remote node %s: %s",
 894                             lrm_state->node_name, pcmk_rc_str(rc));
 895     }
 896     return rc;
 897 }
 898 
/*!
 * \internal
 * \brief Mainloop trigger handler that executes queued remote RA commands
 *
 * Dequeue and execute commands for a remote connection one at a time.
 * Commands that complete asynchronously (start/migrate_from connection
 * attempts, and monitor pokes) are parked as ra_data->cur_cmd until the
 * corresponding executor event arrives (see remote_lrm_op_callback()).
 *
 * \param[in,out] user_data  Executor state (lrm_state_t *) for the connection
 *
 * \return Always TRUE
 */
static gboolean
handle_remote_ra_exec(gpointer user_data)
{
    int rc = 0;
    lrm_state_t *lrm_state = user_data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd;
    GList *first = NULL;

    if (ra_data->cur_cmd) {
        /* still waiting on previous cmd */
        return TRUE;
    }

    while (ra_data->cmds) {
        first = ra_data->cmds;
        cmd = first->data;
        if (cmd->delay_id) {
            /* still waiting for start delay timer to trip */
            return TRUE;
        }

        // Pop the command off the queue; we now own it (freed at loop bottom)
        ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
        g_list_free_1(first);

        if (pcmk__str_any_of(cmd->action, PCMK_ACTION_START,
                             PCMK_ACTION_MIGRATE_FROM, NULL)) {
            // A fresh connection attempt resets any takeover expectations
            lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete);
            if (handle_remote_ra_start(lrm_state, cmd,
                                       cmd->timeout) == pcmk_rc_ok) {
                /* take care of this later when we get async connection result */
                crm_debug("Initiated async remote connection, %s action will complete after connect event",
                          cmd->action);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }
            // Synchronous failure: result was set by handle_remote_ra_start()
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_MONITOR)) {

            if (lrm_state_is_connected(lrm_state) == TRUE) {
                // Poke the connection; success is confirmed by a poke event
                rc = lrm_state_poke_connection(lrm_state);
                if (rc < 0) {
                    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                     PCMK_EXEC_ERROR, pcmk_strerror(rc));
                }
            } else {
                rc = -1;
                pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
                                 PCMK_EXEC_DONE, "Remote connection inactive");
            }

            if (rc == 0) {
                // Poke sent; wait for the async response (with a timeout)
                crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
                          cmd->rsc_id);
                ra_data->cur_cmd = cmd;
                cmd->monitor_timeout_id = pcmk__create_timer(cmd->timeout, monitor_timeout_cb, cmd);
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_STOP)) {

            if (pcmk_is_set(ra_data->status, expect_takeover)) {
                /* Briefly wait on stop for an expected takeover to occur. If
                 * the takeover does not occur during the wait, that's fine; it
                 * just means that the remote node's resource history will be
                 * cleared, which will require probing all resources on the
                 * remote node. If the takeover does occur successfully, then we
                 * can leave the status section intact.
                 */
                cmd->takeover_timeout_id = pcmk__create_timer((cmd->timeout/2),
                                                              connection_takeover_timeout_cb,
                                                              cmd);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }

            handle_remote_ra_stop(lrm_state, cmd);

        } else if (strcmp(cmd->action, PCMK_ACTION_MIGRATE_TO) == 0) {
            // Source side of a migration: just flag that a takeover is coming
            lrm_remote_clear_flags(lrm_state, takeover_complete);
            lrm_remote_set_flags(lrm_state, expect_takeover);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);

        } else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_RELOAD,
                                    PCMK_ACTION_RELOAD_AGENT, NULL))  {
            /* Currently the only reloadable parameter is
             * PCMK_REMOTE_RA_RECONNECT_INTERVAL, which is only used by the
             * scheduler via the CIB, so reloads are a no-op.
             *
             * @COMPAT DC <2.1.0: We only need to check for "reload" in case
             * we're in a rolling upgrade with a DC scheduling "reload" instead
             * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway,
             * so this would work for that purpose as well.
             */
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
        }

        free_cmd(cmd);
    }

    return TRUE;
}
1005 
1006 static void
1007 remote_ra_data_init(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1008 {
1009     remote_ra_data_t *ra_data = NULL;
1010 
1011     if (lrm_state->remote_ra_data) {
1012         return;
1013     }
1014 
1015     ra_data = pcmk__assert_alloc(1, sizeof(remote_ra_data_t));
1016     ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
1017     lrm_state->remote_ra_data = ra_data;
1018 }
1019 
1020 void
1021 remote_ra_cleanup(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1022 {
1023     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1024 
1025     if (!ra_data) {
1026         return;
1027     }
1028 
1029     if (ra_data->cmds) {
1030         g_list_free_full(ra_data->cmds, free_cmd);
1031     }
1032 
1033     if (ra_data->recurring_cmds) {
1034         g_list_free_full(ra_data->recurring_cmds, free_cmd);
1035     }
1036     mainloop_destroy_trigger(ra_data->work);
1037     free(ra_data);
1038     lrm_state->remote_ra_data = NULL;
1039 }
1040 
1041 gboolean
1042 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
     /* [previous][next][first][last][top][bottom][index][help] */
1043 {
1044     if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
1045         return TRUE;
1046     }
1047     return (id != NULL) && (controld_get_executor_state(id, false) != NULL)
1048            && !controld_is_local_node(id);
1049 }
1050 
1051 lrmd_rsc_info_t *
1052 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
     /* [previous][next][first][last][top][bottom][index][help] */
1053 {
1054     lrmd_rsc_info_t *info = NULL;
1055 
1056     CRM_CHECK(rsc_id != NULL, return NULL);
1057 
1058     if (controld_get_executor_state(rsc_id, false) != NULL) {
1059         info = pcmk__assert_alloc(1, sizeof(lrmd_rsc_info_t));
1060 
1061         info->id = pcmk__str_copy(rsc_id);
1062         info->type = pcmk__str_copy(REMOTE_LRMD_RA);
1063         info->standard = pcmk__str_copy(PCMK_RESOURCE_CLASS_OCF);
1064         info->provider = pcmk__str_copy("pacemaker");
1065     }
1066 
1067     return info;
1068 }
1069 
1070 static gboolean
1071 is_remote_ra_supported_action(const char *action)
     /* [previous][next][first][last][top][bottom][index][help] */
1072 {
1073     return pcmk__str_any_of(action,
1074                             PCMK_ACTION_START,
1075                             PCMK_ACTION_STOP,
1076                             PCMK_ACTION_MONITOR,
1077                             PCMK_ACTION_MIGRATE_TO,
1078                             PCMK_ACTION_MIGRATE_FROM,
1079                             PCMK_ACTION_RELOAD_AGENT,
1080                             PCMK_ACTION_RELOAD,
1081                             NULL);
1082 }
1083 
1084 static GList *
1085 fail_all_monitor_cmds(GList * list)
     /* [previous][next][first][last][top][bottom][index][help] */
1086 {
1087     GList *rm_list = NULL;
1088     remote_ra_cmd_t *cmd = NULL;
1089     GList *gIter = NULL;
1090 
1091     for (gIter = list; gIter != NULL; gIter = gIter->next) {
1092         cmd = gIter->data;
1093         if ((cmd->interval_ms > 0)
1094             && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1095                             pcmk__str_casei)) {
1096             rm_list = g_list_append(rm_list, cmd);
1097         }
1098     }
1099 
1100     for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
1101         cmd = gIter->data;
1102 
1103         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
1104                          PCMK_EXEC_ERROR, "Lost connection to remote executor");
1105         crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
1106                   cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
1107         report_remote_ra_result(cmd);
1108 
1109         list = g_list_remove(list, cmd);
1110         free_cmd(cmd);
1111     }
1112 
1113     /* frees only the list data, not the cmds */
1114     g_list_free(rm_list);
1115     return list;
1116 }
1117 
1118 static GList *
1119 remove_cmd(GList * list, const char *action, guint interval_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
1120 {
1121     remote_ra_cmd_t *cmd = NULL;
1122     GList *gIter = NULL;
1123 
1124     for (gIter = list; gIter != NULL; gIter = gIter->next) {
1125         cmd = gIter->data;
1126         if ((cmd->interval_ms == interval_ms)
1127             && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
1128             break;
1129         }
1130         cmd = NULL;
1131     }
1132     if (cmd) {
1133         list = g_list_remove(list, cmd);
1134         free_cmd(cmd);
1135     }
1136     return list;
1137 }
1138 
1139 int
1140 remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
1141                  const char *action, guint interval_ms)
1142 {
1143     lrm_state_t *connection_rsc = NULL;
1144     remote_ra_data_t *ra_data = NULL;
1145 
1146     CRM_CHECK(rsc_id != NULL, return -EINVAL);
1147 
1148     connection_rsc = controld_get_executor_state(rsc_id, false);
1149     if (!connection_rsc || !connection_rsc->remote_ra_data) {
1150         return -EINVAL;
1151     }
1152 
1153     ra_data = connection_rsc->remote_ra_data;
1154     ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1155     ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1156                                          interval_ms);
1157     if (ra_data->cur_cmd &&
1158         (ra_data->cur_cmd->interval_ms == interval_ms) &&
1159         (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1160 
1161         cmd_set_flags(ra_data->cur_cmd, cmd_cancel);
1162     }
1163 
1164     return 0;
1165 }
1166 
1167 static remote_ra_cmd_t *
1168 handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
     /* [previous][next][first][last][top][bottom][index][help] */
1169                    const char *userdata)
1170 {
1171     GList *gIter = NULL;
1172     remote_ra_cmd_t *cmd = NULL;
1173 
1174     /* there are 3 places a potential duplicate monitor operation
1175      * could exist.
1176      * 1. recurring_cmds list. where the op is waiting for its next interval
1177      * 2. cmds list, where the op is queued to get executed immediately
1178      * 3. cur_cmd, which means the monitor op is in flight right now.
1179      */
1180     if (interval_ms == 0) {
1181         return NULL;
1182     }
1183 
1184     if (ra_data->cur_cmd &&
1185         !pcmk_is_set(ra_data->cur_cmd->status, cmd_cancel) &&
1186         (ra_data->cur_cmd->interval_ms == interval_ms)
1187         && pcmk__str_eq(ra_data->cur_cmd->action, PCMK_ACTION_MONITOR,
1188                         pcmk__str_casei)) {
1189 
1190         cmd = ra_data->cur_cmd;
1191         goto handle_dup;
1192     }
1193 
1194     for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
1195         cmd = gIter->data;
1196         if ((cmd->interval_ms == interval_ms)
1197             && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1198                             pcmk__str_casei)) {
1199             goto handle_dup;
1200         }
1201     }
1202 
1203     for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
1204         cmd = gIter->data;
1205         if ((cmd->interval_ms == interval_ms)
1206             && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1207                             pcmk__str_casei)) {
1208             goto handle_dup;
1209         }
1210     }
1211 
1212     return NULL;
1213 
1214 handle_dup:
1215 
1216     crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
1217               cmd->rsc_id, PCMK_ACTION_MONITOR, interval_ms);
1218 
1219     /* update the userdata */
1220     if (userdata) {
1221        free(cmd->userdata);
1222        cmd->userdata = pcmk__str_copy(userdata);
1223     }
1224 
1225     /* if we've already reported success, generate a new call id */
1226     if (pcmk_is_set(cmd->status, cmd_reported_success)) {
1227         cmd->start_time = time(NULL);
1228         cmd->call_id = generate_callid();
1229         cmd_clear_flags(cmd, cmd_reported_success);
1230     }
1231 
1232     /* if we have an interval_id set, that means we are in the process of
1233      * waiting for this cmd's next interval. instead of waiting, cancel
1234      * the timer and execute the action immediately */
1235     if (cmd->interval_id) {
1236         g_source_remove(cmd->interval_id);
1237         cmd->interval_id = 0;
1238         recurring_helper(cmd);
1239     }
1240 
1241     return cmd;
1242 }
1243 
1244 /*!
1245  * \internal
1246  * \brief Execute an action using the (internal) ocf:pacemaker:remote agent
1247  *
1248  * \param[in]     lrm_state      Executor state object for remote connection
1249  * \param[in]     rsc_id         Connection resource ID
1250  * \param[in]     action         Action to execute
1251  * \param[in]     userdata       String to copy and pass to execution callback
1252  * \param[in]     interval_ms    Action interval (in milliseconds)
1253  * \param[in]     timeout_ms     Action timeout (in milliseconds)
1254  * \param[in]     start_delay_ms Delay (in milliseconds) before executing action
1255  * \param[in,out] params         Connection resource parameters
1256  * \param[out]    call_id        Where to store call ID on success
1257  *
1258  * \return Standard Pacemaker return code
1259  * \note This takes ownership of \p params, which should not be used or freed
1260  *       after calling this function.
1261  */
1262 int
1263 controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
1264                               const char *action, const char *userdata,
1265                               guint interval_ms, int timeout_ms,
1266                               int start_delay_ms, lrmd_key_value_t *params,
1267                               int *call_id)
1268 {
1269     lrm_state_t *connection_rsc = NULL;
1270     remote_ra_cmd_t *cmd = NULL;
1271     remote_ra_data_t *ra_data = NULL;
1272 
1273     *call_id = 0;
1274 
1275     CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
1276               && (userdata != NULL) && (call_id != NULL),
1277               lrmd_key_value_freeall(params); return EINVAL);
1278 
1279     if (!is_remote_ra_supported_action(action)) {
1280         lrmd_key_value_freeall(params);
1281         return EOPNOTSUPP;
1282     }
1283 
1284     connection_rsc = controld_get_executor_state(rsc_id, false);
1285     if (connection_rsc == NULL) {
1286         lrmd_key_value_freeall(params);
1287         return ENOTCONN;
1288     }
1289 
1290     remote_ra_data_init(connection_rsc);
1291     ra_data = connection_rsc->remote_ra_data;
1292 
1293     cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1294     if (cmd) {
1295         *call_id = cmd->call_id;
1296         lrmd_key_value_freeall(params);
1297         return pcmk_rc_ok;
1298     }
1299 
1300     cmd = pcmk__assert_alloc(1, sizeof(remote_ra_cmd_t));
1301 
1302     cmd->owner = pcmk__str_copy(lrm_state->node_name);
1303     cmd->rsc_id = pcmk__str_copy(rsc_id);
1304     cmd->action = pcmk__str_copy(action);
1305     cmd->userdata = pcmk__str_copy(userdata);
1306     cmd->interval_ms = interval_ms;
1307     cmd->timeout = timeout_ms;
1308     cmd->start_delay = start_delay_ms;
1309     cmd->params = params;
1310     cmd->start_time = time(NULL);
1311 
1312     cmd->call_id = generate_callid();
1313 
1314     if (cmd->start_delay) {
1315         cmd->delay_id = pcmk__create_timer(cmd->start_delay, start_delay_helper, cmd);
1316     }
1317 
1318     ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1319     mainloop_set_trigger(ra_data->work);
1320 
1321     *call_id = cmd->call_id;
1322     return pcmk_rc_ok;
1323 }
1324 
1325 /*!
1326  * \internal
1327  * \brief Immediately fail all monitors of a remote node, if proxied here
1328  *
1329  * \param[in] node_name  Name of pacemaker_remote node
1330  */
1331 void
1332 remote_ra_fail(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
1333 {
1334     lrm_state_t *lrm_state = NULL;
1335 
1336     CRM_CHECK(node_name != NULL, return);
1337 
1338     lrm_state = controld_get_executor_state(node_name, false);
1339     if (lrm_state && lrm_state_is_connected(lrm_state)) {
1340         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1341 
1342         crm_info("Failing monitors on Pacemaker Remote node %s", node_name);
1343         ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1344         ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1345     }
1346 }
1347 
1348 /* A guest node fencing implied by host fencing looks like:
1349  *
1350  *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
1351  *                on_node="lxc1" on_node_uuid="lxc1">
1352  *     <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
1353  *                 CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
1354  *     <downed>
1355  *       <node id="lxc1"/>
1356  *     </downed>
1357  *  </pseudo_event>
1358  */
1359 #define XPATH_PSEUDO_FENCE "/" PCMK__XE_PSEUDO_EVENT \
1360     "[@" PCMK_XA_OPERATION "='stonith']/" PCMK__XE_DOWNED "/" PCMK_XE_NODE
1361 
1362 /*!
1363  * \internal
1364  * \brief Check a pseudo-action for Pacemaker Remote node side effects
1365  *
1366  * \param[in,out] xml  XML of pseudo-action to check
1367  */
1368 void
1369 remote_ra_process_pseudo(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
1370 {
1371     xmlXPathObject *search = pcmk__xpath_search(xml->doc, XPATH_PSEUDO_FENCE);
1372 
1373     if (pcmk__xpath_num_results(search) == 1) {
1374         xmlNode *result = pcmk__xpath_result(search, 0);
1375 
1376         /* Normally, we handle the necessary side effects of a guest node stop
1377          * action when reporting the remote agent's result. However, if the stop
1378          * is implied due to fencing, it will be a fencing pseudo-event, and
1379          * there won't be a result to report. Handle that case here.
1380          *
1381          * This will result in a duplicate call to remote_node_down() if the
1382          * guest stop was real instead of implied, but that shouldn't hurt.
1383          *
1384          * There is still one corner case that isn't handled: if a guest node
1385          * isn't running any resources when its host is fenced, it will appear
1386          * to be cleanly stopped, so there will be no pseudo-fence, and our
1387          * peer cache state will be incorrect unless and until the guest is
1388          * recovered.
1389          */
1390         if (result) {
1391             const char *remote = pcmk__xe_id(result);
1392 
1393             if (remote) {
1394                 remote_node_down(remote, DOWN_ERASE_LRM);
1395             }
1396         }
1397     }
1398     xmlXPathFreeObject(search);
1399 }
1400 
1401 static void
1402 remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
     /* [previous][next][first][last][top][bottom][index][help] */
1403 {
1404     xmlNode *update, *state;
1405     int call_opt;
1406     pcmk__node_status_t *node = NULL;
1407 
1408     call_opt = crmd_cib_smart_opt();
1409     node = pcmk__cluster_lookup_remote_node(lrm_state->node_name);
1410     CRM_CHECK(node != NULL, return);
1411     update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
1412     state = create_node_state_update(node, controld_node_update_none, update,
1413                                      __func__);
1414     crm_xml_add(state, PCMK__XA_NODE_IN_MAINTENANCE, (maintenance? "1" : "0"));
1415     if (controld_update_cib(PCMK_XE_STATUS, update, call_opt,
1416                             NULL) == pcmk_rc_ok) {
1417         /* TODO: still not 100% sure that async update will succeed ... */
1418         if (maintenance) {
1419             lrm_remote_set_flags(lrm_state, remote_in_maint);
1420         } else {
1421             lrm_remote_clear_flags(lrm_state, remote_in_maint);
1422         }
1423     }
1424     pcmk__xml_free(update);
1425 }
1426 
1427 #define XPATH_PSEUDO_MAINTENANCE "//" PCMK__XE_PSEUDO_EVENT         \
1428     "[@" PCMK_XA_OPERATION "='" PCMK_ACTION_MAINTENANCE_NODES "']/" \
1429     PCMK__XE_MAINTENANCE
1430 
/*!
 * \internal
 * \brief Check a pseudo-action holding updates for maintenance state
 *
 * Scan a maintenance-nodes pseudo-event for node entries, and for each one
 * that corresponds to an active remote connection managed here, forward its
 * maintenance flag to remote_ra_maintenance().
 *
 * \param[in,out] xml  XML of pseudo-action to check
 */
void
remote_ra_process_maintenance_nodes(xmlNode *xml)
{
    xmlXPathObject *search = pcmk__xpath_search(xml->doc,
                                                XPATH_PSEUDO_MAINTENANCE);

    if (pcmk__xpath_num_results(search) == 1) {
        xmlNode *node;
        int cnt = 0, cnt_remote = 0;    // Nodes seen / remotes handled here

        for (node = pcmk__xe_first_child(pcmk__xpath_result(search, 0),
                                         PCMK_XE_NODE, NULL, NULL);
             node != NULL; node = pcmk__xe_next(node, PCMK_XE_NODE)) {

            lrm_state_t *lrm_state = NULL;
            const char *id = pcmk__xe_id(node);

            cnt++;
            if (id == NULL) {
                continue; // Shouldn't be possible
            }

            lrm_state = controld_get_executor_state(id, false);

            // Act only on connections that are active and proxied here
            if (lrm_state && lrm_state->remote_ra_data &&
                pcmk_is_set(((remote_ra_data_t *) lrm_state->remote_ra_data)->status, remote_active)) {

                const char *in_maint_s = NULL;
                int in_maint;

                cnt_remote++;
                in_maint_s = crm_element_value(node,
                                               PCMK__XA_NODE_IN_MAINTENANCE);
                // Defaults to 0 (not in maintenance) if unset or invalid
                pcmk__scan_min_int(in_maint_s, &in_maint, 0);
                remote_ra_maintenance(lrm_state, in_maint);
            }
        }
        crm_trace("Action holds %d nodes (%d remotes found) adjusting "
                  PCMK_OPT_MAINTENANCE_MODE,
                  cnt, cnt_remote);
    }
    xmlXPathFreeObject(search);
}
1480 
1481 gboolean
1482 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1483 {
1484     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1485     return pcmk_is_set(ra_data->status, remote_in_maint);
1486 }
1487 
1488 gboolean
1489 remote_ra_controlling_guest(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1490 {
1491     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1492     return pcmk_is_set(ra_data->status, controlling_guest);
1493 }

/* [previous][next][first][last][top][bottom][index][help] */