root/daemons/controld/controld_remote_ra.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. free_cmd
  2. generate_callid
  3. recurring_helper
  4. start_delay_helper
  5. should_purge_attributes
  6. section_to_delete
  7. purge_remote_node_attrs
  8. remote_node_up
  9. remote_node_down
  10. check_remote_node_state
  11. report_remote_ra_result
  12. update_remaining_timeout
  13. retry_start_cmd_cb
  14. connection_takeover_timeout_cb
  15. monitor_timeout_cb
  16. synthesize_lrmd_success
  17. remote_lrm_op_callback
  18. handle_remote_ra_stop
  19. handle_remote_ra_start
  20. handle_remote_ra_exec
  21. remote_ra_data_init
  22. remote_ra_cleanup
  23. is_remote_lrmd_ra
  24. remote_ra_get_rsc_info
  25. is_remote_ra_supported_action
  26. fail_all_monitor_cmds
  27. remove_cmd
  28. remote_ra_cancel
  29. handle_dup_monitor
  30. controld_execute_remote_agent
  31. remote_ra_fail
  32. remote_ra_process_pseudo
  33. remote_ra_maintenance
  34. remote_ra_process_maintenance_nodes
  35. remote_ra_is_in_maintenance
  36. remote_ra_controlling_guest

   1 /*
   2  * Copyright 2013-2024 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <crm/crm.h>
  13 #include <crm/common/xml.h>
  14 #include <crm/common/xml_internal.h>
  15 #include <crm/lrmd.h>
  16 #include <crm/lrmd_internal.h>
  17 #include <crm/services.h>
  18 
  19 #include <pacemaker-controld.h>
  20 
  21 #define REMOTE_LRMD_RA "remote"
  22 
  23 /* The max start timeout before cmd retry */
  24 #define MAX_START_TIMEOUT_MS 10000
  25 
  26 #define cmd_set_flags(cmd, flags_to_set) do { \
  27     (cmd)->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \
  28                                        "Remote command", (cmd)->rsc_id, (cmd)->status, \
  29                                        (flags_to_set), #flags_to_set); \
  30         } while (0)
  31 
  32 #define cmd_clear_flags(cmd, flags_to_clear) do { \
  33     (cmd)->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \
  34                                          "Remote command", (cmd)->rsc_id, (cmd)->status, \
  35                                          (flags_to_clear), #flags_to_clear); \
  36         } while (0)
  37 
  38 enum remote_cmd_status {
  39     cmd_reported_success    = (1 << 0),
  40     cmd_cancel              = (1 << 1),
  41 };
  42 
  43 typedef struct remote_ra_cmd_s {
  44     /*! the local node the cmd is issued from */
  45     char *owner;
  46     /*! the remote node the cmd is executed on */
  47     char *rsc_id;
  48     /*! the action to execute */
  49     char *action;
  50     /*! some string the client wants us to give it back */
  51     char *userdata;
  52     /*! start delay in ms */
  53     int start_delay;
  54     /*! timer id used for start delay. */
  55     int delay_id;
  56     /*! timeout in ms for cmd */
  57     int timeout;
  58     int remaining_timeout;
  59     /*! recurring interval in ms */
  60     guint interval_ms;
  61     /*! interval timer id */
  62     int interval_id;
  63     int monitor_timeout_id;
  64     int takeover_timeout_id;
  65     /*! action parameters */
  66     lrmd_key_value_t *params;
  67     pcmk__action_result_t result;
  68     int call_id;
  69     time_t start_time;
  70     uint32_t status;
  71 } remote_ra_cmd_t;
  72 
  73 #define lrm_remote_set_flags(lrm_state, flags_to_set) do { \
  74     lrm_state_t *lrm = (lrm_state); \
  75     remote_ra_data_t *ra = lrm->remote_ra_data; \
  76     ra->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
  77                                     lrm->node_name, ra->status, \
  78                                     (flags_to_set), #flags_to_set); \
  79         } while (0)
  80 
  81 #define lrm_remote_clear_flags(lrm_state, flags_to_clear) do { \
  82     lrm_state_t *lrm = (lrm_state); \
  83     remote_ra_data_t *ra = lrm->remote_ra_data; \
  84     ra->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
  85                                       lrm->node_name, ra->status, \
  86                                       (flags_to_clear), #flags_to_clear); \
  87         } while (0)
  88 
  89 enum remote_status {
  90     expect_takeover     = (1 << 0),
  91     takeover_complete   = (1 << 1),
  92     remote_active       = (1 << 2),
  93     /* Maintenance mode is difficult to determine from the controller's context,
  94      * so we have it signalled back with the transition from the scheduler.
  95      */
  96     remote_in_maint     = (1 << 3),
  97     /* Similar for whether we are controlling a guest node or remote node.
  98      * Fortunately there is a meta-attribute in the transition already and
  99      * as the situation doesn't change over time we can use the
 100      * resource start for noting down the information for later use when
 101      * the attributes aren't at hand.
 102      */
 103     controlling_guest   = (1 << 4),
 104 };
 105 
 106 typedef struct remote_ra_data_s {
 107     crm_trigger_t *work;
 108     remote_ra_cmd_t *cur_cmd;
 109     GList *cmds;
 110     GList *recurring_cmds;
 111     uint32_t status;
 112 } remote_ra_data_t;
 113 
 114 static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
 115 static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
 116 static GList *fail_all_monitor_cmds(GList * list);
 117 
 118 static void
 119 free_cmd(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 120 {
 121     remote_ra_cmd_t *cmd = user_data;
 122 
 123     if (!cmd) {
 124         return;
 125     }
 126     if (cmd->delay_id) {
 127         g_source_remove(cmd->delay_id);
 128     }
 129     if (cmd->interval_id) {
 130         g_source_remove(cmd->interval_id);
 131     }
 132     if (cmd->monitor_timeout_id) {
 133         g_source_remove(cmd->monitor_timeout_id);
 134     }
 135     if (cmd->takeover_timeout_id) {
 136         g_source_remove(cmd->takeover_timeout_id);
 137     }
 138     free(cmd->owner);
 139     free(cmd->rsc_id);
 140     free(cmd->action);
 141     free(cmd->userdata);
 142     pcmk__reset_result(&(cmd->result));
 143     lrmd_key_value_freeall(cmd->params);
 144     free(cmd);
 145 }
 146 
 147 static int
 148 generate_callid(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 149 {
 150     static int remote_ra_callid = 0;
 151 
 152     remote_ra_callid++;
 153     if (remote_ra_callid <= 0) {
 154         remote_ra_callid = 1;
 155     }
 156 
 157     return remote_ra_callid;
 158 }
 159 
 160 static gboolean
 161 recurring_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 162 {
 163     remote_ra_cmd_t *cmd = data;
 164     lrm_state_t *connection_rsc = NULL;
 165 
 166     cmd->interval_id = 0;
 167     connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
 168     if (connection_rsc && connection_rsc->remote_ra_data) {
 169         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 170 
 171         ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
 172 
 173         ra_data->cmds = g_list_append(ra_data->cmds, cmd);
 174         mainloop_set_trigger(ra_data->work);
 175     }
 176     return FALSE;
 177 }
 178 
 179 static gboolean
 180 start_delay_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 181 {
 182     remote_ra_cmd_t *cmd = data;
 183     lrm_state_t *connection_rsc = NULL;
 184 
 185     cmd->delay_id = 0;
 186     connection_rsc = controld_get_executor_state(cmd->rsc_id, false);
 187     if (connection_rsc && connection_rsc->remote_ra_data) {
 188         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 189 
 190         mainloop_set_trigger(ra_data->work);
 191     }
 192     return FALSE;
 193 }
 194 
 195 static bool
 196 should_purge_attributes(pcmk__node_status_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 197 {
 198     pcmk__node_status_t *conn_node = NULL;
 199     lrm_state_t *connection_rsc = NULL;
 200 
 201     if ((node->conn_host == NULL) || (node->name == NULL)) {
 202         return true;
 203     }
 204 
 205     /* Get the node that was hosting the remote connection resource from the
 206      * peer cache.  That's the one we really care about here.
 207      */
 208     conn_node = pcmk__get_node(0, node->conn_host, NULL,
 209                                pcmk__node_search_cluster_member);
 210     if (conn_node == NULL) {
 211         return true;
 212     }
 213 
 214     /* Check the uptime of connection_rsc.  If it hasn't been running long
 215      * enough, set purge=true.  "Long enough" means it started running earlier
 216      * than the timestamp when we noticed it went away in the first place.
 217      */
 218     connection_rsc = controld_get_executor_state(node->name, false);
 219 
 220     if (connection_rsc != NULL) {
 221         lrmd_t *lrm = connection_rsc->conn;
 222         time_t uptime = lrmd__uptime(lrm);
 223         time_t now = time(NULL);
 224 
 225         /* Add 20s of fuzziness to give corosync a while to notice the remote
 226          * host is gone.  On various error conditions (failure to get uptime,
 227          * peer_lost isn't set) we default to purging.
 228          */
 229         if (uptime > 0 &&
 230             conn_node->peer_lost > 0 &&
 231             uptime + 20 >= now - conn_node->peer_lost) {
 232             return false;
 233         }
 234     }
 235 
 236     return true;
 237 }
 238 
 239 static enum controld_section_e
 240 section_to_delete(bool purge)
     /* [previous][next][first][last][top][bottom][index][help] */
 241 {
 242     if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
 243         if (purge) {
 244             return controld_section_all_unlocked;
 245         } else {
 246             return controld_section_lrm_unlocked;
 247         }
 248     } else {
 249         if (purge) {
 250             return controld_section_all;
 251         } else {
 252             return controld_section_lrm;
 253         }
 254     }
 255 }
 256 
 257 static void
 258 purge_remote_node_attrs(int call_opt, pcmk__node_status_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 259 {
 260     bool purge = should_purge_attributes(node);
 261     enum controld_section_e section = section_to_delete(purge);
 262 
 263     /* Purge node from attrd's memory */
 264     if (purge) {
 265         update_attrd_remote_node_removed(node->name, NULL);
 266     }
 267 
 268     controld_delete_node_state(node->name, section, call_opt);
 269 }
 270 
 271 /*!
 272  * \internal
 273  * \brief Handle cluster communication related to pacemaker_remote node joining
 274  *
 275  * \param[in] node_name  Name of newly integrated pacemaker_remote node
 276  */
 277 static void
 278 remote_node_up(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
 279 {
 280     int call_opt;
 281     xmlNode *update, *state;
 282     pcmk__node_status_t *node = NULL;
 283     lrm_state_t *connection_rsc = NULL;
 284 
 285     CRM_CHECK(node_name != NULL, return);
 286     crm_info("Announcing Pacemaker Remote node %s", node_name);
 287 
 288     call_opt = crmd_cib_smart_opt();
 289 
 290     /* Delete node's CRM_OP_PROBED attribute. Deleting any attribute ensures
 291      * that the attribute manager learns the node is remote. Deletion of this
 292      * specfic attribute is a holdover from when it had special meaning.
 293      *
 294      * @COMPAT Find another way to tell attrd that the node is remote, without
 295      * risking deletion or overwrite of an arbitrary attribute. Then work on
 296      * deprecating CRM_OP_PROBED.
 297      */
 298     update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);
 299 
 300     /* Ensure node is in the remote peer cache with member status */
 301     node = pcmk__cluster_lookup_remote_node(node_name);
 302     CRM_CHECK((node != NULL) && (node->name != NULL), return);
 303 
 304     purge_remote_node_attrs(call_opt, node);
 305     pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);
 306 
 307     /* Apply any start state that we were given from the environment on the
 308      * remote node.
 309      */
 310     connection_rsc = controld_get_executor_state(node->name, false);
 311 
 312     if (connection_rsc != NULL) {
 313         lrmd_t *lrm = connection_rsc->conn;
 314         const char *start_state = lrmd__node_start_state(lrm);
 315 
 316         if (start_state) {
 317             set_join_state(start_state, node->name, node->xml_id, true);
 318         }
 319     }
 320 
 321     /* pacemaker_remote nodes don't participate in the membership layer,
 322      * so cluster nodes don't automatically get notified when they come and go.
 323      * We send a cluster message to the DC, and update the CIB node state entry,
 324      * so the DC will get it sooner (via message) or later (via CIB refresh),
 325      * and any other interested parties can query the CIB.
 326      */
 327     broadcast_remote_state_message(node_name, true);
 328 
 329     update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
 330     state = create_node_state_update(node, node_update_cluster, update,
 331                                      __func__);
 332 
 333     /* Clear the PCMK__XA_NODE_FENCED flag in the node state. If the node ever
 334      * needs to be fenced, this flag will allow various actions to determine
 335      * whether the fencing has happened yet.
 336      */
 337     crm_xml_add(state, PCMK__XA_NODE_FENCED, "0");
 338 
 339     /* TODO: If the remote connection drops, and this (async) CIB update either
 340      * failed or has not yet completed, later actions could mistakenly think the
 341      * node has already been fenced (if the PCMK__XA_NODE_FENCED attribute was
 342      * previously set, because it won't have been cleared). This could prevent
 343      * actual fencing or allow recurring monitor failures to be cleared too
 344      * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
 345      */
 346     controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
 347     pcmk__xml_free(update);
 348 }
 349 
 350 enum down_opts {
 351     DOWN_KEEP_LRM,
 352     DOWN_ERASE_LRM
 353 };
 354 
 355 /*!
 356  * \internal
 357  * \brief Handle cluster communication related to pacemaker_remote node leaving
 358  *
 359  * \param[in] node_name  Name of lost node
 360  * \param[in] opts       Whether to keep or erase LRM history
 361  */
 362 static void
 363 remote_node_down(const char *node_name, const enum down_opts opts)
     /* [previous][next][first][last][top][bottom][index][help] */
 364 {
 365     xmlNode *update;
 366     int call_opt = crmd_cib_smart_opt();
 367     pcmk__node_status_t *node = NULL;
 368 
 369     /* Purge node from attrd's memory */
 370     update_attrd_remote_node_removed(node_name, NULL);
 371 
 372     /* Normally, only node attributes should be erased, and the resource history
 373      * should be kept until the node comes back up. However, after a successful
 374      * fence, we want to clear the history as well, so we don't think resources
 375      * are still running on the node.
 376      */
 377     if (opts == DOWN_ERASE_LRM) {
 378         controld_delete_node_state(node_name, controld_section_all, call_opt);
 379     } else {
 380         controld_delete_node_state(node_name, controld_section_attrs, call_opt);
 381     }
 382 
 383     /* Ensure node is in the remote peer cache with lost state */
 384     node = pcmk__cluster_lookup_remote_node(node_name);
 385     CRM_CHECK(node != NULL, return);
 386     pcmk__update_peer_state(__func__, node, PCMK__VALUE_LOST, 0);
 387 
 388     /* Notify DC */
 389     broadcast_remote_state_message(node_name, false);
 390 
 391     /* Update CIB node state */
 392     update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
 393     create_node_state_update(node, node_update_cluster, update, __func__);
 394     controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
 395     pcmk__xml_free(update);
 396 }
 397 
 398 /*!
 399  * \internal
 400  * \brief Handle effects of a remote RA command on node state
 401  *
 402  * \param[in] cmd  Completed remote RA command
 403  */
 404 static void
 405 check_remote_node_state(const remote_ra_cmd_t *cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 406 {
 407     /* Only successful actions can change node state */
 408     if (!pcmk__result_ok(&(cmd->result))) {
 409         return;
 410     }
 411 
 412     if (pcmk__str_eq(cmd->action, PCMK_ACTION_START, pcmk__str_casei)) {
 413         remote_node_up(cmd->rsc_id);
 414 
 415     } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_MIGRATE_FROM,
 416                             pcmk__str_casei)) {
 417         /* After a successful migration, we don't need to do remote_node_up()
 418          * because the DC already knows the node is up, and we don't want to
 419          * clear LRM history etc. We do need to add the remote node to this
 420          * host's remote peer cache, because (unless it happens to be DC)
 421          * it hasn't been tracking the remote node, and other code relies on
 422          * the cache to distinguish remote nodes from unseen cluster nodes.
 423          */
 424         pcmk__node_status_t *node =
 425             pcmk__cluster_lookup_remote_node(cmd->rsc_id);
 426 
 427         CRM_CHECK(node != NULL, return);
 428         pcmk__update_peer_state(__func__, node, PCMK_VALUE_MEMBER, 0);
 429 
 430     } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_casei)) {
 431         lrm_state_t *lrm_state = controld_get_executor_state(cmd->rsc_id,
 432                                                              false);
 433         remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;
 434 
 435         if (ra_data) {
 436             if (!pcmk_is_set(ra_data->status, takeover_complete)) {
 437                 /* Stop means down if we didn't successfully migrate elsewhere */
 438                 remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
 439             } else if (AM_I_DC == FALSE) {
 440                 /* Only the connection host and DC track node state,
 441                  * so if the connection migrated elsewhere and we aren't DC,
 442                  * un-cache the node, so we don't have stale info
 443                  */
 444                 pcmk__cluster_forget_remote_node(cmd->rsc_id);
 445             }
 446         }
 447     }
 448 
 449     /* We don't do anything for successful monitors, which is correct for
 450      * routine recurring monitors, and for monitors on nodes where the
 451      * connection isn't supposed to be (the cluster will stop the connection in
 452      * that case). However, if the initial probe finds the connection already
 453      * active on the node where we want it, we probably should do
 454      * remote_node_up(). Unfortunately, we can't distinguish that case here.
 455      * Given that connections have to be initiated by the cluster, the chance of
 456      * that should be close to zero.
 457      */
 458 }
 459 
 460 static void
 461 report_remote_ra_result(remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 462 {
 463     lrmd_event_data_t op = { 0, };
 464 
 465     check_remote_node_state(cmd);
 466 
 467     op.type = lrmd_event_exec_complete;
 468     op.rsc_id = cmd->rsc_id;
 469     op.op_type = cmd->action;
 470     op.user_data = cmd->userdata;
 471     op.timeout = cmd->timeout;
 472     op.interval_ms = cmd->interval_ms;
 473     op.t_run = cmd->start_time;
 474     op.t_rcchange = cmd->start_time;
 475 
 476     lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
 477                      cmd->result.exit_reason);
 478 
 479     if (pcmk_is_set(cmd->status, cmd_reported_success) && !pcmk__result_ok(&(cmd->result))) {
 480         op.t_rcchange = time(NULL);
 481         /* This edge case will likely never ever occur, but if it does the
 482          * result is that a failure will not be processed correctly. This is only
 483          * remotely possible because we are able to detect a connection resource's tcp
 484          * connection has failed at any moment after start has completed. The actual
 485          * recurring operation is just a connectivity ping.
 486          *
 487          * basically, we are not guaranteed that the first successful monitor op and
 488          * a subsequent failed monitor op will not occur in the same timestamp. We have to
 489          * make it look like the operations occurred at separate times though. */
 490         if (op.t_rcchange == op.t_run) {
 491             op.t_rcchange++;
 492         }
 493     }
 494 
 495     if (cmd->params) {
 496         lrmd_key_value_t *tmp;
 497 
 498         op.params = pcmk__strkey_table(free, free);
 499         for (tmp = cmd->params; tmp; tmp = tmp->next) {
 500             pcmk__insert_dup(op.params, tmp->key, tmp->value);
 501         }
 502 
 503     }
 504     op.call_id = cmd->call_id;
 505     op.remote_nodename = cmd->owner;
 506 
 507     lrm_op_callback(&op);
 508 
 509     if (op.params) {
 510         g_hash_table_destroy(op.params);
 511     }
 512     lrmd__reset_result(&op);
 513 }
 514 
 515 static void
 516 update_remaining_timeout(remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 517 {
 518     cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
 519 }
 520 
 521 static gboolean
 522 retry_start_cmd_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 523 {
 524     lrm_state_t *lrm_state = data;
 525     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 526     remote_ra_cmd_t *cmd = NULL;
 527     int rc = ETIME;
 528 
 529     if (!ra_data || !ra_data->cur_cmd) {
 530         return FALSE;
 531     }
 532     cmd = ra_data->cur_cmd;
 533     if (!pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
 534                               PCMK_ACTION_MIGRATE_FROM, NULL)) {
 535         return FALSE;
 536     }
 537     update_remaining_timeout(cmd);
 538 
 539     if (cmd->remaining_timeout > 0) {
 540         rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
 541     } else {
 542         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 543                          PCMK_EXEC_TIMEOUT,
 544                          "Not enough time remains to retry remote connection");
 545     }
 546 
 547     if (rc != pcmk_rc_ok) {
 548         report_remote_ra_result(cmd);
 549 
 550         if (ra_data->cmds) {
 551             mainloop_set_trigger(ra_data->work);
 552         }
 553         ra_data->cur_cmd = NULL;
 554         free_cmd(cmd);
 555     } else {
 556         /* wait for connection event */
 557     }
 558 
 559     return FALSE;
 560 }
 561 
 562 
 563 static gboolean
 564 connection_takeover_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 565 {
 566     lrm_state_t *lrm_state = NULL;
 567     remote_ra_cmd_t *cmd = data;
 568 
 569     crm_info("takeover event timed out for node %s", cmd->rsc_id);
 570     cmd->takeover_timeout_id = 0;
 571 
 572     lrm_state = controld_get_executor_state(cmd->rsc_id, false);
 573 
 574     handle_remote_ra_stop(lrm_state, cmd);
 575     free_cmd(cmd);
 576 
 577     return FALSE;
 578 }
 579 
 580 static gboolean
 581 monitor_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 582 {
 583     lrm_state_t *lrm_state = NULL;
 584     remote_ra_cmd_t *cmd = data;
 585 
 586     lrm_state = controld_get_executor_state(cmd->rsc_id, false);
 587 
 588     crm_info("Timed out waiting for remote poke response from %s%s",
 589              cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
 590     cmd->monitor_timeout_id = 0;
 591     pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
 592                      "Remote executor did not respond");
 593 
 594     if (lrm_state && lrm_state->remote_ra_data) {
 595         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
 596 
 597         if (ra_data->cur_cmd == cmd) {
 598             ra_data->cur_cmd = NULL;
 599         }
 600         if (ra_data->cmds) {
 601             mainloop_set_trigger(ra_data->work);
 602         }
 603     }
 604 
 605     report_remote_ra_result(cmd);
 606     free_cmd(cmd);
 607 
 608     if(lrm_state) {
 609         lrm_state_disconnect(lrm_state);
 610     }
 611     return FALSE;
 612 }
 613 
 614 static void
 615 synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
     /* [previous][next][first][last][top][bottom][index][help] */
 616 {
 617     lrmd_event_data_t op = { 0, };
 618 
 619     if (lrm_state == NULL) {
 620         /* if lrm_state not given assume local */
 621         lrm_state = controld_get_executor_state(NULL, false);
 622     }
 623     pcmk__assert(lrm_state != NULL);
 624 
 625     op.type = lrmd_event_exec_complete;
 626     op.rsc_id = rsc_id;
 627     op.op_type = op_type;
 628     op.t_run = time(NULL);
 629     op.t_rcchange = op.t_run;
 630     op.call_id = generate_callid();
 631     lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 632     process_lrm_event(lrm_state, &op, NULL, NULL);
 633 }
 634 
 635 void
 636 remote_lrm_op_callback(lrmd_event_data_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
 637 {
 638     gboolean cmd_handled = FALSE;
 639     lrm_state_t *lrm_state = NULL;
 640     remote_ra_data_t *ra_data = NULL;
 641     remote_ra_cmd_t *cmd = NULL;
 642 
 643     CRM_CHECK((op != NULL) && (op->remote_nodename != NULL), return);
 644 
 645     crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
 646               "(%d) status=%s (%d)",
 647               (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
 648               lrmd_event_type2str(op->type), op->remote_nodename,
 649               crm_exit_str((crm_exit_t) op->rc), op->rc,
 650               pcmk_exec_status_str(op->op_status), op->op_status);
 651 
 652     lrm_state = controld_get_executor_state(op->remote_nodename, false);
 653     if (!lrm_state || !lrm_state->remote_ra_data) {
 654         crm_debug("No state information found for remote connection event");
 655         return;
 656     }
 657     ra_data = lrm_state->remote_ra_data;
 658 
 659     if (op->type == lrmd_event_new_client) {
 660         // Another client has connected to the remote daemon
 661 
 662         if (pcmk_is_set(ra_data->status, expect_takeover)) {
 663             // Great, we knew this was coming
 664             lrm_remote_clear_flags(lrm_state, expect_takeover);
 665             lrm_remote_set_flags(lrm_state, takeover_complete);
 666 
 667         } else {
 668             crm_err("Disconnecting from Pacemaker Remote node %s due to "
 669                     "unexpected client takeover", op->remote_nodename);
 670             /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
 671             /* Do not free lrm_state->conn yet. */
 672             /* It'll be freed in the following stop action. */
 673             lrm_state_disconnect_only(lrm_state);
 674         }
 675         return;
 676     }
 677 
 678     /* filter all EXEC events up */
 679     if (op->type == lrmd_event_exec_complete) {
 680         if (pcmk_is_set(ra_data->status, takeover_complete)) {
 681             crm_debug("ignoring event, this connection is taken over by another node");
 682         } else {
 683             lrm_op_callback(op);
 684         }
 685         return;
 686     }
 687 
 688     if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {
 689 
 690         if (!pcmk_is_set(ra_data->status, remote_active)) {
 691             crm_debug("Disconnection from Pacemaker Remote node %s complete",
 692                       lrm_state->node_name);
 693 
 694         } else if (!remote_ra_is_in_maintenance(lrm_state)) {
 695             crm_err("Lost connection to Pacemaker Remote node %s",
 696                     lrm_state->node_name);
 697             ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
 698             ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
 699 
 700         } else {
 701             crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
 702                        lrm_state->node_name);
 703             /* Do roughly what a 'stop' on the remote-resource would do */
 704             handle_remote_ra_stop(lrm_state, NULL);
 705             remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
 706             /* now fake the reply of a successful 'stop' */
 707             synthesize_lrmd_success(NULL, lrm_state->node_name,
 708                                     PCMK_ACTION_STOP);
 709         }
 710         return;
 711     }
 712 
 713     if (!ra_data->cur_cmd) {
 714         crm_debug("no event to match");
 715         return;
 716     }
 717 
 718     cmd = ra_data->cur_cmd;
 719 
 720     /* Start actions and migrate from actions complete after connection
 721      * comes back to us. */
 722     if ((op->type == lrmd_event_connect)
 723         && pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
 724                                 PCMK_ACTION_MIGRATE_FROM, NULL)) {
 725         if (op->connection_rc < 0) {
 726             update_remaining_timeout(cmd);
 727 
 728             if ((op->connection_rc == -ENOKEY)
 729                 || (op->connection_rc == -EKEYREJECTED)) {
 730                 // Hard error, don't retry
 731                 pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
 732                                  PCMK_EXEC_ERROR,
 733                                  pcmk_strerror(op->connection_rc));
 734 
 735             } else if (cmd->remaining_timeout > 3000) {
 736                 crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
 737                 pcmk__create_timer(1000, retry_start_cmd_cb, lrm_state);
 738                 return;
 739 
 740             } else {
 741                 crm_trace("can't reschedule start, remaining timeout too small %d",
 742                           cmd->remaining_timeout);
 743                 pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 744                                     PCMK_EXEC_TIMEOUT,
 745                                     "%s without enough time to retry",
 746                                     pcmk_strerror(op->connection_rc));
 747             }
 748 
 749         } else {
 750             lrm_state_reset_tables(lrm_state, TRUE);
 751             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 752             lrm_remote_set_flags(lrm_state, remote_active);
 753         }
 754 
 755         crm_debug("Remote connection event matched %s action", cmd->action);
 756         report_remote_ra_result(cmd);
 757         cmd_handled = TRUE;
 758 
 759     } else if ((op->type == lrmd_event_poke)
 760                && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
 761                                pcmk__str_casei)) {
 762 
 763         if (cmd->monitor_timeout_id) {
 764             g_source_remove(cmd->monitor_timeout_id);
 765             cmd->monitor_timeout_id = 0;
 766         }
 767 
 768         /* Only report success the first time, after that only worry about failures.
 769          * For this function, if we get the poke pack, it is always a success. Pokes
 770          * only fail if the send fails, or the response times out. */
 771         if (!pcmk_is_set(cmd->status, cmd_reported_success)) {
 772             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 773             report_remote_ra_result(cmd);
 774             cmd_set_flags(cmd, cmd_reported_success);
 775         }
 776 
 777         crm_debug("Remote poke event matched %s action", cmd->action);
 778 
 779         /* success, keep rescheduling if interval is present. */
 780         if (cmd->interval_ms && !pcmk_is_set(cmd->status, cmd_cancel)) {
 781             ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
 782             cmd->interval_id = pcmk__create_timer(cmd->interval_ms,
 783                                                   recurring_helper, cmd);
 784             cmd = NULL;         /* prevent free */
 785         }
 786         cmd_handled = TRUE;
 787 
 788     } else if ((op->type == lrmd_event_disconnect)
 789                && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
 790                                pcmk__str_casei)) {
 791         if (pcmk_is_set(ra_data->status, remote_active) &&
 792             !pcmk_is_set(cmd->status, cmd_cancel)) {
 793             pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 794                              PCMK_EXEC_ERROR,
 795                              "Remote connection unexpectedly dropped "
 796                              "during monitor");
 797             report_remote_ra_result(cmd);
 798             crm_err("Remote connection to %s unexpectedly dropped during monitor",
 799                     lrm_state->node_name);
 800         }
 801         cmd_handled = TRUE;
 802 
 803     } else {
 804         crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
 805     }
 806 
 807     if (cmd_handled) {
 808         ra_data->cur_cmd = NULL;
 809         if (ra_data->cmds) {
 810             mainloop_set_trigger(ra_data->work);
 811         }
 812         free_cmd(cmd);
 813     }
 814 }
 815 
 816 static void
 817 handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 818 {
 819     remote_ra_data_t *ra_data = NULL;
 820 
 821     pcmk__assert(lrm_state != NULL);
 822     ra_data = lrm_state->remote_ra_data;
 823 
 824     if (!pcmk_is_set(ra_data->status, takeover_complete)) {
 825         /* delete pending ops when ever the remote connection is intentionally stopped */
 826         g_hash_table_remove_all(lrm_state->active_ops);
 827     } else {
 828         /* we no longer hold the history if this connection has been migrated,
 829          * however, we keep metadata cache for future use */
 830         lrm_state_reset_tables(lrm_state, FALSE);
 831     }
 832 
 833     lrm_remote_clear_flags(lrm_state, remote_active);
 834     lrm_state_disconnect(lrm_state);
 835 
 836     if (ra_data->cmds) {
 837         g_list_free_full(ra_data->cmds, free_cmd);
 838     }
 839     if (ra_data->recurring_cmds) {
 840         g_list_free_full(ra_data->recurring_cmds, free_cmd);
 841     }
 842     ra_data->cmds = NULL;
 843     ra_data->recurring_cmds = NULL;
 844     ra_data->cur_cmd = NULL;
 845 
 846     if (cmd) {
 847         pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 848         report_remote_ra_result(cmd);
 849     }
 850 }
 851 
 852 // \return Standard Pacemaker return code
 853 static int
 854 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
 855 {
 856     const char *server = NULL;
 857     lrmd_key_value_t *tmp = NULL;
 858     int port = 0;
 859     int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
 860     int rc = pcmk_rc_ok;
 861 
 862     for (tmp = cmd->params; tmp; tmp = tmp->next) {
 863         if (pcmk__strcase_any_of(tmp->key,
 864                                  PCMK_REMOTE_RA_ADDR, PCMK_REMOTE_RA_SERVER,
 865                                  NULL)) {
 866             server = tmp->value;
 867 
 868         } else if (pcmk__str_eq(tmp->key, PCMK_REMOTE_RA_PORT,
 869                                 pcmk__str_none)) {
 870             port = atoi(tmp->value);
 871 
 872         } else if (pcmk__str_eq(tmp->key, CRM_META "_" PCMK__META_CONTAINER,
 873                                 pcmk__str_none)) {
 874             lrm_remote_set_flags(lrm_state, controlling_guest);
 875         }
 876     }
 877 
 878     rc = controld_connect_remote_executor(lrm_state, server, port,
 879                                           timeout_used);
 880     if (rc != pcmk_rc_ok) {
 881         pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 882                             PCMK_EXEC_ERROR,
 883                             "Could not connect to Pacemaker Remote node %s: %s",
 884                             lrm_state->node_name, pcmk_rc_str(rc));
 885     }
 886     return rc;
 887 }
 888 
/*!
 * \internal
 * \brief Mainloop trigger callback: execute queued remote RA commands
 *
 * Executes at most one command at a time. A command that completes
 * asynchronously (start/migrate_from connect, monitor poke, stop awaiting
 * takeover) is parked in ra_data->cur_cmd until its event arrives; everything
 * else is reported and freed immediately.
 *
 * \param[in,out] user_data  Executor state object (lrm_state_t *)
 *
 * \return TRUE always, so the trigger stays registered
 */
static gboolean
handle_remote_ra_exec(gpointer user_data)
{
    int rc = 0;
    lrm_state_t *lrm_state = user_data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd;
    GList *first = NULL;

    if (ra_data->cur_cmd) {
        /* still waiting on previous cmd */
        return TRUE;
    }

    while (ra_data->cmds) {
        first = ra_data->cmds;
        cmd = first->data;
        if (cmd->delay_id) {
            /* still waiting for start delay timer to trip */
            return TRUE;
        }

        // Detach the command from the queue before dispatching it
        ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
        g_list_free_1(first);

        if (pcmk__str_any_of(cmd->action, PCMK_ACTION_START,
                             PCMK_ACTION_MIGRATE_FROM, NULL)) {
            // A (re-)start invalidates any takeover in progress
            lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete);
            if (handle_remote_ra_start(lrm_state, cmd,
                                       cmd->timeout) == pcmk_rc_ok) {
                /* take care of this later when we get async connection result */
                crm_debug("Initiated async remote connection, %s action will complete after connect event",
                          cmd->action);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }
            // Synchronous connect failure: result was already set, report it
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_MONITOR)) {

            if (lrm_state_is_connected(lrm_state) == TRUE) {
                // Poke the remote; the answer arrives as an async event
                rc = lrm_state_poke_connection(lrm_state);
                if (rc < 0) {
                    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                     PCMK_EXEC_ERROR, pcmk_strerror(rc));
                }
            } else {
                // Not connected: the monitor finds the resource not running
                rc = -1;
                pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
                                 PCMK_EXEC_DONE, "Remote connection inactive");
            }

            if (rc == 0) {
                // Poke sent OK; wait for the response, bounded by a timer
                crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
                          cmd->rsc_id);
                ra_data->cur_cmd = cmd;
                cmd->monitor_timeout_id = pcmk__create_timer(cmd->timeout, monitor_timeout_cb, cmd);
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_STOP)) {

            if (pcmk_is_set(ra_data->status, expect_takeover)) {
                /* briefly wait on stop for the takeover event to occur. If the
                 * takeover event does not occur during the wait period, that's fine.
                 * It just means that the remote-node's lrm_status section is going to get
                 * cleared which will require all the resources running in the remote-node
                 * to be explicitly re-detected via probe actions.  If the takeover does occur
                 * successfully, then we can leave the status section intact. */
                cmd->takeover_timeout_id = pcmk__create_timer((cmd->timeout/2),
                                                              connection_takeover_timeout_cb,
                                                              cmd);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }

            handle_remote_ra_stop(lrm_state, cmd);

        } else if (strcmp(cmd->action, PCMK_ACTION_MIGRATE_TO) == 0) {
            /* The migration source only marks that a takeover is expected;
             * the actual handoff happens via migrate_from on the target
             */
            lrm_remote_clear_flags(lrm_state, takeover_complete);
            lrm_remote_set_flags(lrm_state, expect_takeover);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);

        } else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_RELOAD,
                                    PCMK_ACTION_RELOAD_AGENT, NULL))  {
            /* Currently the only reloadable parameter is
             * PCMK_REMOTE_RA_RECONNECT_INTERVAL, which is only used by the
             * scheduler via the CIB, so reloads are a no-op.
             *
             * @COMPAT DC <2.1.0: We only need to check for "reload" in case
             * we're in a rolling upgrade with a DC scheduling "reload" instead
             * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway,
             * so this would work for that purpose as well.
             */
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
        }

        // Commands parked in cur_cmd returned above; everything else is done
        free_cmd(cmd);
    }

    return TRUE;
}
 994 
 995 static void
 996 remote_ra_data_init(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 997 {
 998     remote_ra_data_t *ra_data = NULL;
 999 
1000     if (lrm_state->remote_ra_data) {
1001         return;
1002     }
1003 
1004     ra_data = pcmk__assert_alloc(1, sizeof(remote_ra_data_t));
1005     ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
1006     lrm_state->remote_ra_data = ra_data;
1007 }
1008 
1009 void
1010 remote_ra_cleanup(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1011 {
1012     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1013 
1014     if (!ra_data) {
1015         return;
1016     }
1017 
1018     if (ra_data->cmds) {
1019         g_list_free_full(ra_data->cmds, free_cmd);
1020     }
1021 
1022     if (ra_data->recurring_cmds) {
1023         g_list_free_full(ra_data->recurring_cmds, free_cmd);
1024     }
1025     mainloop_destroy_trigger(ra_data->work);
1026     free(ra_data);
1027     lrm_state->remote_ra_data = NULL;
1028 }
1029 
1030 gboolean
1031 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
     /* [previous][next][first][last][top][bottom][index][help] */
1032 {
1033     if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
1034         return TRUE;
1035     }
1036     return (id != NULL) && (controld_get_executor_state(id, false) != NULL)
1037            && !controld_is_local_node(id);
1038 }
1039 
1040 lrmd_rsc_info_t *
1041 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
     /* [previous][next][first][last][top][bottom][index][help] */
1042 {
1043     lrmd_rsc_info_t *info = NULL;
1044 
1045     CRM_CHECK(rsc_id != NULL, return NULL);
1046 
1047     if (controld_get_executor_state(rsc_id, false) != NULL) {
1048         info = pcmk__assert_alloc(1, sizeof(lrmd_rsc_info_t));
1049 
1050         info->id = pcmk__str_copy(rsc_id);
1051         info->type = pcmk__str_copy(REMOTE_LRMD_RA);
1052         info->standard = pcmk__str_copy(PCMK_RESOURCE_CLASS_OCF);
1053         info->provider = pcmk__str_copy("pacemaker");
1054     }
1055 
1056     return info;
1057 }
1058 
1059 static gboolean
1060 is_remote_ra_supported_action(const char *action)
     /* [previous][next][first][last][top][bottom][index][help] */
1061 {
1062     return pcmk__str_any_of(action,
1063                             PCMK_ACTION_START,
1064                             PCMK_ACTION_STOP,
1065                             PCMK_ACTION_MONITOR,
1066                             PCMK_ACTION_MIGRATE_TO,
1067                             PCMK_ACTION_MIGRATE_FROM,
1068                             PCMK_ACTION_RELOAD_AGENT,
1069                             PCMK_ACTION_RELOAD,
1070                             NULL);
1071 }
1072 
1073 static GList *
1074 fail_all_monitor_cmds(GList * list)
     /* [previous][next][first][last][top][bottom][index][help] */
1075 {
1076     GList *rm_list = NULL;
1077     remote_ra_cmd_t *cmd = NULL;
1078     GList *gIter = NULL;
1079 
1080     for (gIter = list; gIter != NULL; gIter = gIter->next) {
1081         cmd = gIter->data;
1082         if ((cmd->interval_ms > 0)
1083             && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1084                             pcmk__str_casei)) {
1085             rm_list = g_list_append(rm_list, cmd);
1086         }
1087     }
1088 
1089     for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
1090         cmd = gIter->data;
1091 
1092         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
1093                          PCMK_EXEC_ERROR, "Lost connection to remote executor");
1094         crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
1095                   cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
1096         report_remote_ra_result(cmd);
1097 
1098         list = g_list_remove(list, cmd);
1099         free_cmd(cmd);
1100     }
1101 
1102     /* frees only the list data, not the cmds */
1103     g_list_free(rm_list);
1104     return list;
1105 }
1106 
1107 static GList *
1108 remove_cmd(GList * list, const char *action, guint interval_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
1109 {
1110     remote_ra_cmd_t *cmd = NULL;
1111     GList *gIter = NULL;
1112 
1113     for (gIter = list; gIter != NULL; gIter = gIter->next) {
1114         cmd = gIter->data;
1115         if ((cmd->interval_ms == interval_ms)
1116             && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
1117             break;
1118         }
1119         cmd = NULL;
1120     }
1121     if (cmd) {
1122         list = g_list_remove(list, cmd);
1123         free_cmd(cmd);
1124     }
1125     return list;
1126 }
1127 
1128 int
1129 remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
1130                  const char *action, guint interval_ms)
1131 {
1132     lrm_state_t *connection_rsc = NULL;
1133     remote_ra_data_t *ra_data = NULL;
1134 
1135     CRM_CHECK(rsc_id != NULL, return -EINVAL);
1136 
1137     connection_rsc = controld_get_executor_state(rsc_id, false);
1138     if (!connection_rsc || !connection_rsc->remote_ra_data) {
1139         return -EINVAL;
1140     }
1141 
1142     ra_data = connection_rsc->remote_ra_data;
1143     ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1144     ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1145                                          interval_ms);
1146     if (ra_data->cur_cmd &&
1147         (ra_data->cur_cmd->interval_ms == interval_ms) &&
1148         (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1149 
1150         cmd_set_flags(ra_data->cur_cmd, cmd_cancel);
1151     }
1152 
1153     return 0;
1154 }
1155 
/*!
 * \internal
 * \brief Merge a new recurring monitor request into an existing equivalent one
 *
 * \param[in,out] ra_data      Remote RA bookkeeping to search
 * \param[in]     interval_ms  Interval of the requested monitor
 * \param[in]     userdata     User data of the requested monitor (may be NULL)
 *
 * \return Existing matching command (now updated) if one was found, else NULL
 */
static remote_ra_cmd_t *
handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
                   const char *userdata)
{
    GList *gIter = NULL;
    remote_ra_cmd_t *cmd = NULL;

    /* there are 3 places a potential duplicate monitor operation
     * could exist.
     * 1. recurring_cmds list. where the op is waiting for its next interval
     * 2. cmds list, where the op is queued to get executed immediately
     * 3. cur_cmd, which means the monitor op is in flight right now.
     */
    if (interval_ms == 0) {
        // Probes (interval 0) are never merged
        return NULL;
    }

    /* In-flight command matches, unless it is already being cancelled (in
     * which case the new request must run as its own command)
     */
    if (ra_data->cur_cmd &&
        !pcmk_is_set(ra_data->cur_cmd->status, cmd_cancel) &&
        (ra_data->cur_cmd->interval_ms == interval_ms)
        && pcmk__str_eq(ra_data->cur_cmd->action, PCMK_ACTION_MONITOR,
                        pcmk__str_casei)) {

        cmd = ra_data->cur_cmd;
        goto handle_dup;
    }

    // Monitor waiting for its next interval
    for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    // Monitor queued for immediate execution
    for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    return NULL;

handle_dup:

    crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
              cmd->rsc_id, PCMK_ACTION_MONITOR, interval_ms);

    /* update the userdata */
    if (userdata) {
       free(cmd->userdata);
       cmd->userdata = pcmk__str_copy(userdata);
    }

    /* if we've already reported success, generate a new call id */
    if (pcmk_is_set(cmd->status, cmd_reported_success)) {
        cmd->start_time = time(NULL);
        cmd->call_id = generate_callid();
        cmd_clear_flags(cmd, cmd_reported_success);
    }

    /* if we have an interval_id set, that means we are in the process of
     * waiting for this cmd's next interval. instead of waiting, cancel
     * the timer and execute the action immediately */
    if (cmd->interval_id) {
        g_source_remove(cmd->interval_id);
        cmd->interval_id = 0;
        recurring_helper(cmd);
    }

    return cmd;
}
1232 
1233 /*!
1234  * \internal
1235  * \brief Execute an action using the (internal) ocf:pacemaker:remote agent
1236  *
1237  * \param[in]     lrm_state      Executor state object for remote connection
1238  * \param[in]     rsc_id         Connection resource ID
1239  * \param[in]     action         Action to execute
1240  * \param[in]     userdata       String to copy and pass to execution callback
1241  * \param[in]     interval_ms    Action interval (in milliseconds)
1242  * \param[in]     timeout_ms     Action timeout (in milliseconds)
1243  * \param[in]     start_delay_ms Delay (in milliseconds) before executing action
1244  * \param[in,out] params         Connection resource parameters
1245  * \param[out]    call_id        Where to store call ID on success
1246  *
1247  * \return Standard Pacemaker return code
1248  * \note This takes ownership of \p params, which should not be used or freed
1249  *       after calling this function.
1250  */
1251 int
1252 controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
1253                               const char *action, const char *userdata,
1254                               guint interval_ms, int timeout_ms,
1255                               int start_delay_ms, lrmd_key_value_t *params,
1256                               int *call_id)
1257 {
1258     lrm_state_t *connection_rsc = NULL;
1259     remote_ra_cmd_t *cmd = NULL;
1260     remote_ra_data_t *ra_data = NULL;
1261 
1262     *call_id = 0;
1263 
1264     CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
1265               && (userdata != NULL) && (call_id != NULL),
1266               lrmd_key_value_freeall(params); return EINVAL);
1267 
1268     if (!is_remote_ra_supported_action(action)) {
1269         lrmd_key_value_freeall(params);
1270         return EOPNOTSUPP;
1271     }
1272 
1273     connection_rsc = controld_get_executor_state(rsc_id, false);
1274     if (connection_rsc == NULL) {
1275         lrmd_key_value_freeall(params);
1276         return ENOTCONN;
1277     }
1278 
1279     remote_ra_data_init(connection_rsc);
1280     ra_data = connection_rsc->remote_ra_data;
1281 
1282     cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1283     if (cmd) {
1284         *call_id = cmd->call_id;
1285         lrmd_key_value_freeall(params);
1286         return pcmk_rc_ok;
1287     }
1288 
1289     cmd = pcmk__assert_alloc(1, sizeof(remote_ra_cmd_t));
1290 
1291     cmd->owner = pcmk__str_copy(lrm_state->node_name);
1292     cmd->rsc_id = pcmk__str_copy(rsc_id);
1293     cmd->action = pcmk__str_copy(action);
1294     cmd->userdata = pcmk__str_copy(userdata);
1295     cmd->interval_ms = interval_ms;
1296     cmd->timeout = timeout_ms;
1297     cmd->start_delay = start_delay_ms;
1298     cmd->params = params;
1299     cmd->start_time = time(NULL);
1300 
1301     cmd->call_id = generate_callid();
1302 
1303     if (cmd->start_delay) {
1304         cmd->delay_id = pcmk__create_timer(cmd->start_delay, start_delay_helper, cmd);
1305     }
1306 
1307     ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1308     mainloop_set_trigger(ra_data->work);
1309 
1310     *call_id = cmd->call_id;
1311     return pcmk_rc_ok;
1312 }
1313 
1314 /*!
1315  * \internal
1316  * \brief Immediately fail all monitors of a remote node, if proxied here
1317  *
1318  * \param[in] node_name  Name of pacemaker_remote node
1319  */
1320 void
1321 remote_ra_fail(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
1322 {
1323     lrm_state_t *lrm_state = NULL;
1324 
1325     CRM_CHECK(node_name != NULL, return);
1326 
1327     lrm_state = controld_get_executor_state(node_name, false);
1328     if (lrm_state && lrm_state_is_connected(lrm_state)) {
1329         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1330 
1331         crm_info("Failing monitors on Pacemaker Remote node %s", node_name);
1332         ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1333         ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1334     }
1335 }
1336 
1337 /* A guest node fencing implied by host fencing looks like:
1338  *
1339  *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
1340  *                on_node="lxc1" on_node_uuid="lxc1">
1341  *     <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
1342  *                 CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
1343  *     <downed>
1344  *       <node id="lxc1"/>
1345  *     </downed>
1346  *  </pseudo_event>
1347  */
1348 #define XPATH_PSEUDO_FENCE "/" PCMK__XE_PSEUDO_EVENT \
1349     "[@" PCMK_XA_OPERATION "='stonith']/" PCMK__XE_DOWNED "/" PCMK_XE_NODE
1350 
1351 /*!
1352  * \internal
1353  * \brief Check a pseudo-action for Pacemaker Remote node side effects
1354  *
1355  * \param[in,out] xml  XML of pseudo-action to check
1356  */
1357 void
1358 remote_ra_process_pseudo(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
1359 {
1360     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
1361 
1362     if (numXpathResults(search) == 1) {
1363         xmlNode *result = getXpathResult(search, 0);
1364 
1365         /* Normally, we handle the necessary side effects of a guest node stop
1366          * action when reporting the remote agent's result. However, if the stop
1367          * is implied due to fencing, it will be a fencing pseudo-event, and
1368          * there won't be a result to report. Handle that case here.
1369          *
1370          * This will result in a duplicate call to remote_node_down() if the
1371          * guest stop was real instead of implied, but that shouldn't hurt.
1372          *
1373          * There is still one corner case that isn't handled: if a guest node
1374          * isn't running any resources when its host is fenced, it will appear
1375          * to be cleanly stopped, so there will be no pseudo-fence, and our
1376          * peer cache state will be incorrect unless and until the guest is
1377          * recovered.
1378          */
1379         if (result) {
1380             const char *remote = pcmk__xe_id(result);
1381 
1382             if (remote) {
1383                 remote_node_down(remote, DOWN_ERASE_LRM);
1384             }
1385         }
1386     }
1387     freeXpathObject(search);
1388 }
1389 
1390 static void
1391 remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
     /* [previous][next][first][last][top][bottom][index][help] */
1392 {
1393     xmlNode *update, *state;
1394     int call_opt;
1395     pcmk__node_status_t *node = NULL;
1396 
1397     call_opt = crmd_cib_smart_opt();
1398     node = pcmk__cluster_lookup_remote_node(lrm_state->node_name);
1399     CRM_CHECK(node != NULL, return);
1400     update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
1401     state = create_node_state_update(node, node_update_none, update,
1402                                      __func__);
1403     crm_xml_add(state, PCMK__XA_NODE_IN_MAINTENANCE, (maintenance? "1" : "0"));
1404     if (controld_update_cib(PCMK_XE_STATUS, update, call_opt,
1405                             NULL) == pcmk_rc_ok) {
1406         /* TODO: still not 100% sure that async update will succeed ... */
1407         if (maintenance) {
1408             lrm_remote_set_flags(lrm_state, remote_in_maint);
1409         } else {
1410             lrm_remote_clear_flags(lrm_state, remote_in_maint);
1411         }
1412     }
1413     pcmk__xml_free(update);
1414 }
1415 
1416 #define XPATH_PSEUDO_MAINTENANCE "//" PCMK__XE_PSEUDO_EVENT         \
1417     "[@" PCMK_XA_OPERATION "='" PCMK_ACTION_MAINTENANCE_NODES "']/" \
1418     PCMK__XE_MAINTENANCE
1419 
1420 /*!
1421  * \internal
1422  * \brief Check a pseudo-action holding updates for maintenance state
1423  *
1424  * \param[in,out] xml  XML of pseudo-action to check
1425  */
1426 void
1427 remote_ra_process_maintenance_nodes(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
1428 {
1429     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);
1430 
1431     if (numXpathResults(search) == 1) {
1432         xmlNode *node;
1433         int cnt = 0, cnt_remote = 0;
1434 
1435         for (node = pcmk__xe_first_child(getXpathResult(search, 0),
1436                                          PCMK_XE_NODE, NULL, NULL);
1437              node != NULL; node = pcmk__xe_next(node, PCMK_XE_NODE)) {
1438 
1439             lrm_state_t *lrm_state = NULL;
1440             const char *id = pcmk__xe_id(node);
1441 
1442             cnt++;
1443             if (id == NULL) {
1444                 continue; // Shouldn't be possible
1445             }
1446 
1447             lrm_state = controld_get_executor_state(id, false);
1448 
1449             if (lrm_state && lrm_state->remote_ra_data &&
1450                 pcmk_is_set(((remote_ra_data_t *) lrm_state->remote_ra_data)->status, remote_active)) {
1451 
1452                 const char *in_maint_s = NULL;
1453                 int in_maint;
1454 
1455                 cnt_remote++;
1456                 in_maint_s = crm_element_value(node,
1457                                                PCMK__XA_NODE_IN_MAINTENANCE);
1458                 pcmk__scan_min_int(in_maint_s, &in_maint, 0);
1459                 remote_ra_maintenance(lrm_state, in_maint);
1460             }
1461         }
1462         crm_trace("Action holds %d nodes (%d remotes found) adjusting "
1463                   PCMK_OPT_MAINTENANCE_MODE,
1464                   cnt, cnt_remote);
1465     }
1466     freeXpathObject(search);
1467 }
1468 
1469 gboolean
1470 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1471 {
1472     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1473     return pcmk_is_set(ra_data->status, remote_in_maint);
1474 }
1475 
1476 gboolean
1477 remote_ra_controlling_guest(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1478 {
1479     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1480     return pcmk_is_set(ra_data->status, controlling_guest);
1481 }

/* [previous][next][first][last][top][bottom][index][help] */