root/daemons/controld/controld_remote_ra.c


DEFINITIONS

This source file includes the following definitions.
  1. free_cmd
  2. generate_callid
  3. recurring_helper
  4. start_delay_helper
  5. should_purge_attributes
  6. section_to_delete
  7. purge_remote_node_attrs
  8. remote_node_up
  9. remote_node_down
  10. check_remote_node_state
  11. report_remote_ra_result
  12. update_remaining_timeout
  13. retry_start_cmd_cb
  14. connection_takeover_timeout_cb
  15. monitor_timeout_cb
  16. synthesize_lrmd_success
  17. remote_lrm_op_callback
  18. handle_remote_ra_stop
  19. handle_remote_ra_start
  20. handle_remote_ra_exec
  21. remote_ra_data_init
  22. remote_ra_cleanup
  23. is_remote_lrmd_ra
  24. remote_ra_get_rsc_info
  25. is_remote_ra_supported_action
  26. fail_all_monitor_cmds
  27. remove_cmd
  28. remote_ra_cancel
  29. handle_dup_monitor
  30. controld_execute_remote_agent
  31. remote_ra_fail
  32. remote_ra_process_pseudo
  33. remote_ra_maintenance
  34. remote_ra_process_maintenance_nodes
  35. remote_ra_is_in_maintenance
  36. remote_ra_controlling_guest

/*
 * Copyright 2013-2024 the Pacemaker project contributors
 *
 * The version control history for this file may have further details.
 *
 * This source code is licensed under the GNU General Public License version 2
 * or later (GPLv2+) WITHOUT ANY WARRANTY.
 */

#include <crm_internal.h>

#include <crm/crm.h>
#include <crm/common/xml.h>
#include <crm/common/xml_internal.h>
#include <crm/lrmd.h>
#include <crm/lrmd_internal.h>
#include <crm/services.h>

#include <pacemaker-controld.h>

#define REMOTE_LRMD_RA "remote"

/* Maximum time (in milliseconds) to wait for a start before retrying the command */
#define MAX_START_TIMEOUT_MS 10000

#define cmd_set_flags(cmd, flags_to_set) do { \
    (cmd)->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \
                                       "Remote command", (cmd)->rsc_id, (cmd)->status, \
                                       (flags_to_set), #flags_to_set); \
        } while (0)

#define cmd_clear_flags(cmd, flags_to_clear) do { \
    (cmd)->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \
                                         "Remote command", (cmd)->rsc_id, (cmd)->status, \
                                         (flags_to_clear), #flags_to_clear); \
        } while (0)

enum remote_cmd_status {
    cmd_reported_success    = (1 << 0),
    cmd_cancel              = (1 << 1),
};

typedef struct remote_ra_cmd_s {
    /*! the local node the cmd is issued from */
    char *owner;
    /*! the remote node the cmd is executed on */
    char *rsc_id;
    /*! the action to execute */
    char *action;
    /*! some string the client wants us to give it back */
    char *userdata;
    /*! start delay in ms */
    int start_delay;
    /*! timer id used for start delay. */
    int delay_id;
    /*! timeout in ms for cmd */
    int timeout;
    int remaining_timeout;
    /*! recurring interval in ms */
    guint interval_ms;
    /*! interval timer id */
    int interval_id;
    int monitor_timeout_id;
    int takeover_timeout_id;
    /*! action parameters */
    lrmd_key_value_t *params;
    pcmk__action_result_t result;
    int call_id;
    time_t start_time;
    uint32_t status;
} remote_ra_cmd_t;

#define lrm_remote_set_flags(lrm_state, flags_to_set) do { \
    lrm_state_t *lrm = (lrm_state); \
    remote_ra_data_t *ra = lrm->remote_ra_data; \
    ra->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
                                    lrm->node_name, ra->status, \
                                    (flags_to_set), #flags_to_set); \
        } while (0)

#define lrm_remote_clear_flags(lrm_state, flags_to_clear) do { \
    lrm_state_t *lrm = (lrm_state); \
    remote_ra_data_t *ra = lrm->remote_ra_data; \
    ra->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
                                      lrm->node_name, ra->status, \
                                      (flags_to_clear), #flags_to_clear); \
        } while (0)

enum remote_status {
    expect_takeover     = (1 << 0),
    takeover_complete   = (1 << 1),
    remote_active       = (1 << 2),
    /* Maintenance mode is difficult to determine from the controller's
     * context, so the scheduler signals it back to us with the transition.
     */
    remote_in_maint     = (1 << 3),
    /* The same applies to whether we are controlling a guest node or a
     * remote node. Fortunately the transition already carries this as a
     * meta-attribute, and since the answer doesn't change over time, we can
     * record it at resource start for later use when the attributes aren't
     * at hand.
     */
    controlling_guest   = (1 << 4),
};

typedef struct remote_ra_data_s {
    crm_trigger_t *work;
    remote_ra_cmd_t *cur_cmd;
    GList *cmds;
    GList *recurring_cmds;
    uint32_t status;
} remote_ra_data_t;

static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
static GList *fail_all_monitor_cmds(GList * list);

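/*!
 * \internal
 * \brief Free a remote RA command object, canceling any timers it holds
 *
 * \param[in,out] user_data  Command to free (a remote_ra_cmd_t *)
 */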
static void
free_cmd(gpointer user_data)
{
    remote_ra_cmd_t *cmd = user_data;

    if (!cmd) {
        return;
    }
    if (cmd->delay_id) {
        g_source_remove(cmd->delay_id);
    }
    if (cmd->interval_id) {
        g_source_remove(cmd->interval_id);
    }
    if (cmd->monitor_timeout_id) {
        g_source_remove(cmd->monitor_timeout_id);
    }
    if (cmd->takeover_timeout_id) {
        g_source_remove(cmd->takeover_timeout_id);
    }
    free(cmd->owner);
    free(cmd->rsc_id);
    free(cmd->action);
    free(cmd->userdata);
    pcmk__reset_result(&(cmd->result));
    lrmd_key_value_freeall(cmd->params);
    free(cmd);
}

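/*!
 * \internal
 * \brief Generate a call ID for a remote RA command
 *
 * \return Next call ID (always positive; wraps back to 1 on overflow)
 */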
static int
generate_callid(void)
{
    static int remote_ra_callid = 0;

    remote_ra_callid++;
    if (remote_ra_callid <= 0) {
        remote_ra_callid = 1;
    }

    return remote_ra_callid;
}

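/*!
 * \internal
 * \brief Timer callback to requeue a recurring command for execution
 *
 * \param[in,out] data  Recurring command (remote_ra_cmd_t *) whose interval expired
 *
 * \return FALSE (to tell glib not to re-arm the timer)
 */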
static gboolean
recurring_helper(gpointer data)
{
    remote_ra_cmd_t *cmd = data;
    lrm_state_t *connection_rsc = NULL;

    cmd->interval_id = 0;
    connection_rsc = lrm_state_find(cmd->rsc_id);
    if (connection_rsc && connection_rsc->remote_ra_data) {
        remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;

        ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);

        ra_data->cmds = g_list_append(ra_data->cmds, cmd);
        mainloop_set_trigger(ra_data->work);
    }
    return FALSE;
}

static gboolean
start_delay_helper(gpointer data)
{
    remote_ra_cmd_t *cmd = data;
    lrm_state_t *connection_rsc = NULL;

    cmd->delay_id = 0;
    connection_rsc = lrm_state_find(cmd->rsc_id);
    if (connection_rsc && connection_rsc->remote_ra_data) {
        remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;

        mainloop_set_trigger(ra_data->work);
    }
    return FALSE;
}

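/*!
 * \internal
 * \brief Check whether a lost remote node's attributes should be purged
 *
 * \param[in] node  Lost remote node to check
 *
 * \return true if the node's attributes should be purged, false if the
 *         connection has been up since before the node was lost
 */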
static bool
should_purge_attributes(crm_node_t *node)
{
    bool purge = true;
    crm_node_t *conn_node = NULL;
    lrm_state_t *connection_rsc = NULL;

    if (!node->conn_host) {
        return purge;
    }

    /* Get the node that was hosting the remote connection resource from the
     * peer cache. That's the one we really care about here.
     */
    conn_node = pcmk__get_node(0, node->conn_host, NULL,
                               pcmk__node_search_cluster_member);
    if (conn_node == NULL) {
        return purge;
    }

    /* Check the uptime of connection_rsc. If it hasn't been running long
     * enough, leave purge=true. "Long enough" means it started running
     * earlier than the timestamp when we noticed it went away in the first
     * place.
     */
    connection_rsc = lrm_state_find(node->uname);

    if (connection_rsc != NULL) {
        lrmd_t *lrm = connection_rsc->conn;
        time_t uptime = lrmd__uptime(lrm);
        time_t now = time(NULL);

        /* Add 20s of fuzziness to give corosync a while to notice the remote
         * host is gone. On various error conditions (failure to get uptime,
         * peer_lost isn't set) we default to purging.
         */
        if (uptime > 0 &&
            conn_node->peer_lost > 0 &&
            uptime + 20 >= now - conn_node->peer_lost) {
            purge = false;
        }
    }

    return purge;
}

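/*!
 * \internal
 * \brief Map a purge decision to the CIB node state section(s) to delete
 *
 * \param[in] purge  Whether node attributes should be deleted as well
 *
 * \return Section(s) to delete, honoring shutdown locks if they are enabled
 */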
static enum controld_section_e
section_to_delete(bool purge)
{
    if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
        if (purge) {
            return controld_section_all_unlocked;
        } else {
            return controld_section_lrm_unlocked;
        }
    } else {
        if (purge) {
            return controld_section_all;
        } else {
            return controld_section_lrm;
        }
    }
}

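/*!
 * \internal
 * \brief Purge a remote node's attributes from attrd and the CIB as appropriate
 *
 * \param[in]     call_opt  CIB call options to use for the deletion
 * \param[in,out] node      Remote node to purge
 */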
static void
purge_remote_node_attrs(int call_opt, crm_node_t *node)
{
    bool purge = should_purge_attributes(node);
    enum controld_section_e section = section_to_delete(purge);

    /* Purge node from attrd's memory */
    if (purge) {
        update_attrd_remote_node_removed(node->uname, NULL);
    }

    controld_delete_node_state(node->uname, section, call_opt);
}

/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node joining
 *
 * \param[in] node_name  Name of newly integrated pacemaker_remote node
 */
static void
remote_node_up(const char *node_name)
{
    int call_opt;
    xmlNode *update, *state;
    crm_node_t *node;
    lrm_state_t *connection_rsc = NULL;

    CRM_CHECK(node_name != NULL, return);
    crm_info("Announcing Pacemaker Remote node %s", node_name);

    call_opt = crmd_cib_smart_opt();

    /* Delete node's probe_complete attribute. This serves two purposes:
     *
     * - @COMPAT DCs < 1.1.14 in a rolling upgrade might use it
     * - deleting it (or any attribute for that matter) here ensures the
     *   attribute manager learns the node is remote
     */
    update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);

    /* Ensure node is in the remote peer cache with member status */
    node = pcmk__cluster_lookup_remote_node(node_name);
    CRM_CHECK(node != NULL, return);

    purge_remote_node_attrs(call_opt, node);
    pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);

    /* Apply any start state that we were given from the environment on the
     * remote node.
     */
    connection_rsc = lrm_state_find(node->uname);

    if (connection_rsc != NULL) {
        lrmd_t *lrm = connection_rsc->conn;
        const char *start_state = lrmd__node_start_state(lrm);

        if (start_state) {
            set_join_state(start_state, node->uname, node->uuid, true);
        }
    }

    /* pacemaker_remote nodes don't participate in the membership layer,
     * so cluster nodes don't automatically get notified when they come and go.
     * We send a cluster message to the DC, and update the CIB node state entry,
     * so the DC will get it sooner (via message) or later (via CIB refresh),
     * and any other interested parties can query the CIB.
     */
    broadcast_remote_state_message(node_name, true);

    update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
    state = create_node_state_update(node, node_update_cluster, update,
                                     __func__);

    /* Clear the PCMK__XA_NODE_FENCED flag in the node state. If the node ever
     * needs to be fenced, this flag will allow various actions to determine
     * whether the fencing has happened yet.
     */
    crm_xml_add(state, PCMK__XA_NODE_FENCED, "0");

    /* TODO: If the remote connection drops, and this (async) CIB update either
     * failed or has not yet completed, later actions could mistakenly think the
     * node has already been fenced (if the PCMK__XA_NODE_FENCED attribute was
     * previously set, because it won't have been cleared). This could prevent
     * actual fencing or allow recurring monitor failures to be cleared too
     * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
     */
    controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
    free_xml(update);
}

enum down_opts {
    DOWN_KEEP_LRM,
    DOWN_ERASE_LRM
};

/*!
 * \internal
 * \brief Handle cluster communication related to pacemaker_remote node leaving
 *
 * \param[in] node_name  Name of lost node
 * \param[in] opts       Whether to keep or erase LRM history
 */
static void
remote_node_down(const char *node_name, const enum down_opts opts)
{
    xmlNode *update;
    int call_opt = crmd_cib_smart_opt();
    crm_node_t *node;

    /* Purge node from attrd's memory */
    update_attrd_remote_node_removed(node_name, NULL);

    /* Normally, only node attributes should be erased, and the resource history
     * should be kept until the node comes back up. However, after a successful
     * fence, we want to clear the history as well, so we don't think resources
     * are still running on the node.
     */
    if (opts == DOWN_ERASE_LRM) {
        controld_delete_node_state(node_name, controld_section_all, call_opt);
    } else {
        controld_delete_node_state(node_name, controld_section_attrs, call_opt);
    }

    /* Ensure node is in the remote peer cache with lost state */
    node = pcmk__cluster_lookup_remote_node(node_name);
    CRM_CHECK(node != NULL, return);
    pcmk__update_peer_state(__func__, node, CRM_NODE_LOST, 0);

    /* Notify DC */
    broadcast_remote_state_message(node_name, false);

    /* Update CIB node state */
    update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
    create_node_state_update(node, node_update_cluster, update, __func__);
    controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
    free_xml(update);
}

/*!
 * \internal
 * \brief Handle effects of a remote RA command on node state
 *
 * \param[in] cmd  Completed remote RA command
 */
static void
check_remote_node_state(const remote_ra_cmd_t *cmd)
{
    /* Only successful actions can change node state */
    if (!pcmk__result_ok(&(cmd->result))) {
        return;
    }

    if (pcmk__str_eq(cmd->action, PCMK_ACTION_START, pcmk__str_casei)) {
        remote_node_up(cmd->rsc_id);

    } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_MIGRATE_FROM,
                            pcmk__str_casei)) {
        /* After a successful migration, we don't need to do remote_node_up()
         * because the DC already knows the node is up, and we don't want to
         * clear LRM history etc. We do need to add the remote node to this
         * host's remote peer cache, because (unless it happens to be DC)
         * it hasn't been tracking the remote node, and other code relies on
         * the cache to distinguish remote nodes from unseen cluster nodes.
         */
        crm_node_t *node = pcmk__cluster_lookup_remote_node(cmd->rsc_id);

        CRM_CHECK(node != NULL, return);
        pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);

    } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_casei)) {
        lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
        remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;

        if (ra_data) {
            if (!pcmk_is_set(ra_data->status, takeover_complete)) {
                /* Stop means down if we didn't successfully migrate elsewhere */
                remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
            } else if (AM_I_DC == FALSE) {
                /* Only the connection host and DC track node state,
                 * so if the connection migrated elsewhere and we aren't DC,
                 * un-cache the node, so we don't have stale info
                 */
                pcmk__cluster_forget_remote_node(cmd->rsc_id);
            }
        }
    }

    /* We don't do anything for successful monitors, which is correct for
     * routine recurring monitors, and for monitors on nodes where the
     * connection isn't supposed to be (the cluster will stop the connection in
     * that case). However, if the initial probe finds the connection already
     * active on the node where we want it, we probably should do
     * remote_node_up(). Unfortunately, we can't distinguish that case here.
     * Given that connections have to be initiated by the cluster, the chance of
     * that should be close to zero.
     */
}

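/*!
 * \internal
 * \brief Report a remote RA command's result to the controller as an executor event
 *
 * \param[in,out] cmd  Completed command to report
 */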
static void
report_remote_ra_result(remote_ra_cmd_t * cmd)
{
    lrmd_event_data_t op = { 0, };

    check_remote_node_state(cmd);

    op.type = lrmd_event_exec_complete;
    op.rsc_id = cmd->rsc_id;
    op.op_type = cmd->action;
    op.user_data = cmd->userdata;
    op.timeout = cmd->timeout;
    op.interval_ms = cmd->interval_ms;
    // coverity[store_truncates_time_t]
    op.t_run = (unsigned int) cmd->start_time;
    // coverity[store_truncates_time_t]
    op.t_rcchange = (unsigned int) cmd->start_time;

    lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
                     cmd->result.exit_reason);

    if (pcmk_is_set(cmd->status, cmd_reported_success) && !pcmk__result_ok(&(cmd->result))) {
        // coverity[store_truncates_time_t]
        op.t_rcchange = (unsigned int) time(NULL);
        /* This edge case should never occur, but if it does, the result is
         * that a failure will not be processed correctly. It is only remotely
         * possible because we can detect the failure of a connection
         * resource's TCP connection at any moment after start completes; the
         * actual recurring operation is just a connectivity ping.
         *
         * Basically, we are not guaranteed that the first successful monitor
         * op and a subsequent failed monitor op will have different
         * timestamps, but we have to make it look like the operations
         * occurred at separate times.
         */
        if (op.t_rcchange == op.t_run) {
            op.t_rcchange++;
        }
    }

    if (cmd->params) {
        lrmd_key_value_t *tmp;

        op.params = pcmk__strkey_table(free, free);
        for (tmp = cmd->params; tmp; tmp = tmp->next) {
            pcmk__insert_dup(op.params, tmp->key, tmp->value);
        }

    }
    op.call_id = cmd->call_id;
    op.remote_nodename = cmd->owner;

    lrm_op_callback(&op);

    if (op.params) {
        g_hash_table_destroy(op.params);
    }
    lrmd__reset_result(&op);
}

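/*!
 * \internal
 * \brief Recalculate how much of a command's timeout remains, from its start time
 *
 * \param[in,out] cmd  Command whose remaining_timeout to update (in milliseconds)
 */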
static void
update_remaining_timeout(remote_ra_cmd_t * cmd)
{
    cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
}

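/*!
 * \internal
 * \brief Timer callback to retry an in-flight start or migrate_from command
 *
 * \param[in,out] data  Connection's lrm_state_t object
 *
 * \return FALSE (to tell glib not to re-arm the timer)
 */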
static gboolean
retry_start_cmd_cb(gpointer data)
{
    lrm_state_t *lrm_state = data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd = NULL;
    int rc = ETIME;

    if (!ra_data || !ra_data->cur_cmd) {
        return FALSE;
    }
    cmd = ra_data->cur_cmd;
    if (!pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
                              PCMK_ACTION_MIGRATE_FROM, NULL)) {
        return FALSE;
    }
    update_remaining_timeout(cmd);

    if (cmd->remaining_timeout > 0) {
        rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
    } else {
        pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                         PCMK_EXEC_TIMEOUT,
                         "Not enough time remains to retry remote connection");
    }

    if (rc != pcmk_rc_ok) {
        report_remote_ra_result(cmd);

        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        ra_data->cur_cmd = NULL;
        free_cmd(cmd);
    } else {
        /* wait for connection event */
    }

    return FALSE;
}


static gboolean
connection_takeover_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    crm_info("takeover event timed out for node %s", cmd->rsc_id);
    cmd->takeover_timeout_id = 0;

    lrm_state = lrm_state_find(cmd->rsc_id);

    handle_remote_ra_stop(lrm_state, cmd);
    free_cmd(cmd);

    return FALSE;
}

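/*!
 * \internal
 * \brief Handle timeout of a poke-based monitor: fail the command and disconnect
 *
 * \param[in,out] data  Monitor command (remote_ra_cmd_t *) that timed out
 *
 * \return FALSE (to tell glib not to re-arm the timer)
 */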
static gboolean
monitor_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    lrm_state = lrm_state_find(cmd->rsc_id);

    crm_info("Timed out waiting for remote poke response from %s%s",
             cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
    cmd->monitor_timeout_id = 0;
    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
                     "Remote executor did not respond");

    if (lrm_state && lrm_state->remote_ra_data) {
        remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

        if (ra_data->cur_cmd == cmd) {
            ra_data->cur_cmd = NULL;
        }
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
    }

    report_remote_ra_result(cmd);
    free_cmd(cmd);

    if (lrm_state) {
        lrm_state_disconnect(lrm_state);
    }
    return FALSE;
}

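/*!
 * \internal
 * \brief Synthesize and process a successful executor event for an action
 *
 * \param[in,out] lrm_state  Executor state to pass the event to (NULL for local node)
 * \param[in]     rsc_id     ID of resource the event is for
 * \param[in]     op_type    Action name to report as succeeded
 */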
static void
synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
{
    lrmd_event_data_t op = { 0, };

    if (lrm_state == NULL) {
        /* if lrm_state is not given, assume local */
        lrm_state = lrm_state_find(controld_globals.our_nodename);
    }
    pcmk__assert(lrm_state != NULL);

    op.type = lrmd_event_exec_complete;
    op.rsc_id = rsc_id;
    op.op_type = op_type;
    // coverity[store_truncates_time_t]
    op.t_run = (unsigned int) time(NULL);
    op.t_rcchange = op.t_run;
    op.call_id = generate_callid();
    lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
    process_lrm_event(lrm_state, &op, NULL, NULL);
}

void
remote_lrm_op_callback(lrmd_event_data_t * op)
{
    gboolean cmd_handled = FALSE;
    lrm_state_t *lrm_state = NULL;
    remote_ra_data_t *ra_data = NULL;
    remote_ra_cmd_t *cmd = NULL;

    crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
              "(%d) status=%s (%d)",
              (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
              lrmd_event_type2str(op->type), op->remote_nodename,
              services_ocf_exitcode_str(op->rc), op->rc,
              pcmk_exec_status_str(op->op_status), op->op_status);

    lrm_state = lrm_state_find(op->remote_nodename);
    if (!lrm_state || !lrm_state->remote_ra_data) {
        crm_debug("No state information found for remote connection event");
        return;
    }
    ra_data = lrm_state->remote_ra_data;

    if (op->type == lrmd_event_new_client) {
        // Another client has connected to the remote daemon

        if (pcmk_is_set(ra_data->status, expect_takeover)) {
            // Great, we knew this was coming
            lrm_remote_clear_flags(lrm_state, expect_takeover);
            lrm_remote_set_flags(lrm_state, takeover_complete);

        } else {
            crm_err("Disconnecting from Pacemaker Remote node %s due to "
                    "unexpected client takeover", op->remote_nodename);
            /* In this case, lrmd_tls_connection_destroy() will be called under
             * the control of mainloop. Do not free lrm_state->conn yet; it
             * will be freed in the following stop action.
             */
            lrm_state_disconnect_only(lrm_state);
        }
        return;
    }

    /* filter all EXEC events up */
    if (op->type == lrmd_event_exec_complete) {
        if (pcmk_is_set(ra_data->status, takeover_complete)) {
            crm_debug("ignoring event, this connection is taken over by another node");
        } else {
            lrm_op_callback(op);
        }
        return;
    }

    if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {

        if (!pcmk_is_set(ra_data->status, remote_active)) {
            crm_debug("Disconnection from Pacemaker Remote node %s complete",
                      lrm_state->node_name);

        } else if (!remote_ra_is_in_maintenance(lrm_state)) {
            crm_err("Lost connection to Pacemaker Remote node %s",
                    lrm_state->node_name);
            ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
            ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);

        } else {
            crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
                       lrm_state->node_name);
            /* Do roughly what a 'stop' on the remote resource would do */
            handle_remote_ra_stop(lrm_state, NULL);
            remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
            /* now fake the reply of a successful 'stop' */
            synthesize_lrmd_success(NULL, lrm_state->node_name,
                                    PCMK_ACTION_STOP);
        }
        return;
    }

    if (!ra_data->cur_cmd) {
        crm_debug("no event to match");
        return;
    }

    cmd = ra_data->cur_cmd;

    /* Start and migrate_from actions complete after the connection
     * comes back to us. */
    if ((op->type == lrmd_event_connect)
        && pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
                                PCMK_ACTION_MIGRATE_FROM, NULL)) {
        if (op->connection_rc < 0) {
            update_remaining_timeout(cmd);

            if ((op->connection_rc == -ENOKEY)
                || (op->connection_rc == -EKEYREJECTED)) {
                // Hard error, don't retry
                pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
                                 PCMK_EXEC_ERROR,
                                 pcmk_strerror(op->connection_rc));

            } else if (cmd->remaining_timeout > 3000) {
                crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
                g_timeout_add(1000, retry_start_cmd_cb, lrm_state);
                return;

            } else {
                crm_trace("can't reschedule start, remaining timeout too small %d",
                          cmd->remaining_timeout);
                pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                    PCMK_EXEC_TIMEOUT,
                                    "%s without enough time to retry",
                                    pcmk_strerror(op->connection_rc));
            }

        } else {
            lrm_state_reset_tables(lrm_state, TRUE);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            lrm_remote_set_flags(lrm_state, remote_active);
        }

        crm_debug("Remote connection event matched %s action", cmd->action);
        report_remote_ra_result(cmd);
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_poke)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                               pcmk__str_casei)) {

        if (cmd->monitor_timeout_id) {
            g_source_remove(cmd->monitor_timeout_id);
            cmd->monitor_timeout_id = 0;
        }

        /* Only report success the first time; after that, only worry about
         * failures. For this function, getting the poke back is always a
         * success. Pokes fail only if the send fails or the response times
         * out. */
        if (!pcmk_is_set(cmd->status, cmd_reported_success)) {
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
            cmd_set_flags(cmd, cmd_reported_success);
        }

        crm_debug("Remote poke event matched %s action", cmd->action);

        /* success; keep rescheduling if an interval is present */
        if (cmd->interval_ms && !pcmk_is_set(cmd->status, cmd_cancel)) {
            ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
            cmd->interval_id = g_timeout_add(cmd->interval_ms,
                                             recurring_helper, cmd);
            cmd = NULL;         /* prevent free */
        }
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_disconnect)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                               pcmk__str_casei)) {
        if (pcmk_is_set(ra_data->status, remote_active) &&
            !pcmk_is_set(cmd->status, cmd_cancel)) {
            pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                             PCMK_EXEC_ERROR,
                             "Remote connection unexpectedly dropped "
                             "during monitor");
            report_remote_ra_result(cmd);
            crm_err("Remote connection to %s unexpectedly dropped during monitor",
                    lrm_state->node_name);
        }
        cmd_handled = TRUE;

    } else if ((op->type == lrmd_event_new_client)
               && pcmk__str_eq(cmd->action, PCMK_ACTION_STOP,
                               pcmk__str_casei)) {

        handle_remote_ra_stop(lrm_state, cmd);
        cmd_handled = TRUE;

    } else {
        crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
    }

    if (cmd_handled) {
        ra_data->cur_cmd = NULL;
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        free_cmd(cmd);
    }
}

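/*!
 * \internal
 * \brief Stop a remote connection: disconnect, drop queued commands, report result
 *
 * \param[in,out] lrm_state  Connection's executor state (must not be NULL)
 * \param[in,out] cmd        Stop command to report as successful (may be NULL)
 */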
static void
handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
{
    remote_ra_data_t *ra_data = NULL;

    pcmk__assert(lrm_state != NULL);
    ra_data = lrm_state->remote_ra_data;

    if (!pcmk_is_set(ra_data->status, takeover_complete)) {
        /* delete pending ops whenever the remote connection is intentionally
         * stopped */
        g_hash_table_remove_all(lrm_state->active_ops);
    } else {
        /* We no longer hold the history if this connection has been migrated;
         * however, we keep the metadata cache for future use. */
        lrm_state_reset_tables(lrm_state, FALSE);
    }

    lrm_remote_clear_flags(lrm_state, remote_active);
    lrm_state_disconnect(lrm_state);

    if (ra_data->cmds) {
        g_list_free_full(ra_data->cmds, free_cmd);
    }
    if (ra_data->recurring_cmds) {
        g_list_free_full(ra_data->recurring_cmds, free_cmd);
    }
    ra_data->cmds = NULL;
    ra_data->recurring_cmds = NULL;
    ra_data->cur_cmd = NULL;

    if (cmd) {
        pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
        report_remote_ra_result(cmd);
    }
}

// \return Standard Pacemaker return code
static int
handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
{
    const char *server = NULL;
    lrmd_key_value_t *tmp = NULL;
    int port = 0;
    int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
    int rc = pcmk_rc_ok;

    for (tmp = cmd->params; tmp; tmp = tmp->next) {
        if (pcmk__strcase_any_of(tmp->key,
                                 PCMK_REMOTE_RA_ADDR, PCMK_REMOTE_RA_SERVER,
                                 NULL)) {
            server = tmp->value;

        } else if (pcmk__str_eq(tmp->key, PCMK_REMOTE_RA_PORT,
                                pcmk__str_none)) {
            port = atoi(tmp->value);

        } else if (pcmk__str_eq(tmp->key, CRM_META "_" PCMK__META_CONTAINER,
                                pcmk__str_none)) {
            lrm_remote_set_flags(lrm_state, controlling_guest);
        }
    }

    rc = controld_connect_remote_executor(lrm_state, server, port,
                                          timeout_used);
    if (rc != pcmk_rc_ok) {
        pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                            PCMK_EXEC_ERROR,
                            "Could not connect to Pacemaker Remote node %s: %s",
                            lrm_state->node_name, pcmk_rc_str(rc));
    }
    return rc;
}

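/*!
 * \internal
 * \brief Mainloop trigger callback that drains the connection's command queue
 *
 * \param[in,out] user_data  Connection's lrm_state_t object
 *
 * \return TRUE (so the trigger remains active)
 */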
static gboolean
handle_remote_ra_exec(gpointer user_data)
{
    int rc = 0;
    lrm_state_t *lrm_state = user_data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd;
    GList *first = NULL;

    if (ra_data->cur_cmd) {
        /* still waiting on previous cmd */
        return TRUE;
    }

    while (ra_data->cmds) {
        first = ra_data->cmds;
        cmd = first->data;
        if (cmd->delay_id) {
            /* still waiting for start delay timer to trip */
            return TRUE;
        }

        ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
        g_list_free_1(first);

        if (pcmk__str_any_of(cmd->action, PCMK_ACTION_START,
                             PCMK_ACTION_MIGRATE_FROM, NULL)) {
            lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete);
            if (handle_remote_ra_start(lrm_state, cmd,
                                       cmd->timeout) == pcmk_rc_ok) {
                /* take care of this later when we get the async connection result */
                crm_debug("Initiated async remote connection, %s action will complete after connect event",
                          cmd->action);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_MONITOR)) {

            if (lrm_state_is_connected(lrm_state) == TRUE) {
                rc = lrm_state_poke_connection(lrm_state);
                if (rc < 0) {
                    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                     PCMK_EXEC_ERROR, pcmk_strerror(rc));
                }
            } else {
                rc = -1;
                pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
                                 PCMK_EXEC_DONE, "Remote connection inactive");
            }

            if (rc == 0) {
                crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
                          cmd->rsc_id);
                ra_data->cur_cmd = cmd;
                cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd);
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_STOP)) {

            if (pcmk_is_set(ra_data->status, expect_takeover)) {
                /* Briefly wait on stop for the takeover event to occur. If
                 * the takeover event does not occur during the wait period,
                 * that's fine; it just means that the remote node's
                 * lrm_status section will be cleared, which will require all
                 * the resources running on the remote node to be explicitly
                 * re-detected via probe actions. If the takeover does occur
                 * successfully, we can leave the status section intact. */
                cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }

            handle_remote_ra_stop(lrm_state, cmd);

        } else if (strcmp(cmd->action, PCMK_ACTION_MIGRATE_TO) == 0) {
            lrm_remote_clear_flags(lrm_state, takeover_complete);
            lrm_remote_set_flags(lrm_state, expect_takeover);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);

        } else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_RELOAD,
                                    PCMK_ACTION_RELOAD_AGENT, NULL))  {
            /* Currently the only reloadable parameter is
             * PCMK_REMOTE_RA_RECONNECT_INTERVAL, which is only used by the
             * scheduler via the CIB, so reloads are a no-op.
             *
             * @COMPAT DC <2.1.0: We only need to check for "reload" in case
             * we're in a rolling upgrade with a DC scheduling "reload" instead
             * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway,
             * so this would work for that purpose as well.
             */
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
        }

        free_cmd(cmd);
    }

    return TRUE;
}

static void
remote_ra_data_init(lrm_state_t * lrm_state)
{
    remote_ra_data_t *ra_data = NULL;

    if (lrm_state->remote_ra_data) {
        return;
    }

    ra_data = pcmk__assert_alloc(1, sizeof(remote_ra_data_t));
    ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
    lrm_state->remote_ra_data = ra_data;
}

void
remote_ra_cleanup(lrm_state_t * lrm_state)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

    if (!ra_data) {
        return;
    }

    if (ra_data->cmds) {
        g_list_free_full(ra_data->cmds, free_cmd);
    }

    if (ra_data->recurring_cmds) {
        g_list_free_full(ra_data->recurring_cmds, free_cmd);
    }
    mainloop_destroy_trigger(ra_data->work);
    free(ra_data);
    lrm_state->remote_ra_data = NULL;
}

gboolean
is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
{
    if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
        return TRUE;
    }
    if ((id != NULL) && (lrm_state_find(id) != NULL)
        && !pcmk__str_eq(id, controld_globals.our_nodename, pcmk__str_casei)) {
        return TRUE;
    }

    return FALSE;
}

lrmd_rsc_info_t *
remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
{
    lrmd_rsc_info_t *info = NULL;

    if ((lrm_state_find(rsc_id))) {
        info = pcmk__assert_alloc(1, sizeof(lrmd_rsc_info_t));

        info->id = pcmk__str_copy(rsc_id);
        info->type = pcmk__str_copy(REMOTE_LRMD_RA);
        info->standard = pcmk__str_copy(PCMK_RESOURCE_CLASS_OCF);
        info->provider = pcmk__str_copy("pacemaker");
    }

    return info;
}

static gboolean
is_remote_ra_supported_action(const char *action)
{
    return pcmk__str_any_of(action,
                            PCMK_ACTION_START,
                            PCMK_ACTION_STOP,
                            PCMK_ACTION_MONITOR,
                            PCMK_ACTION_MIGRATE_TO,
                            PCMK_ACTION_MIGRATE_FROM,
                            PCMK_ACTION_RELOAD_AGENT,
                            PCMK_ACTION_RELOAD,
                            NULL);
}

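/*!
 * \internal
 * \brief Fail, report, and remove all recurring monitor commands in a list
 *
 * \param[in,out] list  List of remote RA commands to scan
 *
 * \return Updated list with the failed monitor commands removed
 */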
static GList *
fail_all_monitor_cmds(GList * list)
{
    GList *rm_list = NULL;
    remote_ra_cmd_t *cmd = NULL;
    GList *gIter = NULL;

    for (gIter = list; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms > 0)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            rm_list = g_list_append(rm_list, cmd);
        }
    }

    for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;

        pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                         PCMK_EXEC_ERROR, "Lost connection to remote executor");
        crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
                  cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
        report_remote_ra_result(cmd);

        list = g_list_remove(list, cmd);
        free_cmd(cmd);
    }

    /* frees only the list data, not the cmds */
    g_list_free(rm_list);
    return list;
}

static GList *
remove_cmd(GList * list, const char *action, guint interval_ms)
{
    remote_ra_cmd_t *cmd = NULL;
    GList *gIter = NULL;

    for (gIter = list; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
            break;
        }
        cmd = NULL;
    }
    if (cmd) {
        list = g_list_remove(list, cmd);
        free_cmd(cmd);
    }
    return list;
}

int
remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
                 const char *action, guint interval_ms)
{
    lrm_state_t *connection_rsc = NULL;
    remote_ra_data_t *ra_data = NULL;

    connection_rsc = lrm_state_find(rsc_id);
    if (!connection_rsc || !connection_rsc->remote_ra_data) {
        return -EINVAL;
    }

    ra_data = connection_rsc->remote_ra_data;
    ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
    ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
                                         interval_ms);
    if (ra_data->cur_cmd &&
        (ra_data->cur_cmd->interval_ms == interval_ms) &&
        (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {

        cmd_set_flags(ra_data->cur_cmd, cmd_cancel);
    }

    return 0;
}

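/*!
 * \internal
 * \brief Merge a new recurring monitor request into an existing duplicate, if any
 *
 * \param[in,out] ra_data      Connection's remote RA data to search
 * \param[in]     interval_ms  Monitor interval to match (0 is never a duplicate)
 * \param[in]     userdata     New userdata to store in the matched command
 *
 * \return Existing command that was merged into, or NULL if none matched
 */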
static remote_ra_cmd_t *
handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
                   const char *userdata)
{
    GList *gIter = NULL;
    remote_ra_cmd_t *cmd = NULL;

    /* There are three places a potential duplicate monitor operation could
     * exist:
     * 1. the recurring_cmds list, where the op is waiting for its next interval
     * 2. the cmds list, where the op is queued to be executed immediately
     * 3. cur_cmd, which means the monitor op is in flight right now
     */
    if (interval_ms == 0) {
        return NULL;
    }

    if (ra_data->cur_cmd &&
        !pcmk_is_set(ra_data->cur_cmd->status, cmd_cancel) &&
        (ra_data->cur_cmd->interval_ms == interval_ms)
        && pcmk__str_eq(ra_data->cur_cmd->action, PCMK_ACTION_MONITOR,
                        pcmk__str_casei)) {

        cmd = ra_data->cur_cmd;
        goto handle_dup;
    }

    for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    return NULL;

handle_dup:

    crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
              cmd->rsc_id, PCMK_ACTION_MONITOR, interval_ms);

    /* update the userdata */
    if (userdata) {
       free(cmd->userdata);
       cmd->userdata = pcmk__str_copy(userdata);
    }

    /* if we've already reported success, generate a new call id */
    if (pcmk_is_set(cmd->status, cmd_reported_success)) {
        cmd->start_time = time(NULL);
        cmd->call_id = generate_callid();
        cmd_clear_flags(cmd, cmd_reported_success);
    }

    /* If we have an interval_id set, we are in the process of waiting for
     * this cmd's next interval. Instead of waiting, cancel the timer and
     * execute the action immediately. */
    if (cmd->interval_id) {
        g_source_remove(cmd->interval_id);
        cmd->interval_id = 0;
        recurring_helper(cmd);
    }

    return cmd;
}

/*!
 * \internal
 * \brief Execute an action using the (internal) ocf:pacemaker:remote agent
 *
 * \param[in]     lrm_state      Executor state object for remote connection
 * \param[in]     rsc_id         Connection resource ID
 * \param[in]     action         Action to execute
 * \param[in]     userdata       String to copy and pass to execution callback
 * \param[in]     interval_ms    Action interval (in milliseconds)
 * \param[in]     timeout_ms     Action timeout (in milliseconds)
 * \param[in]     start_delay_ms Delay (in milliseconds) before executing action
 * \param[in,out] params         Connection resource parameters
 * \param[out]    call_id        Where to store call ID on success
 *
 * \return Standard Pacemaker return code
 * \note This takes ownership of \p params, which should not be used or freed
 *       after calling this function.
 */
int
controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id,
                              const char *action, const char *userdata,
                              guint interval_ms, int timeout_ms,
                              int start_delay_ms, lrmd_key_value_t *params,
                              int *call_id)
{
    lrm_state_t *connection_rsc = NULL;
    remote_ra_cmd_t *cmd = NULL;
    remote_ra_data_t *ra_data = NULL;

    CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
              && (userdata != NULL) && (call_id != NULL),
              lrmd_key_value_freeall(params); return EINVAL);

    *call_id = 0;

    if (!is_remote_ra_supported_action(action)) {
        lrmd_key_value_freeall(params);
        return EOPNOTSUPP;
    }

    connection_rsc = lrm_state_find(rsc_id);
    if (connection_rsc == NULL) {
        lrmd_key_value_freeall(params);
        return ENOTCONN;
    }

    remote_ra_data_init(connection_rsc);
    ra_data = connection_rsc->remote_ra_data;

    cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
    if (cmd) {
        *call_id = cmd->call_id;
        lrmd_key_value_freeall(params);
        return pcmk_rc_ok;
    }

    cmd = pcmk__assert_alloc(1, sizeof(remote_ra_cmd_t));

    cmd->owner = pcmk__str_copy(lrm_state->node_name);
    cmd->rsc_id = pcmk__str_copy(rsc_id);
    cmd->action = pcmk__str_copy(action);
    cmd->userdata = pcmk__str_copy(userdata);
    cmd->interval_ms = interval_ms;
    cmd->timeout = timeout_ms;
    cmd->start_delay = start_delay_ms;
    cmd->params = params;
    cmd->start_time = time(NULL);

    cmd->call_id = generate_callid();

    if (cmd->start_delay) {
        cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
    }

    ra_data->cmds = g_list_append(ra_data->cmds, cmd);
    mainloop_set_trigger(ra_data->work);

    *call_id = cmd->call_id;
    return pcmk_rc_ok;
}

/*!
 * \internal
 * \brief Immediately fail all monitors of a remote node, if proxied here
 *
 * \param[in] node_name  Name of pacemaker_remote node
 */
void
remote_ra_fail(const char *node_name)
{
    lrm_state_t *lrm_state = lrm_state_find(node_name);

    if (lrm_state && lrm_state_is_connected(lrm_state)) {
        remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

        crm_info("Failing monitors on Pacemaker Remote node %s", node_name);
        ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
        ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
    }
}

/* A guest node fencing implied by host fencing looks like:
 *
 *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
 *                on_node="lxc1" on_node_uuid="lxc1">
 *     <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
 *                 CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
 *     <downed>
 *       <node id="lxc1"/>
 *     </downed>
 *  </pseudo_event>
 */
#define XPATH_PSEUDO_FENCE "/" PCMK__XE_PSEUDO_EVENT \
    "[@" PCMK_XA_OPERATION "='stonith']/" PCMK__XE_DOWNED "/" PCMK_XE_NODE

/*!
 * \internal
 * \brief Check a pseudo-action for Pacemaker Remote node side effects
 *
 * \param[in,out] xml  XML of pseudo-action to check
 */
void
remote_ra_process_pseudo(xmlNode *xml)
{
    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);

    if (numXpathResults(search) == 1) {
        xmlNode *result = getXpathResult(search, 0);

        /* Normally, we handle the necessary side effects of a guest node stop
         * action when reporting the remote agent's result. However, if the stop
         * is implied due to fencing, it will be a fencing pseudo-event, and
         * there won't be a result to report. Handle that case here.
         *
         * This will result in a duplicate call to remote_node_down() if the
         * guest stop was real instead of implied, but that shouldn't hurt.
         *
         * There is still one corner case that isn't handled: if a guest node
         * isn't running any resources when its host is fenced, it will appear
         * to be cleanly stopped, so there will be no pseudo-fence, and our
         * peer cache state will be incorrect unless and until the guest is
         * recovered.
         */
        if (result) {
            const char *remote = pcmk__xe_id(result);

            if (remote) {
                remote_node_down(remote, DOWN_ERASE_LRM);
            }
        }
    }
    freeXpathObject(search);
}

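/*!
 * \internal
 * \brief Record a remote node's maintenance state in the CIB and locally
 *
 * \param[in,out] lrm_state    Connection's executor state
 * \param[in]     maintenance  TRUE if the node is entering maintenance mode
 */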
static void
remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
{
    xmlNode *update, *state;
    int call_opt;
    crm_node_t *node;

    call_opt = crmd_cib_smart_opt();
    node = pcmk__cluster_lookup_remote_node(lrm_state->node_name);
    CRM_CHECK(node != NULL, return);
    update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
    state = create_node_state_update(node, node_update_none, update,
                                     __func__);
    crm_xml_add(state, PCMK__XA_NODE_IN_MAINTENANCE, (maintenance? "1" : "0"));
    if (controld_update_cib(PCMK_XE_STATUS, update, call_opt,
                            NULL) == pcmk_rc_ok) {
        /* TODO: still not 100% sure that async update will succeed ... */
        if (maintenance) {
            lrm_remote_set_flags(lrm_state, remote_in_maint);
        } else {
            lrm_remote_clear_flags(lrm_state, remote_in_maint);
        }
    }
    free_xml(update);
}

#define XPATH_PSEUDO_MAINTENANCE "//" PCMK__XE_PSEUDO_EVENT         \
    "[@" PCMK_XA_OPERATION "='" PCMK_ACTION_MAINTENANCE_NODES "']/" \
    PCMK__XE_MAINTENANCE

/*!
 * \internal
 * \brief Check a pseudo-action holding updates for maintenance state
 *
 * \param[in,out] xml  XML of pseudo-action to check
 */
void
remote_ra_process_maintenance_nodes(xmlNode *xml)
{
    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);

    if (numXpathResults(search) == 1) {
        xmlNode *node;
        int cnt = 0, cnt_remote = 0;

        for (node = pcmk__xe_first_child(getXpathResult(search, 0),
                                         PCMK_XE_NODE, NULL, NULL);
             node != NULL; node = pcmk__xe_next_same(node)) {

            lrm_state_t *lrm_state = lrm_state_find(pcmk__xe_id(node));

            cnt++;
            if (lrm_state && lrm_state->remote_ra_data &&
                pcmk_is_set(((remote_ra_data_t *) lrm_state->remote_ra_data)->status, remote_active)) {

                const char *in_maint_s = NULL;
                int in_maint;

                cnt_remote++;
                in_maint_s = crm_element_value(node,
                                               PCMK__XA_NODE_IN_MAINTENANCE);
                pcmk__scan_min_int(in_maint_s, &in_maint, 0);
                remote_ra_maintenance(lrm_state, in_maint);
            }
        }
        crm_trace("Action holds %d nodes (%d remotes found) adjusting "
                  PCMK_OPT_MAINTENANCE_MODE,
                  cnt, cnt_remote);
    }
    freeXpathObject(search);
}

gboolean
remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    return pcmk_is_set(ra_data->status, remote_in_maint);
}

gboolean
remote_ra_controlling_guest(lrm_state_t * lrm_state)
{
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    return pcmk_is_set(ra_data->status, controlling_guest);
}
