root/daemons/controld/controld_remote_ra.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. free_cmd
  2. generate_callid
  3. recurring_helper
  4. start_delay_helper
  5. should_purge_attributes
  6. section_to_delete
  7. purge_remote_node_attrs
  8. remote_node_up
  9. remote_node_down
  10. check_remote_node_state
  11. report_remote_ra_result
  12. update_remaining_timeout
  13. retry_start_cmd_cb
  14. connection_takeover_timeout_cb
  15. monitor_timeout_cb
  16. synthesize_lrmd_success
  17. remote_lrm_op_callback
  18. handle_remote_ra_stop
  19. handle_remote_ra_start
  20. handle_remote_ra_exec
  21. remote_ra_data_init
  22. remote_ra_cleanup
  23. is_remote_lrmd_ra
  24. remote_ra_get_rsc_info
  25. is_remote_ra_supported_action
  26. fail_all_monitor_cmds
  27. remove_cmd
  28. remote_ra_cancel
  29. handle_dup_monitor
  30. controld_execute_remote_agent
  31. remote_ra_fail
  32. remote_ra_process_pseudo
  33. remote_ra_maintenance
  34. remote_ra_process_maintenance_nodes
  35. remote_ra_is_in_maintenance
  36. remote_ra_controlling_guest

   1 /*
   2  * Copyright 2013-2024 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <crm/crm.h>
  13 #include <crm/common/xml.h>
  14 #include <crm/common/xml_internal.h>
  15 #include <crm/lrmd.h>
  16 #include <crm/lrmd_internal.h>
  17 #include <crm/services.h>
  18 
  19 #include <pacemaker-controld.h>
  20 
  21 #define REMOTE_LRMD_RA "remote"
  22 
  23 /* The max start timeout before cmd retry */
  24 #define MAX_START_TIMEOUT_MS 10000
  25 
  26 #define cmd_set_flags(cmd, flags_to_set) do { \
  27     (cmd)->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, \
  28                                        "Remote command", (cmd)->rsc_id, (cmd)->status, \
  29                                        (flags_to_set), #flags_to_set); \
  30         } while (0)
  31 
  32 #define cmd_clear_flags(cmd, flags_to_clear) do { \
  33     (cmd)->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, \
  34                                          "Remote command", (cmd)->rsc_id, (cmd)->status, \
  35                                          (flags_to_clear), #flags_to_clear); \
  36         } while (0)
  37 
// Bit flags for remote_ra_cmd_t:status
enum remote_cmd_status {
    cmd_reported_success    = (1 << 0),     // A success result was already reported for this command
    cmd_cancel              = (1 << 1),     // The command has been cancelled
};
  42 
  43 typedef struct remote_ra_cmd_s {
  44     /*! the local node the cmd is issued from */
  45     char *owner;
  46     /*! the remote node the cmd is executed on */
  47     char *rsc_id;
  48     /*! the action to execute */
  49     char *action;
  50     /*! some string the client wants us to give it back */
  51     char *userdata;
  52     /*! start delay in ms */
  53     int start_delay;
  54     /*! timer id used for start delay. */
  55     int delay_id;
  56     /*! timeout in ms for cmd */
  57     int timeout;
  58     int remaining_timeout;
  59     /*! recurring interval in ms */
  60     guint interval_ms;
  61     /*! interval timer id */
  62     int interval_id;
  63     int monitor_timeout_id;
  64     int takeover_timeout_id;
  65     /*! action parameters */
  66     lrmd_key_value_t *params;
  67     pcmk__action_result_t result;
  68     int call_id;
  69     time_t start_time;
  70     uint32_t status;
  71 } remote_ra_cmd_t;
  72 
  73 #define lrm_remote_set_flags(lrm_state, flags_to_set) do { \
  74     lrm_state_t *lrm = (lrm_state); \
  75     remote_ra_data_t *ra = lrm->remote_ra_data; \
  76     ra->status = pcmk__set_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
  77                                     lrm->node_name, ra->status, \
  78                                     (flags_to_set), #flags_to_set); \
  79         } while (0)
  80 
  81 #define lrm_remote_clear_flags(lrm_state, flags_to_clear) do { \
  82     lrm_state_t *lrm = (lrm_state); \
  83     remote_ra_data_t *ra = lrm->remote_ra_data; \
  84     ra->status = pcmk__clear_flags_as(__func__, __LINE__, LOG_TRACE, "Remote", \
  85                                       lrm->node_name, ra->status, \
  86                                       (flags_to_clear), #flags_to_clear); \
  87         } while (0)
  88 
  89 enum remote_status {
  90     expect_takeover     = (1 << 0),
  91     takeover_complete   = (1 << 1),
  92     remote_active       = (1 << 2),
  93     /* Maintenance mode is difficult to determine from the controller's context,
  94      * so we have it signalled back with the transition from the scheduler.
  95      */
  96     remote_in_maint     = (1 << 3),
  97     /* Similar for whether we are controlling a guest node or remote node.
  98      * Fortunately there is a meta-attribute in the transition already and
  99      * as the situation doesn't change over time we can use the
 100      * resource start for noting down the information for later use when
 101      * the attributes aren't at hand.
 102      */
 103     controlling_guest   = (1 << 4),
 104 };
 105 
// Per-connection bookkeeping for a remote resource agent
typedef struct remote_ra_data_s {
    crm_trigger_t *work;        // Mainloop trigger that drives queued command execution
    remote_ra_cmd_t *cur_cmd;   // Command currently in flight (NULL if none)
    GList *cmds;                // Commands queued for execution
    GList *recurring_cmds;      // Recurring monitors waiting out their interval
    uint32_t status;            // Group of enum remote_status flags
} remote_ra_data_t;
 113 
 114 static int handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms);
 115 static void handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd);
 116 static GList *fail_all_monitor_cmds(GList * list);
 117 
 118 static void
 119 free_cmd(gpointer user_data)
     /* [previous][next][first][last][top][bottom][index][help] */
 120 {
 121     remote_ra_cmd_t *cmd = user_data;
 122 
 123     if (!cmd) {
 124         return;
 125     }
 126     if (cmd->delay_id) {
 127         g_source_remove(cmd->delay_id);
 128     }
 129     if (cmd->interval_id) {
 130         g_source_remove(cmd->interval_id);
 131     }
 132     if (cmd->monitor_timeout_id) {
 133         g_source_remove(cmd->monitor_timeout_id);
 134     }
 135     if (cmd->takeover_timeout_id) {
 136         g_source_remove(cmd->takeover_timeout_id);
 137     }
 138     free(cmd->owner);
 139     free(cmd->rsc_id);
 140     free(cmd->action);
 141     free(cmd->userdata);
 142     pcmk__reset_result(&(cmd->result));
 143     lrmd_key_value_freeall(cmd->params);
 144     free(cmd);
 145 }
 146 
 147 static int
 148 generate_callid(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 149 {
 150     static int remote_ra_callid = 0;
 151 
 152     remote_ra_callid++;
 153     if (remote_ra_callid <= 0) {
 154         remote_ra_callid = 1;
 155     }
 156 
 157     return remote_ra_callid;
 158 }
 159 
 160 static gboolean
 161 recurring_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 162 {
 163     remote_ra_cmd_t *cmd = data;
 164     lrm_state_t *connection_rsc = NULL;
 165 
 166     cmd->interval_id = 0;
 167     connection_rsc = lrm_state_find(cmd->rsc_id);
 168     if (connection_rsc && connection_rsc->remote_ra_data) {
 169         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 170 
 171         ra_data->recurring_cmds = g_list_remove(ra_data->recurring_cmds, cmd);
 172 
 173         ra_data->cmds = g_list_append(ra_data->cmds, cmd);
 174         mainloop_set_trigger(ra_data->work);
 175     }
 176     return FALSE;
 177 }
 178 
 179 static gboolean
 180 start_delay_helper(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 181 {
 182     remote_ra_cmd_t *cmd = data;
 183     lrm_state_t *connection_rsc = NULL;
 184 
 185     cmd->delay_id = 0;
 186     connection_rsc = lrm_state_find(cmd->rsc_id);
 187     if (connection_rsc && connection_rsc->remote_ra_data) {
 188         remote_ra_data_t *ra_data = connection_rsc->remote_ra_data;
 189 
 190         mainloop_set_trigger(ra_data->work);
 191     }
 192     return FALSE;
 193 }
 194 
 195 static bool
 196 should_purge_attributes(crm_node_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 197 {
 198     bool purge = true;
 199     crm_node_t *conn_node = NULL;
 200     lrm_state_t *connection_rsc = NULL;
 201 
 202     if (!node->conn_host) {
 203         return purge;
 204     }
 205 
 206     /* Get the node that was hosting the remote connection resource from the
 207      * peer cache.  That's the one we really care about here.
 208      */
 209     conn_node = pcmk__get_node(0, node->conn_host, NULL,
 210                                pcmk__node_search_cluster_member);
 211     if (conn_node == NULL) {
 212         return purge;
 213     }
 214 
 215     /* Check the uptime of connection_rsc.  If it hasn't been running long
 216      * enough, set purge=true.  "Long enough" means it started running earlier
 217      * than the timestamp when we noticed it went away in the first place.
 218      */
 219     connection_rsc = lrm_state_find(node->uname);
 220 
 221     if (connection_rsc != NULL) {
 222         lrmd_t *lrm = connection_rsc->conn;
 223         time_t uptime = lrmd__uptime(lrm);
 224         time_t now = time(NULL);
 225 
 226         /* Add 20s of fuzziness to give corosync a while to notice the remote
 227          * host is gone.  On various error conditions (failure to get uptime,
 228          * peer_lost isn't set) we default to purging.
 229          */
 230         if (uptime > 0 &&
 231             conn_node->peer_lost > 0 &&
 232             uptime + 20 >= now - conn_node->peer_lost) {
 233             purge = false;
 234         }
 235     }
 236 
 237     return purge;
 238 }
 239 
 240 static enum controld_section_e
 241 section_to_delete(bool purge)
     /* [previous][next][first][last][top][bottom][index][help] */
 242 {
 243     if (pcmk_is_set(controld_globals.flags, controld_shutdown_lock_enabled)) {
 244         if (purge) {
 245             return controld_section_all_unlocked;
 246         } else {
 247             return controld_section_lrm_unlocked;
 248         }
 249     } else {
 250         if (purge) {
 251             return controld_section_all;
 252         } else {
 253             return controld_section_lrm;
 254         }
 255     }
 256 }
 257 
 258 static void
 259 purge_remote_node_attrs(int call_opt, crm_node_t *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 260 {
 261     bool purge = should_purge_attributes(node);
 262     enum controld_section_e section = section_to_delete(purge);
 263 
 264     /* Purge node from attrd's memory */
 265     if (purge) {
 266         update_attrd_remote_node_removed(node->uname, NULL);
 267     }
 268 
 269     controld_delete_node_state(node->uname, section, call_opt);
 270 }
 271 
 272 /*!
 273  * \internal
 274  * \brief Handle cluster communication related to pacemaker_remote node joining
 275  *
 276  * \param[in] node_name  Name of newly integrated pacemaker_remote node
 277  */
static void
remote_node_up(const char *node_name)
{
    int call_opt;
    xmlNode *update, *state;
    crm_node_t *node;
    lrm_state_t *connection_rsc = NULL;

    CRM_CHECK(node_name != NULL, return);
    crm_info("Announcing Pacemaker Remote node %s", node_name);

    call_opt = crmd_cib_smart_opt();

    /* Delete node's probe_complete attribute. This serves two purposes:
     *
     * - @COMPAT DCs < 1.1.14 in a rolling upgrade might use it
     * - deleting it (or any attribute for that matter) here ensures the
     *   attribute manager learns the node is remote
     */
    update_attrd(node_name, CRM_OP_PROBED, NULL, NULL, TRUE);

    /* Ensure node is in the remote peer cache with member status */
    node = pcmk__cluster_lookup_remote_node(node_name);
    CRM_CHECK(node != NULL, return);

    /* Clear stale attributes (and possibly state) before recording
     * membership, then mark the node as a member in the peer cache
     */
    purge_remote_node_attrs(call_opt, node);
    pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);

    /* Apply any start state that we were given from the environment on the
     * remote node.
     */
    connection_rsc = lrm_state_find(node->uname);

    if (connection_rsc != NULL) {
        lrmd_t *lrm = connection_rsc->conn;
        const char *start_state = lrmd__node_start_state(lrm);

        if (start_state) {
            set_join_state(start_state, node->uname, node->uuid, true);
        }
    }

    /* pacemaker_remote nodes don't participate in the membership layer,
     * so cluster nodes don't automatically get notified when they come and go.
     * We send a cluster message to the DC, and update the CIB node state entry,
     * so the DC will get it sooner (via message) or later (via CIB refresh),
     * and any other interested parties can query the CIB.
     */
    broadcast_remote_state_message(node_name, true);

    update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
    state = create_node_state_update(node, node_update_cluster, update,
                                     __func__);

    /* Clear the PCMK__XA_NODE_FENCED flag in the node state. If the node ever
     * needs to be fenced, this flag will allow various actions to determine
     * whether the fencing has happened yet.
     */
    crm_xml_add(state, PCMK__XA_NODE_FENCED, "0");

    /* TODO: If the remote connection drops, and this (async) CIB update either
     * failed or has not yet completed, later actions could mistakenly think the
     * node has already been fenced (if the PCMK__XA_NODE_FENCED attribute was
     * previously set, because it won't have been cleared). This could prevent
     * actual fencing or allow recurring monitor failures to be cleared too
     * soon. Ideally, we wouldn't rely on the CIB for the fenced status.
     */
    controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
    free_xml(update);
}
 348 
// Whether remote_node_down() should retain or erase the node's resource history
enum down_opts {
    DOWN_KEEP_LRM,      // Keep resource history (normal connection stop)
    DOWN_ERASE_LRM      // Erase resource history (after a successful fence)
};
 353 
 354 /*!
 355  * \internal
 356  * \brief Handle cluster communication related to pacemaker_remote node leaving
 357  *
 358  * \param[in] node_name  Name of lost node
 359  * \param[in] opts       Whether to keep or erase LRM history
 360  */
 361 static void
 362 remote_node_down(const char *node_name, const enum down_opts opts)
     /* [previous][next][first][last][top][bottom][index][help] */
 363 {
 364     xmlNode *update;
 365     int call_opt = crmd_cib_smart_opt();
 366     crm_node_t *node;
 367 
 368     /* Purge node from attrd's memory */
 369     update_attrd_remote_node_removed(node_name, NULL);
 370 
 371     /* Normally, only node attributes should be erased, and the resource history
 372      * should be kept until the node comes back up. However, after a successful
 373      * fence, we want to clear the history as well, so we don't think resources
 374      * are still running on the node.
 375      */
 376     if (opts == DOWN_ERASE_LRM) {
 377         controld_delete_node_state(node_name, controld_section_all, call_opt);
 378     } else {
 379         controld_delete_node_state(node_name, controld_section_attrs, call_opt);
 380     }
 381 
 382     /* Ensure node is in the remote peer cache with lost state */
 383     node = pcmk__cluster_lookup_remote_node(node_name);
 384     CRM_CHECK(node != NULL, return);
 385     pcmk__update_peer_state(__func__, node, CRM_NODE_LOST, 0);
 386 
 387     /* Notify DC */
 388     broadcast_remote_state_message(node_name, false);
 389 
 390     /* Update CIB node state */
 391     update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
 392     create_node_state_update(node, node_update_cluster, update, __func__);
 393     controld_update_cib(PCMK_XE_STATUS, update, call_opt, NULL);
 394     free_xml(update);
 395 }
 396 
 397 /*!
 398  * \internal
 399  * \brief Handle effects of a remote RA command on node state
 400  *
 401  * \param[in] cmd  Completed remote RA command
 402  */
static void
check_remote_node_state(const remote_ra_cmd_t *cmd)
{
    /* Only successful actions can change node state */
    if (!pcmk__result_ok(&(cmd->result))) {
        return;
    }

    if (pcmk__str_eq(cmd->action, PCMK_ACTION_START, pcmk__str_casei)) {
        // A successful start integrates the remote node into the cluster
        remote_node_up(cmd->rsc_id);

    } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_MIGRATE_FROM,
                            pcmk__str_casei)) {
        /* After a successful migration, we don't need to do remote_node_up()
         * because the DC already knows the node is up, and we don't want to
         * clear LRM history etc. We do need to add the remote node to this
         * host's remote peer cache, because (unless it happens to be DC)
         * it hasn't been tracking the remote node, and other code relies on
         * the cache to distinguish remote nodes from unseen cluster nodes.
         */
        crm_node_t *node = pcmk__cluster_lookup_remote_node(cmd->rsc_id);

        CRM_CHECK(node != NULL, return);
        pcmk__update_peer_state(__func__, node, CRM_NODE_MEMBER, 0);

    } else if (pcmk__str_eq(cmd->action, PCMK_ACTION_STOP, pcmk__str_casei)) {
        lrm_state_t *lrm_state = lrm_state_find(cmd->rsc_id);
        remote_ra_data_t *ra_data = lrm_state? lrm_state->remote_ra_data : NULL;

        if (ra_data) {
            if (!pcmk_is_set(ra_data->status, takeover_complete)) {
                /* Stop means down if we didn't successfully migrate elsewhere */
                remote_node_down(cmd->rsc_id, DOWN_KEEP_LRM);
            } else if (AM_I_DC == FALSE) {
                /* Only the connection host and DC track node state,
                 * so if the connection migrated elsewhere and we aren't DC,
                 * un-cache the node, so we don't have stale info
                 */
                pcmk__cluster_forget_remote_node(cmd->rsc_id);
            }
        }
    }

    /* We don't do anything for successful monitors, which is correct for
     * routine recurring monitors, and for monitors on nodes where the
     * connection isn't supposed to be (the cluster will stop the connection in
     * that case). However, if the initial probe finds the connection already
     * active on the node where we want it, we probably should do
     * remote_node_up(). Unfortunately, we can't distinguish that case here.
     * Given that connections have to be initiated by the cluster, the chance of
     * that should be close to zero.
     */
}
 456 
/*!
 * \internal
 * \brief Report a remote RA command's result via the executor callback path
 *
 * Applies any node-state side effects of the result, then synthesizes an
 * executor event from \p cmd and passes it to lrm_op_callback().
 *
 * \param[in,out] cmd  Completed command to report
 */
static void
report_remote_ra_result(remote_ra_cmd_t * cmd)
{
    lrmd_event_data_t op = { 0, };

    check_remote_node_state(cmd);

    // Build the synthetic executor event from the command
    op.type = lrmd_event_exec_complete;
    op.rsc_id = cmd->rsc_id;
    op.op_type = cmd->action;
    op.user_data = cmd->userdata;
    op.timeout = cmd->timeout;
    op.interval_ms = cmd->interval_ms;
    op.t_run = (unsigned int) cmd->start_time;
    op.t_rcchange = (unsigned int) cmd->start_time;

    lrmd__set_result(&op, cmd->result.exit_status, cmd->result.execution_status,
                     cmd->result.exit_reason);

    // A failure after an earlier reported success changed the rc at "now"
    if (pcmk_is_set(cmd->status, cmd_reported_success) && !pcmk__result_ok(&(cmd->result))) {
        op.t_rcchange = (unsigned int) time(NULL);
        /* This edge case will likely never ever occur, but if it does the
         * result is that a failure will not be processed correctly. This is only
         * remotely possible because we are able to detect a connection resource's tcp
         * connection has failed at any moment after start has completed. The actual
         * recurring operation is just a connectivity ping.
         *
         * basically, we are not guaranteed that the first successful monitor op and
         * a subsequent failed monitor op will not occur in the same timestamp. We have to
         * make it look like the operations occurred at separate times though. */
        if (op.t_rcchange == op.t_run) {
            op.t_rcchange++;
        }
    }

    // Copy the command's parameters into the event (event owns the copies)
    if (cmd->params) {
        lrmd_key_value_t *tmp;

        op.params = pcmk__strkey_table(free, free);
        for (tmp = cmd->params; tmp; tmp = tmp->next) {
            pcmk__insert_dup(op.params, tmp->key, tmp->value);
        }

    }
    op.call_id = cmd->call_id;
    op.remote_nodename = cmd->owner;

    lrm_op_callback(&op);

    // Free only what this function allocated; cmd still owns its strings
    if (op.params) {
        g_hash_table_destroy(op.params);
    }
    lrmd__reset_result(&op);
}
 511 
 512 static void
 513 update_remaining_timeout(remote_ra_cmd_t * cmd)
     /* [previous][next][first][last][top][bottom][index][help] */
 514 {
 515     cmd->remaining_timeout = ((cmd->timeout / 1000) - (time(NULL) - cmd->start_time)) * 1000;
 516 }
 517 
/*!
 * \internal
 * \brief Timer callback to retry a pending remote start or migrate_from
 *
 * \param[in,out] data  Connection's lrm_state_t
 *
 * \return FALSE (one-shot timer; do not reschedule)
 */
static gboolean
retry_start_cmd_cb(gpointer data)
{
    lrm_state_t *lrm_state = data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd = NULL;
    int rc = ETIME;     // Default result if the retry can't be attempted

    // The command may have completed or been replaced since the timer was set
    if (!ra_data || !ra_data->cur_cmd) {
        return FALSE;
    }
    cmd = ra_data->cur_cmd;
    if (!pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
                              PCMK_ACTION_MIGRATE_FROM, NULL)) {
        return FALSE;
    }
    update_remaining_timeout(cmd);

    if (cmd->remaining_timeout > 0) {
        rc = handle_remote_ra_start(lrm_state, cmd, cmd->remaining_timeout);
    } else {
        pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                         PCMK_EXEC_TIMEOUT,
                         "Not enough time remains to retry remote connection");
    }

    if (rc != pcmk_rc_ok) {
        // Retry failed or was impossible: report the failure and move on
        report_remote_ra_result(cmd);

        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
        ra_data->cur_cmd = NULL;
        free_cmd(cmd);
    } else {
        /* wait for connection event */
    }

    return FALSE;
}
 558 
 559 
 560 static gboolean
 561 connection_takeover_timeout_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 562 {
 563     lrm_state_t *lrm_state = NULL;
 564     remote_ra_cmd_t *cmd = data;
 565 
 566     crm_info("takeover event timed out for node %s", cmd->rsc_id);
 567     cmd->takeover_timeout_id = 0;
 568 
 569     lrm_state = lrm_state_find(cmd->rsc_id);
 570 
 571     handle_remote_ra_stop(lrm_state, cmd);
 572     free_cmd(cmd);
 573 
 574     return FALSE;
 575 }
 576 
/*!
 * \internal
 * \brief Timer callback when a remote poke got no response in time
 *
 * \param[in,out] data  Monitor command that timed out
 *
 * \return FALSE (one-shot timer; do not reschedule)
 */
static gboolean
monitor_timeout_cb(gpointer data)
{
    lrm_state_t *lrm_state = NULL;
    remote_ra_cmd_t *cmd = data;

    lrm_state = lrm_state_find(cmd->rsc_id);

    crm_info("Timed out waiting for remote poke response from %s%s",
             cmd->rsc_id, (lrm_state? "" : " (no LRM state)"));
    cmd->monitor_timeout_id = 0;    // Timer fired; free_cmd() must not remove it
    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR, PCMK_EXEC_TIMEOUT,
                     "Remote executor did not respond");

    // Detach the command and kick the work queue before reporting
    if (lrm_state && lrm_state->remote_ra_data) {
        remote_ra_data_t *ra_data = lrm_state->remote_ra_data;

        if (ra_data->cur_cmd == cmd) {
            ra_data->cur_cmd = NULL;
        }
        if (ra_data->cmds) {
            mainloop_set_trigger(ra_data->work);
        }
    }

    report_remote_ra_result(cmd);
    free_cmd(cmd);

    // Drop the unresponsive connection last
    if(lrm_state) {
        lrm_state_disconnect(lrm_state);
    }
    return FALSE;
}
 610 
 611 static void
 612 synthesize_lrmd_success(lrm_state_t *lrm_state, const char *rsc_id, const char *op_type)
     /* [previous][next][first][last][top][bottom][index][help] */
 613 {
 614     lrmd_event_data_t op = { 0, };
 615 
 616     if (lrm_state == NULL) {
 617         /* if lrm_state not given assume local */
 618         lrm_state = lrm_state_find(controld_globals.our_nodename);
 619     }
 620     CRM_ASSERT(lrm_state != NULL);
 621 
 622     op.type = lrmd_event_exec_complete;
 623     op.rsc_id = rsc_id;
 624     op.op_type = op_type;
 625     op.t_run = (unsigned int) time(NULL);
 626     op.t_rcchange = op.t_run;
 627     op.call_id = generate_callid();
 628     lrmd__set_result(&op, PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 629     process_lrm_event(lrm_state, &op, NULL, NULL);
 630 }
 631 
 632 void
 633 remote_lrm_op_callback(lrmd_event_data_t * op)
     /* [previous][next][first][last][top][bottom][index][help] */
 634 {
 635     gboolean cmd_handled = FALSE;
 636     lrm_state_t *lrm_state = NULL;
 637     remote_ra_data_t *ra_data = NULL;
 638     remote_ra_cmd_t *cmd = NULL;
 639 
 640     crm_debug("Processing '%s%s%s' event on remote connection to %s: %s "
 641               "(%d) status=%s (%d)",
 642               (op->op_type? op->op_type : ""), (op->op_type? " " : ""),
 643               lrmd_event_type2str(op->type), op->remote_nodename,
 644               services_ocf_exitcode_str(op->rc), op->rc,
 645               pcmk_exec_status_str(op->op_status), op->op_status);
 646 
 647     lrm_state = lrm_state_find(op->remote_nodename);
 648     if (!lrm_state || !lrm_state->remote_ra_data) {
 649         crm_debug("No state information found for remote connection event");
 650         return;
 651     }
 652     ra_data = lrm_state->remote_ra_data;
 653 
 654     if (op->type == lrmd_event_new_client) {
 655         // Another client has connected to the remote daemon
 656 
 657         if (pcmk_is_set(ra_data->status, expect_takeover)) {
 658             // Great, we knew this was coming
 659             lrm_remote_clear_flags(lrm_state, expect_takeover);
 660             lrm_remote_set_flags(lrm_state, takeover_complete);
 661 
 662         } else {
 663             crm_err("Disconnecting from Pacemaker Remote node %s due to "
 664                     "unexpected client takeover", op->remote_nodename);
 665             /* In this case, lrmd_tls_connection_destroy() will be called under the control of mainloop. */
 666             /* Do not free lrm_state->conn yet. */
 667             /* It'll be freed in the following stop action. */
 668             lrm_state_disconnect_only(lrm_state);
 669         }
 670         return;
 671     }
 672 
 673     /* filter all EXEC events up */
 674     if (op->type == lrmd_event_exec_complete) {
 675         if (pcmk_is_set(ra_data->status, takeover_complete)) {
 676             crm_debug("ignoring event, this connection is taken over by another node");
 677         } else {
 678             lrm_op_callback(op);
 679         }
 680         return;
 681     }
 682 
 683     if ((op->type == lrmd_event_disconnect) && (ra_data->cur_cmd == NULL)) {
 684 
 685         if (!pcmk_is_set(ra_data->status, remote_active)) {
 686             crm_debug("Disconnection from Pacemaker Remote node %s complete",
 687                       lrm_state->node_name);
 688 
 689         } else if (!remote_ra_is_in_maintenance(lrm_state)) {
 690             crm_err("Lost connection to Pacemaker Remote node %s",
 691                     lrm_state->node_name);
 692             ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
 693             ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
 694 
 695         } else {
 696             crm_notice("Unmanaged Pacemaker Remote node %s disconnected",
 697                        lrm_state->node_name);
 698             /* Do roughly what a 'stop' on the remote-resource would do */
 699             handle_remote_ra_stop(lrm_state, NULL);
 700             remote_node_down(lrm_state->node_name, DOWN_KEEP_LRM);
 701             /* now fake the reply of a successful 'stop' */
 702             synthesize_lrmd_success(NULL, lrm_state->node_name,
 703                                     PCMK_ACTION_STOP);
 704         }
 705         return;
 706     }
 707 
 708     if (!ra_data->cur_cmd) {
 709         crm_debug("no event to match");
 710         return;
 711     }
 712 
 713     cmd = ra_data->cur_cmd;
 714 
 715     /* Start actions and migrate from actions complete after connection
 716      * comes back to us. */
 717     if ((op->type == lrmd_event_connect)
 718         && pcmk__strcase_any_of(cmd->action, PCMK_ACTION_START,
 719                                 PCMK_ACTION_MIGRATE_FROM, NULL)) {
 720         if (op->connection_rc < 0) {
 721             update_remaining_timeout(cmd);
 722 
 723             if ((op->connection_rc == -ENOKEY)
 724                 || (op->connection_rc == -EKEYREJECTED)) {
 725                 // Hard error, don't retry
 726                 pcmk__set_result(&(cmd->result), PCMK_OCF_INVALID_PARAM,
 727                                  PCMK_EXEC_ERROR,
 728                                  pcmk_strerror(op->connection_rc));
 729 
 730             } else if (cmd->remaining_timeout > 3000) {
 731                 crm_trace("rescheduling start, remaining timeout %d", cmd->remaining_timeout);
 732                 g_timeout_add(1000, retry_start_cmd_cb, lrm_state);
 733                 return;
 734 
 735             } else {
 736                 crm_trace("can't reschedule start, remaining timeout too small %d",
 737                           cmd->remaining_timeout);
 738                 pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 739                                     PCMK_EXEC_TIMEOUT,
 740                                     "%s without enough time to retry",
 741                                     pcmk_strerror(op->connection_rc));
 742             }
 743 
 744         } else {
 745             lrm_state_reset_tables(lrm_state, TRUE);
 746             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 747             lrm_remote_set_flags(lrm_state, remote_active);
 748         }
 749 
 750         crm_debug("Remote connection event matched %s action", cmd->action);
 751         report_remote_ra_result(cmd);
 752         cmd_handled = TRUE;
 753 
 754     } else if ((op->type == lrmd_event_poke)
 755                && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
 756                                pcmk__str_casei)) {
 757 
 758         if (cmd->monitor_timeout_id) {
 759             g_source_remove(cmd->monitor_timeout_id);
 760             cmd->monitor_timeout_id = 0;
 761         }
 762 
 763         /* Only report success the first time, after that only worry about failures.
 764          * For this function, if we get the poke pack, it is always a success. Pokes
 765          * only fail if the send fails, or the response times out. */
 766         if (!pcmk_is_set(cmd->status, cmd_reported_success)) {
 767             pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
 768             report_remote_ra_result(cmd);
 769             cmd_set_flags(cmd, cmd_reported_success);
 770         }
 771 
 772         crm_debug("Remote poke event matched %s action", cmd->action);
 773 
 774         /* success, keep rescheduling if interval is present. */
 775         if (cmd->interval_ms && !pcmk_is_set(cmd->status, cmd_cancel)) {
 776             ra_data->recurring_cmds = g_list_append(ra_data->recurring_cmds, cmd);
 777             cmd->interval_id = g_timeout_add(cmd->interval_ms,
 778                                              recurring_helper, cmd);
 779             cmd = NULL;         /* prevent free */
 780         }
 781         cmd_handled = TRUE;
 782 
 783     } else if ((op->type == lrmd_event_disconnect)
 784                && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
 785                                pcmk__str_casei)) {
 786         if (pcmk_is_set(ra_data->status, remote_active) &&
 787             !pcmk_is_set(cmd->status, cmd_cancel)) {
 788             pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 789                              PCMK_EXEC_ERROR,
 790                              "Remote connection unexpectedly dropped "
 791                              "during monitor");
 792             report_remote_ra_result(cmd);
 793             crm_err("Remote connection to %s unexpectedly dropped during monitor",
 794                     lrm_state->node_name);
 795         }
 796         cmd_handled = TRUE;
 797 
 798     } else if ((op->type == lrmd_event_new_client)
 799                && pcmk__str_eq(cmd->action, PCMK_ACTION_STOP,
 800                                pcmk__str_casei)) {
 801 
 802         handle_remote_ra_stop(lrm_state, cmd);
 803         cmd_handled = TRUE;
 804 
 805     } else {
 806         crm_debug("Event did not match %s action", ra_data->cur_cmd->action);
 807     }
 808 
 809     if (cmd_handled) {
 810         ra_data->cur_cmd = NULL;
 811         if (ra_data->cmds) {
 812             mainloop_set_trigger(ra_data->work);
 813         }
 814         free_cmd(cmd);
 815     }
 816 }
 817 
/*!
 * \internal
 * \brief Handle an intentional stop of a remote connection
 *
 * Clears pending operation state (unless the connection was taken over by
 * another node), disconnects from the remote executor, frees all queued and
 * recurring commands, and (if \p cmd is non-NULL) reports the stop as
 * successful.
 *
 * \param[in,out] lrm_state  Executor state for the remote connection
 * \param[in,out] cmd        Stop command to report a result for (may be NULL)
 */
static void
handle_remote_ra_stop(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd)
{
    remote_ra_data_t *ra_data = NULL;

    CRM_ASSERT(lrm_state);
    ra_data = lrm_state->remote_ra_data;

    if (!pcmk_is_set(ra_data->status, takeover_complete)) {
        /* delete pending ops when ever the remote connection is intentionally stopped */
        g_hash_table_remove_all(lrm_state->active_ops);
    } else {
        /* we no longer hold the history if this connection has been migrated,
         * however, we keep metadata cache for future use */
        lrm_state_reset_tables(lrm_state, FALSE);
    }

    /* Mark the connection inactive before tearing it down */
    lrm_remote_clear_flags(lrm_state, remote_active);
    lrm_state_disconnect(lrm_state);

    /* Discard any commands still queued for this connection */
    if (ra_data->cmds) {
        g_list_free_full(ra_data->cmds, free_cmd);
    }
    if (ra_data->recurring_cmds) {
        g_list_free_full(ra_data->recurring_cmds, free_cmd);
    }
    ra_data->cmds = NULL;
    ra_data->recurring_cmds = NULL;
    ra_data->cur_cmd = NULL;

    if (cmd) {
        /* Report the stop itself as successful */
        pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
        report_remote_ra_result(cmd);
    }
}
 853 
 854 // \return Standard Pacemaker return code
 855 static int
 856 handle_remote_ra_start(lrm_state_t * lrm_state, remote_ra_cmd_t * cmd, int timeout_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
 857 {
 858     const char *server = NULL;
 859     lrmd_key_value_t *tmp = NULL;
 860     int port = 0;
 861     int timeout_used = timeout_ms > MAX_START_TIMEOUT_MS ? MAX_START_TIMEOUT_MS : timeout_ms;
 862     int rc = pcmk_rc_ok;
 863 
 864     for (tmp = cmd->params; tmp; tmp = tmp->next) {
 865         if (pcmk__strcase_any_of(tmp->key,
 866                                  PCMK_REMOTE_RA_ADDR, PCMK_REMOTE_RA_SERVER,
 867                                  NULL)) {
 868             server = tmp->value;
 869 
 870         } else if (pcmk__str_eq(tmp->key, PCMK_REMOTE_RA_PORT,
 871                                 pcmk__str_none)) {
 872             port = atoi(tmp->value);
 873 
 874         } else if (pcmk__str_eq(tmp->key, CRM_META "_" PCMK__META_CONTAINER,
 875                                 pcmk__str_none)) {
 876             lrm_remote_set_flags(lrm_state, controlling_guest);
 877         }
 878     }
 879 
 880     rc = controld_connect_remote_executor(lrm_state, server, port,
 881                                           timeout_used);
 882     if (rc != pcmk_rc_ok) {
 883         pcmk__format_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
 884                             PCMK_EXEC_ERROR,
 885                             "Could not connect to Pacemaker Remote node %s: %s",
 886                             lrm_state->node_name, pcmk_rc_str(rc));
 887     }
 888     return rc;
 889 }
 890 
/*!
 * \internal
 * \brief Mainloop trigger callback that dispatches queued remote RA commands
 *
 * Drains ra_data->cmds, executing each command according to its action.
 * A command that completes asynchronously (start/migrate_from connection
 * attempt, monitor poke, or stop awaiting takeover) is saved as
 * ra_data->cur_cmd and processing pauses until the corresponding event or
 * timeout arrives; all other commands are reported and freed immediately.
 *
 * \param[in,out] user_data  lrm_state_t object for the remote connection
 *
 * \return TRUE (always, to keep the trigger registered)
 */
static gboolean
handle_remote_ra_exec(gpointer user_data)
{
    int rc = 0;
    lrm_state_t *lrm_state = user_data;
    remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
    remote_ra_cmd_t *cmd;
    GList *first = NULL;

    if (ra_data->cur_cmd) {
        /* still waiting on previous cmd */
        return TRUE;
    }

    while (ra_data->cmds) {
        first = ra_data->cmds;
        cmd = first->data;
        if (cmd->delay_id) {
            /* still waiting for start delay timer to trip */
            return TRUE;
        }

        // Dequeue the command (freed at the bottom of the loop unless async)
        ra_data->cmds = g_list_remove_link(ra_data->cmds, first);
        g_list_free_1(first);

        if (pcmk__str_any_of(cmd->action, PCMK_ACTION_START,
                             PCMK_ACTION_MIGRATE_FROM, NULL)) {
            // A fresh connection attempt resets any takeover bookkeeping
            lrm_remote_clear_flags(lrm_state, expect_takeover | takeover_complete);
            if (handle_remote_ra_start(lrm_state, cmd,
                                       cmd->timeout) == pcmk_rc_ok) {
                /* take care of this later when we get async connection result */
                crm_debug("Initiated async remote connection, %s action will complete after connect event",
                          cmd->action);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_MONITOR)) {

            if (lrm_state_is_connected(lrm_state) == TRUE) {
                // Poke the connection; success is confirmed by a poke event
                rc = lrm_state_poke_connection(lrm_state);
                if (rc < 0) {
                    pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
                                     PCMK_EXEC_ERROR, pcmk_strerror(rc));
                }
            } else {
                // No connection: report "not running" immediately
                rc = -1;
                pcmk__set_result(&(cmd->result), PCMK_OCF_NOT_RUNNING,
                                 PCMK_EXEC_DONE, "Remote connection inactive");
            }

            if (rc == 0) {
                /* Poke sent; wait for the async response or the timeout */
                crm_debug("Poked Pacemaker Remote at node %s, waiting for async response",
                          cmd->rsc_id);
                ra_data->cur_cmd = cmd;
                cmd->monitor_timeout_id = g_timeout_add(cmd->timeout, monitor_timeout_cb, cmd);
                return TRUE;
            }
            report_remote_ra_result(cmd);

        } else if (!strcmp(cmd->action, PCMK_ACTION_STOP)) {

            if (pcmk_is_set(ra_data->status, expect_takeover)) {
                /* briefly wait on stop for the takeover event to occur. If the
                 * takeover event does not occur during the wait period, that's fine.
                 * It just means that the remote-node's lrm_status section is going to get
                 * cleared which will require all the resources running in the remote-node
                 * to be explicitly re-detected via probe actions.  If the takeover does occur
                 * successfully, then we can leave the status section intact. */
                cmd->takeover_timeout_id = g_timeout_add((cmd->timeout/2), connection_takeover_timeout_cb, cmd);
                ra_data->cur_cmd = cmd;
                return TRUE;
            }

            handle_remote_ra_stop(lrm_state, cmd);

        } else if (strcmp(cmd->action, PCMK_ACTION_MIGRATE_TO) == 0) {
            // Record that a takeover is expected; report success right away
            lrm_remote_clear_flags(lrm_state, takeover_complete);
            lrm_remote_set_flags(lrm_state, expect_takeover);
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);

        } else if (pcmk__str_any_of(cmd->action, PCMK_ACTION_RELOAD,
                                    PCMK_ACTION_RELOAD_AGENT, NULL))  {
            /* Currently the only reloadable parameter is
             * PCMK_REMOTE_RA_RECONNECT_INTERVAL, which is only used by the
             * scheduler via the CIB, so reloads are a no-op.
             *
             * @COMPAT DC <2.1.0: We only need to check for "reload" in case
             * we're in a rolling upgrade with a DC scheduling "reload" instead
             * of "reload-agent". An OCF 1.1 "reload" would be a no-op anyway,
             * so this would work for that purpose as well.
             */
            pcmk__set_result(&(cmd->result), PCMK_OCF_OK, PCMK_EXEC_DONE, NULL);
            report_remote_ra_result(cmd);
        }

        free_cmd(cmd);
    }

    return TRUE;
}
 994 
 995 static void
 996 remote_ra_data_init(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
 997 {
 998     remote_ra_data_t *ra_data = NULL;
 999 
1000     if (lrm_state->remote_ra_data) {
1001         return;
1002     }
1003 
1004     ra_data = pcmk__assert_alloc(1, sizeof(remote_ra_data_t));
1005     ra_data->work = mainloop_add_trigger(G_PRIORITY_HIGH, handle_remote_ra_exec, lrm_state);
1006     lrm_state->remote_ra_data = ra_data;
1007 }
1008 
1009 void
1010 remote_ra_cleanup(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1011 {
1012     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1013 
1014     if (!ra_data) {
1015         return;
1016     }
1017 
1018     if (ra_data->cmds) {
1019         g_list_free_full(ra_data->cmds, free_cmd);
1020     }
1021 
1022     if (ra_data->recurring_cmds) {
1023         g_list_free_full(ra_data->recurring_cmds, free_cmd);
1024     }
1025     mainloop_destroy_trigger(ra_data->work);
1026     free(ra_data);
1027     lrm_state->remote_ra_data = NULL;
1028 }
1029 
1030 gboolean
1031 is_remote_lrmd_ra(const char *agent, const char *provider, const char *id)
     /* [previous][next][first][last][top][bottom][index][help] */
1032 {
1033     if (agent && provider && !strcmp(agent, REMOTE_LRMD_RA) && !strcmp(provider, "pacemaker")) {
1034         return TRUE;
1035     }
1036     if ((id != NULL) && (lrm_state_find(id) != NULL)
1037         && !pcmk__str_eq(id, controld_globals.our_nodename, pcmk__str_casei)) {
1038         return TRUE;
1039     }
1040 
1041     return FALSE;
1042 }
1043 
1044 lrmd_rsc_info_t *
1045 remote_ra_get_rsc_info(lrm_state_t * lrm_state, const char *rsc_id)
     /* [previous][next][first][last][top][bottom][index][help] */
1046 {
1047     lrmd_rsc_info_t *info = NULL;
1048 
1049     if ((lrm_state_find(rsc_id))) {
1050         info = pcmk__assert_alloc(1, sizeof(lrmd_rsc_info_t));
1051 
1052         info->id = pcmk__str_copy(rsc_id);
1053         info->type = pcmk__str_copy(REMOTE_LRMD_RA);
1054         info->standard = pcmk__str_copy(PCMK_RESOURCE_CLASS_OCF);
1055         info->provider = pcmk__str_copy("pacemaker");
1056     }
1057 
1058     return info;
1059 }
1060 
1061 static gboolean
1062 is_remote_ra_supported_action(const char *action)
     /* [previous][next][first][last][top][bottom][index][help] */
1063 {
1064     return pcmk__str_any_of(action,
1065                             PCMK_ACTION_START,
1066                             PCMK_ACTION_STOP,
1067                             PCMK_ACTION_MONITOR,
1068                             PCMK_ACTION_MIGRATE_TO,
1069                             PCMK_ACTION_MIGRATE_FROM,
1070                             PCMK_ACTION_RELOAD_AGENT,
1071                             PCMK_ACTION_RELOAD,
1072                             NULL);
1073 }
1074 
1075 static GList *
1076 fail_all_monitor_cmds(GList * list)
     /* [previous][next][first][last][top][bottom][index][help] */
1077 {
1078     GList *rm_list = NULL;
1079     remote_ra_cmd_t *cmd = NULL;
1080     GList *gIter = NULL;
1081 
1082     for (gIter = list; gIter != NULL; gIter = gIter->next) {
1083         cmd = gIter->data;
1084         if ((cmd->interval_ms > 0)
1085             && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
1086                             pcmk__str_casei)) {
1087             rm_list = g_list_append(rm_list, cmd);
1088         }
1089     }
1090 
1091     for (gIter = rm_list; gIter != NULL; gIter = gIter->next) {
1092         cmd = gIter->data;
1093 
1094         pcmk__set_result(&(cmd->result), PCMK_OCF_UNKNOWN_ERROR,
1095                          PCMK_EXEC_ERROR, "Lost connection to remote executor");
1096         crm_trace("Pre-emptively failing %s %s (interval=%u, %s)",
1097                   cmd->action, cmd->rsc_id, cmd->interval_ms, cmd->userdata);
1098         report_remote_ra_result(cmd);
1099 
1100         list = g_list_remove(list, cmd);
1101         free_cmd(cmd);
1102     }
1103 
1104     /* frees only the list data, not the cmds */
1105     g_list_free(rm_list);
1106     return list;
1107 }
1108 
1109 static GList *
1110 remove_cmd(GList * list, const char *action, guint interval_ms)
     /* [previous][next][first][last][top][bottom][index][help] */
1111 {
1112     remote_ra_cmd_t *cmd = NULL;
1113     GList *gIter = NULL;
1114 
1115     for (gIter = list; gIter != NULL; gIter = gIter->next) {
1116         cmd = gIter->data;
1117         if ((cmd->interval_ms == interval_ms)
1118             && pcmk__str_eq(cmd->action, action, pcmk__str_casei)) {
1119             break;
1120         }
1121         cmd = NULL;
1122     }
1123     if (cmd) {
1124         list = g_list_remove(list, cmd);
1125         free_cmd(cmd);
1126     }
1127     return list;
1128 }
1129 
1130 int
1131 remote_ra_cancel(lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
1132                  const char *action, guint interval_ms)
1133 {
1134     lrm_state_t *connection_rsc = NULL;
1135     remote_ra_data_t *ra_data = NULL;
1136 
1137     connection_rsc = lrm_state_find(rsc_id);
1138     if (!connection_rsc || !connection_rsc->remote_ra_data) {
1139         return -EINVAL;
1140     }
1141 
1142     ra_data = connection_rsc->remote_ra_data;
1143     ra_data->cmds = remove_cmd(ra_data->cmds, action, interval_ms);
1144     ra_data->recurring_cmds = remove_cmd(ra_data->recurring_cmds, action,
1145                                          interval_ms);
1146     if (ra_data->cur_cmd &&
1147         (ra_data->cur_cmd->interval_ms == interval_ms) &&
1148         (pcmk__str_eq(ra_data->cur_cmd->action, action, pcmk__str_casei))) {
1149 
1150         cmd_set_flags(ra_data->cur_cmd, cmd_cancel);
1151     }
1152 
1153     return 0;
1154 }
1155 
/*!
 * \internal
 * \brief Find and merge a duplicate of a requested recurring monitor
 *
 * If a recurring monitor with the same interval already exists (in flight,
 * queued, or waiting for its next interval), update its user data, refresh
 * its call ID if it already reported success, and fire it immediately if it
 * was waiting on an interval timer.
 *
 * \param[in,out] ra_data      Remote connection command state
 * \param[in]     interval_ms  Interval of the requested monitor
 * \param[in]     userdata     User data for the requested monitor (may be NULL)
 *
 * \return Existing duplicate monitor command if one was merged, else NULL
 */
static remote_ra_cmd_t *
handle_dup_monitor(remote_ra_data_t *ra_data, guint interval_ms,
                   const char *userdata)
{
    GList *gIter = NULL;
    remote_ra_cmd_t *cmd = NULL;

    /* there are 3 places a potential duplicate monitor operation
     * could exist.
     * 1. recurring_cmds list. where the op is waiting for its next interval
     * 2. cmds list, where the op is queued to get executed immediately
     * 3. cur_cmd, which means the monitor op is in flight right now.
     */
    if (interval_ms == 0) {
        // Only recurring monitors can be duplicates
        return NULL;
    }

    if (ra_data->cur_cmd &&
        !pcmk_is_set(ra_data->cur_cmd->status, cmd_cancel) &&
        (ra_data->cur_cmd->interval_ms == interval_ms)
        && pcmk__str_eq(ra_data->cur_cmd->action, PCMK_ACTION_MONITOR,
                        pcmk__str_casei)) {

        cmd = ra_data->cur_cmd;
        goto handle_dup;
    }

    for (gIter = ra_data->recurring_cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    for (gIter = ra_data->cmds; gIter != NULL; gIter = gIter->next) {
        cmd = gIter->data;
        if ((cmd->interval_ms == interval_ms)
            && pcmk__str_eq(cmd->action, PCMK_ACTION_MONITOR,
                            pcmk__str_casei)) {
            goto handle_dup;
        }
    }

    return NULL;

handle_dup:
    // cmd is the existing duplicate; merge the new request into it

    crm_trace("merging duplicate monitor cmd " PCMK__OP_FMT,
              cmd->rsc_id, PCMK_ACTION_MONITOR, interval_ms);

    /* update the userdata */
    if (userdata) {
       free(cmd->userdata);
       cmd->userdata = pcmk__str_copy(userdata);
    }

    /* if we've already reported success, generate a new call id */
    if (pcmk_is_set(cmd->status, cmd_reported_success)) {
        cmd->start_time = time(NULL);
        cmd->call_id = generate_callid();
        cmd_clear_flags(cmd, cmd_reported_success);
    }

    /* if we have an interval_id set, that means we are in the process of
     * waiting for this cmd's next interval. instead of waiting, cancel
     * the timer and execute the action immediately */
    if (cmd->interval_id) {
        g_source_remove(cmd->interval_id);
        cmd->interval_id = 0;
        recurring_helper(cmd);
    }

    return cmd;
}
1232 
1233 /*!
1234  * \internal
1235  * \brief Execute an action using the (internal) ocf:pacemaker:remote agent
1236  *
1237  * \param[in]     lrm_state      Executor state object for remote connection
1238  * \param[in]     rsc_id         Connection resource ID
1239  * \param[in]     action         Action to execute
1240  * \param[in]     userdata       String to copy and pass to execution callback
1241  * \param[in]     interval_ms    Action interval (in milliseconds)
1242  * \param[in]     timeout_ms     Action timeout (in milliseconds)
1243  * \param[in]     start_delay_ms Delay (in milliseconds) before executing action
1244  * \param[in,out] params         Connection resource parameters
1245  * \param[out]    call_id        Where to store call ID on success
1246  *
1247  * \return Standard Pacemaker return code
1248  * \note This takes ownership of \p params, which should not be used or freed
1249  *       after calling this function.
1250  */
1251 int
1252 controld_execute_remote_agent(const lrm_state_t *lrm_state, const char *rsc_id,
     /* [previous][next][first][last][top][bottom][index][help] */
1253                               const char *action, const char *userdata,
1254                               guint interval_ms, int timeout_ms,
1255                               int start_delay_ms, lrmd_key_value_t *params,
1256                               int *call_id)
1257 {
1258     lrm_state_t *connection_rsc = NULL;
1259     remote_ra_cmd_t *cmd = NULL;
1260     remote_ra_data_t *ra_data = NULL;
1261 
1262     *call_id = 0;
1263 
1264     CRM_CHECK((lrm_state != NULL) && (rsc_id != NULL) && (action != NULL)
1265               && (userdata != NULL) && (call_id != NULL),
1266               lrmd_key_value_freeall(params); return EINVAL);
1267 
1268     if (!is_remote_ra_supported_action(action)) {
1269         lrmd_key_value_freeall(params);
1270         return EOPNOTSUPP;
1271     }
1272 
1273     connection_rsc = lrm_state_find(rsc_id);
1274     if (connection_rsc == NULL) {
1275         lrmd_key_value_freeall(params);
1276         return ENOTCONN;
1277     }
1278 
1279     remote_ra_data_init(connection_rsc);
1280     ra_data = connection_rsc->remote_ra_data;
1281 
1282     cmd = handle_dup_monitor(ra_data, interval_ms, userdata);
1283     if (cmd) {
1284         *call_id = cmd->call_id;
1285         lrmd_key_value_freeall(params);
1286         return pcmk_rc_ok;
1287     }
1288 
1289     cmd = pcmk__assert_alloc(1, sizeof(remote_ra_cmd_t));
1290 
1291     cmd->owner = pcmk__str_copy(lrm_state->node_name);
1292     cmd->rsc_id = pcmk__str_copy(rsc_id);
1293     cmd->action = pcmk__str_copy(action);
1294     cmd->userdata = pcmk__str_copy(userdata);
1295     cmd->interval_ms = interval_ms;
1296     cmd->timeout = timeout_ms;
1297     cmd->start_delay = start_delay_ms;
1298     cmd->params = params;
1299     cmd->start_time = time(NULL);
1300 
1301     cmd->call_id = generate_callid();
1302 
1303     if (cmd->start_delay) {
1304         cmd->delay_id = g_timeout_add(cmd->start_delay, start_delay_helper, cmd);
1305     }
1306 
1307     ra_data->cmds = g_list_append(ra_data->cmds, cmd);
1308     mainloop_set_trigger(ra_data->work);
1309 
1310     *call_id = cmd->call_id;
1311     return pcmk_rc_ok;
1312 }
1313 
1314 /*!
1315  * \internal
1316  * \brief Immediately fail all monitors of a remote node, if proxied here
1317  *
1318  * \param[in] node_name  Name of pacemaker_remote node
1319  */
1320 void
1321 remote_ra_fail(const char *node_name)
     /* [previous][next][first][last][top][bottom][index][help] */
1322 {
1323     lrm_state_t *lrm_state = lrm_state_find(node_name);
1324 
1325     if (lrm_state && lrm_state_is_connected(lrm_state)) {
1326         remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1327 
1328         crm_info("Failing monitors on Pacemaker Remote node %s", node_name);
1329         ra_data->recurring_cmds = fail_all_monitor_cmds(ra_data->recurring_cmds);
1330         ra_data->cmds = fail_all_monitor_cmds(ra_data->cmds);
1331     }
1332 }
1333 
1334 /* A guest node fencing implied by host fencing looks like:
1335  *
1336  *  <pseudo_event id="103" operation="stonith" operation_key="stonith-lxc1-off"
1337  *                on_node="lxc1" on_node_uuid="lxc1">
1338  *     <attributes CRM_meta_on_node="lxc1" CRM_meta_on_node_uuid="lxc1"
1339  *                 CRM_meta_stonith_action="off" crm_feature_set="3.0.12"/>
1340  *     <downed>
1341  *       <node id="lxc1"/>
1342  *     </downed>
1343  *  </pseudo_event>
1344  */
1345 #define XPATH_PSEUDO_FENCE "/" PCMK__XE_PSEUDO_EVENT \
1346     "[@" PCMK_XA_OPERATION "='stonith']/" PCMK__XE_DOWNED "/" PCMK_XE_NODE
1347 
1348 /*!
1349  * \internal
1350  * \brief Check a pseudo-action for Pacemaker Remote node side effects
1351  *
1352  * \param[in,out] xml  XML of pseudo-action to check
1353  */
1354 void
1355 remote_ra_process_pseudo(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
1356 {
1357     xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_FENCE);
1358 
1359     if (numXpathResults(search) == 1) {
1360         xmlNode *result = getXpathResult(search, 0);
1361 
1362         /* Normally, we handle the necessary side effects of a guest node stop
1363          * action when reporting the remote agent's result. However, if the stop
1364          * is implied due to fencing, it will be a fencing pseudo-event, and
1365          * there won't be a result to report. Handle that case here.
1366          *
1367          * This will result in a duplicate call to remote_node_down() if the
1368          * guest stop was real instead of implied, but that shouldn't hurt.
1369          *
1370          * There is still one corner case that isn't handled: if a guest node
1371          * isn't running any resources when its host is fenced, it will appear
1372          * to be cleanly stopped, so there will be no pseudo-fence, and our
1373          * peer cache state will be incorrect unless and until the guest is
1374          * recovered.
1375          */
1376         if (result) {
1377             const char *remote = pcmk__xe_id(result);
1378 
1379             if (remote) {
1380                 remote_node_down(remote, DOWN_ERASE_LRM);
1381             }
1382         }
1383     }
1384     freeXpathObject(search);
1385 }
1386 
1387 static void
1388 remote_ra_maintenance(lrm_state_t * lrm_state, gboolean maintenance)
     /* [previous][next][first][last][top][bottom][index][help] */
1389 {
1390     xmlNode *update, *state;
1391     int call_opt;
1392     crm_node_t *node;
1393 
1394     call_opt = crmd_cib_smart_opt();
1395     node = pcmk__cluster_lookup_remote_node(lrm_state->node_name);
1396     CRM_CHECK(node != NULL, return);
1397     update = pcmk__xe_create(NULL, PCMK_XE_STATUS);
1398     state = create_node_state_update(node, node_update_none, update,
1399                                      __func__);
1400     crm_xml_add(state, PCMK__XA_NODE_IN_MAINTENANCE, (maintenance? "1" : "0"));
1401     if (controld_update_cib(PCMK_XE_STATUS, update, call_opt,
1402                             NULL) == pcmk_rc_ok) {
1403         /* TODO: still not 100% sure that async update will succeed ... */
1404         if (maintenance) {
1405             lrm_remote_set_flags(lrm_state, remote_in_maint);
1406         } else {
1407             lrm_remote_clear_flags(lrm_state, remote_in_maint);
1408         }
1409     }
1410     free_xml(update);
1411 }
1412 
1413 #define XPATH_PSEUDO_MAINTENANCE "//" PCMK__XE_PSEUDO_EVENT         \
1414     "[@" PCMK_XA_OPERATION "='" PCMK_ACTION_MAINTENANCE_NODES "']/" \
1415     PCMK__XE_MAINTENANCE
1416 
/*!
 * \internal
 * \brief Check a pseudo-action holding updates for maintenance state
 *
 * For each node listed in the maintenance pseudo-event, if we hold an active
 * remote connection for it, apply the node's requested maintenance state.
 *
 * \param[in,out] xml  XML of pseudo-action to check
 */
void
remote_ra_process_maintenance_nodes(xmlNode *xml)
{
    xmlXPathObjectPtr search = xpath_search(xml, XPATH_PSEUDO_MAINTENANCE);

    if (numXpathResults(search) == 1) {
        xmlNode *node;
        int cnt = 0, cnt_remote = 0;    // total nodes vs. remotes we control

        // Iterate over each <node> entry in the event
        for (node = pcmk__xe_first_child(getXpathResult(search, 0),
                                         PCMK_XE_NODE, NULL, NULL);
             node != NULL; node = pcmk__xe_next_same(node)) {

            lrm_state_t *lrm_state = lrm_state_find(pcmk__xe_id(node));

            cnt++;
            // Only act on remotes with an active connection proxied here
            if (lrm_state && lrm_state->remote_ra_data &&
                pcmk_is_set(((remote_ra_data_t *) lrm_state->remote_ra_data)->status, remote_active)) {

                const char *in_maint_s = NULL;
                int in_maint;

                cnt_remote++;
                // Missing/invalid attribute defaults to 0 (not in maintenance)
                in_maint_s = crm_element_value(node,
                                               PCMK__XA_NODE_IN_MAINTENANCE);
                pcmk__scan_min_int(in_maint_s, &in_maint, 0);
                remote_ra_maintenance(lrm_state, in_maint);
            }
        }
        crm_trace("Action holds %d nodes (%d remotes found) adjusting "
                  PCMK_OPT_MAINTENANCE_MODE,
                  cnt, cnt_remote);
    }
    freeXpathObject(search);
}
1458 
1459 gboolean
1460 remote_ra_is_in_maintenance(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1461 {
1462     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1463     return pcmk_is_set(ra_data->status, remote_in_maint);
1464 }
1465 
1466 gboolean
1467 remote_ra_controlling_guest(lrm_state_t * lrm_state)
     /* [previous][next][first][last][top][bottom][index][help] */
1468 {
1469     remote_ra_data_t *ra_data = lrm_state->remote_ra_data;
1470     return pcmk_is_set(ra_data->status, controlling_guest);
1471 }

/* [previous][next][first][last][top][bottom][index][help] */