This source file includes the following definitions.
- pcmkd_cluster_connected
- check_next_subdaemon
- escalate_shutdown
- pcmk_child_exit
- pcmk_process_exit
- pcmk_shutdown_worker
- start_child
- child_liveness
- find_and_track_existing_processes
- init_children_processes
- pcmk_shutdown
- restart_cluster_subdaemons
- stop_child
   1 
   2 
   3 
   4 
   5 
   6 
   7 
   8 
   9 
  10 #include <crm_internal.h>
  11 #include "pacemakerd.h"
  12 
  13 #include <errno.h>
  14 #include <grp.h>
  15 #include <signal.h>
  16 #include <stdbool.h>
  17 #include <stdlib.h>
  18 #include <string.h>
  19 #include <sys/types.h>
  20 #include <time.h>
  21 #include <unistd.h>
  22 
  23 #include <crm/cluster.h>
  24 #include <crm/msg_xml.h>
  25 
/* Bookkeeping for one subdaemon that pacemakerd starts, monitors and stops.
 *
 * The first eight members are set by the positional initializers of
 * pcmk_children[], so their order must not change. The remaining members are
 * left out of those initializers and therefore start out as 0/false.
 */
typedef struct pcmk_child_s {
    /* PID currently tracked for this subdaemon; 0 means "not running".
     * May also hold PCMK__SPECIAL_PID (see child_liveness()/stop_child()). */
    pid_t pid;
    /* Bumped by pcmk_process_exit(); respawning stops once it exceeds
     * MAX_RESPAWN. Temporarily reused as a round counter by
     * find_and_track_existing_processes(). */
    int respawn_count;
    /* Whether the subdaemon should be restarted after it exits */
    bool respawn;
    /* Subdaemon name as used in log messages */
    const char *name;
    /* User name the subdaemon runs as; NULL means no user lookup/switch */
    const char *uid;
    /* Executable passed to execvp(); NULL means nothing to start or stop */
    const char *command;
    /* IPC end-point name probed by child_liveness() */
    const char *endpoint;  
    /* If true, a respawn is postponed while the cluster is not connected */
    bool needs_cluster;
    /* Consecutive checks that found the PID alive but the IPC silent */
    int check_count;

    /* Members below are only ever set at run time */

    /* Set when a respawn was postponed; see restart_cluster_subdaemons() */
    bool needs_retry;
    /* Set when an already-running process was adopted at startup */
    bool active_before_startup;
} pcmk_child_t;
  41 
/* Seconds between liveness checks (passed to g_timeout_add_seconds());
 * each tick checks a single subdaemon. */
#define PCMK_PROCESS_CHECK_INTERVAL 1
/* Number of consecutive "PID alive but IPC unresponsive" results after which
 * check_next_subdaemon() kills the subdaemon. */
#define PCMK_PROCESS_CHECK_RETRIES  5
/* Delay before escalate_shutdown() runs for a subdaemon that was asked to
 * stop; passed to g_timeout_add(), hence in milliseconds. */
#define SHUTDOWN_ESCALATION_PERIOD  180000  


/* Index of the pacemaker-controld entry in pcmk_children[] */
#define PCMK_CHILD_CONTROLD  5
  48 
  49 static pcmk_child_t pcmk_children[] = {
  50     {
  51         0, 0, true,  "pacemaker-based", CRM_DAEMON_USER,
  52         CRM_DAEMON_DIR "/pacemaker-based", PCMK__SERVER_BASED_RO,
  53         true
  54     },
  55     {
  56         0, 0, true, "pacemaker-fenced", NULL,
  57         CRM_DAEMON_DIR "/pacemaker-fenced", "stonith-ng",
  58         true
  59     },
  60     {
  61         0, 0, true,  "pacemaker-execd", NULL,
  62         CRM_DAEMON_DIR "/pacemaker-execd", CRM_SYSTEM_LRMD,
  63         false
  64     },
  65     {
  66         0, 0, true, "pacemaker-attrd", CRM_DAEMON_USER,
  67         CRM_DAEMON_DIR "/pacemaker-attrd", T_ATTRD,
  68         true
  69     },
  70     {
  71         0, 0, true, "pacemaker-schedulerd", CRM_DAEMON_USER,
  72         CRM_DAEMON_DIR "/pacemaker-schedulerd", CRM_SYSTEM_PENGINE,
  73         false
  74     },
  75     {
  76         0, 0, true, "pacemaker-controld", CRM_DAEMON_USER,
  77         CRM_DAEMON_DIR "/pacemaker-controld", CRM_SYSTEM_CRMD,
  78         true
  79     },
  80 };
  81 
/* argv used by start_child() for a plain launch; slot 0 is filled in the
 * forked child, the trailing NULL terminates the vector. */
static char *opts_default[] = { NULL, NULL };
/* argv used by start_child() when launching under valgrind/callgrind */
static char *opts_vgrind[] = { NULL, NULL, NULL, NULL, NULL };

/* Created by the first pcmk_shutdown() call; pcmk_process_exit() treats a
 * non-NULL value as "shutdown in progress". */
crm_trigger_t *shutdown_trigger = NULL;
/* Not referenced in this section of the file; presumably used by the
 * pacemakerd main module -- TODO confirm. */
crm_trigger_t *startup_trigger = NULL;
/* Time of the most recent liveness check that check_next_subdaemon()
 * counted as progress. */
time_t subdaemon_check_progress = 0;


/* While true, start_child() does not call setgid() for the child's group;
 * init_children_processes() clears it on Corosync clusters. */
static bool need_root_group = true;












/* Not referenced in this section of the file; presumably maintained by the
 * IPC code that reports state to clients -- TODO confirm. */
unsigned int shutdown_complete_state_reported_to = 0;
/* When running with SBD, pcmk_shutdown_worker() keeps waiting until this
 * becomes TRUE (it is not set anywhere in this section). */
gboolean shutdown_complete_state_reported_client_closed = FALSE;


/* Current pacemakerd state string; updated by init_children_processes() and
 * pcmk_shutdown_worker(). */
const char *pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_INIT;
/* Read by pcmk_shutdown_worker(); not set in this section */
gboolean running_with_sbd = FALSE; 

/* Main loop that pcmk_shutdown_worker() quits once shutdown has completed */
GMainLoop *mainloop = NULL;

/* Set by pcmk_child_exit() when a subdaemon exits with CRM_EX_FATAL or
 * CRM_EX_PANIC; makes pcmk_shutdown_worker() exit with CRM_EX_FATAL. */
static gboolean fatal_error = FALSE;
 112 
/* Forward declarations: these helpers call each other, so they are declared
 * up front and defined further down in this file. */
static int child_liveness(pcmk_child_t *child);
static gboolean escalate_shutdown(gpointer data);
static int start_child(pcmk_child_t * child);
static void pcmk_child_exit(mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode);
static void pcmk_process_exit(pcmk_child_t * child);
static gboolean pcmk_shutdown_worker(gpointer user_data);
static gboolean stop_child(pcmk_child_t * child, int signal);
 120 
 121 static bool
 122 pcmkd_cluster_connected(void)
     
 123 {
 124 #if SUPPORT_COROSYNC
 125     return pcmkd_corosync_connected();
 126 #else
 127     return true;
 128 #endif
 129 }
 130 
 131 static gboolean
 132 check_next_subdaemon(gpointer user_data)
     
 133 {
 134     static int next_child = 0;
 135     int rc = child_liveness(&pcmk_children[next_child]);
 136 
 137     crm_trace("Checked %s[%lld]: %s (%d)",
 138               pcmk_children[next_child].name,
 139               (long long) PCMK__SPECIAL_PID_AS_0(pcmk_children[next_child].pid),
 140               pcmk_rc_str(rc), rc);
 141 
 142     switch (rc) {
 143         case pcmk_rc_ok:
 144             pcmk_children[next_child].check_count = 0;
 145             subdaemon_check_progress = time(NULL);
 146             break;
 147         case pcmk_rc_ipc_pid_only: 
 148             pcmk_children[next_child].check_count++;
 149             if (pcmk_children[next_child].check_count >= PCMK_PROCESS_CHECK_RETRIES) {
 150                 crm_err("%s[%lld] is unresponsive to ipc after %d tries but "
 151                         "we found the pid so have it killed that we can restart",
 152                         pcmk_children[next_child].name,
 153                         (long long) PCMK__SPECIAL_PID_AS_0(
 154                             pcmk_children[next_child].pid),
 155                         pcmk_children[next_child].check_count);
 156                 stop_child(&pcmk_children[next_child], SIGKILL);
 157                 if (pcmk_children[next_child].respawn) {
 158                     
 159 
 160 
 161                     pcmk_children[next_child].check_count = 0;
 162                 }
 163             } else {
 164                 crm_notice("%s[%lld] is unresponsive to ipc after %d tries",
 165                         pcmk_children[next_child].name,
 166                         (long long) PCMK__SPECIAL_PID_AS_0(
 167                             pcmk_children[next_child].pid),
 168                         pcmk_children[next_child].check_count);
 169                 if (pcmk_children[next_child].respawn) {
 170                     
 171 
 172 
 173 
 174 
 175                     subdaemon_check_progress = time(NULL);
 176                 }
 177             }
 178             
 179 
 180 
 181             break;
 182         case pcmk_rc_ipc_unresponsive:
 183             if (!pcmk_children[next_child].respawn) {
 184                 
 185 
 186 
 187 
 188 
 189 
 190                 if (pcmk_children[next_child].respawn_count <= MAX_RESPAWN) {
 191                     subdaemon_check_progress = time(NULL);
 192                 }
 193             }
 194             if (!pcmk_children[next_child].active_before_startup) {
 195                 crm_trace("found %s[%lld] missing - signal-handler "
 196                           "will take care of it",
 197                            pcmk_children[next_child].name,
 198                            (long long) PCMK__SPECIAL_PID_AS_0(
 199                             pcmk_children[next_child].pid));
 200                 break;
 201             }
 202             if (pcmk_children[next_child].respawn) {
 203                 crm_err("%s[%lld] terminated",
 204                         pcmk_children[next_child].name,
 205                         (long long) PCMK__SPECIAL_PID_AS_0(
 206                             pcmk_children[next_child].pid));
 207             } else {
 208                 
 209                 crm_notice("%s[%lld] terminated",
 210                            pcmk_children[next_child].name,
 211                            (long long) PCMK__SPECIAL_PID_AS_0(
 212                                 pcmk_children[next_child].pid));
 213             }
 214             pcmk_process_exit(&(pcmk_children[next_child]));
 215             break;
 216         default:
 217             crm_exit(CRM_EX_FATAL);
 218             break;  
 219     }
 220 
 221     next_child++;
 222     if (next_child >= PCMK__NELEM(pcmk_children)) {
 223         next_child = 0;
 224     }
 225 
 226     return G_SOURCE_CONTINUE;
 227 }
 228 
 229 static gboolean
 230 escalate_shutdown(gpointer data)
     
 231 {
 232     pcmk_child_t *child = data;
 233 
 234     if (child->pid == PCMK__SPECIAL_PID) {
 235         pcmk_process_exit(child);
 236 
 237     } else if (child->pid != 0) {
 238         
 239         crm_err("Child %s not terminating in a timely manner, forcing", child->name);
 240         stop_child(child, SIGSEGV);
 241     }
 242     return FALSE;
 243 }
 244 
 245 static void
 246 pcmk_child_exit(mainloop_child_t * p, pid_t pid, int core, int signo, int exitcode)
     
 247 {
 248     pcmk_child_t *child = mainloop_child_userdata(p);
 249     const char *name = mainloop_child_name(p);
 250 
 251     if (signo) {
 252         do_crm_log(((signo == SIGKILL)? LOG_WARNING : LOG_ERR),
 253                    "%s[%d] terminated with signal %d (%s)%s",
 254                    name, pid, signo, strsignal(signo),
 255                    (core? " and dumped core" : ""));
 256 
 257     } else {
 258         switch(exitcode) {
 259             case CRM_EX_OK:
 260                 crm_info("%s[%d] exited with status %d (%s)",
 261                          name, pid, exitcode, crm_exit_str(exitcode));
 262                 break;
 263 
 264             case CRM_EX_FATAL:
 265                 crm_warn("Shutting cluster down because %s[%d] had fatal failure",
 266                          name, pid);
 267                 child->respawn = false;
 268                 fatal_error = TRUE;
 269                 pcmk_shutdown(SIGTERM);
 270                 break;
 271 
 272             case CRM_EX_PANIC:
 273                 crm_emerg("%s[%d] instructed the machine to reset", name, pid);
 274                 child->respawn = false;
 275                 fatal_error = TRUE;
 276                 pcmk__panic(__func__);
 277                 pcmk_shutdown(SIGTERM);
 278                 break;
 279 
 280             default:
 281                 crm_err("%s[%d] exited with status %d (%s)",
 282                         name, pid, exitcode, crm_exit_str(exitcode));
 283                 break;
 284         }
 285     }
 286 
 287     pcmk_process_exit(child);
 288 }
 289 
 290 static void
 291 pcmk_process_exit(pcmk_child_t * child)
     
 292 {
 293     child->pid = 0;
 294     child->active_before_startup = false;
 295     child->check_count = 0;
 296 
 297     child->respawn_count += 1;
 298     if (child->respawn_count > MAX_RESPAWN) {
 299         crm_err("Child respawn count exceeded by %s", child->name);
 300         child->respawn = false;
 301     }
 302 
 303     if (shutdown_trigger) {
 304         
 305         mainloop_set_trigger(shutdown_trigger);
 306 
 307     } else if (!child->respawn) {
 308         
 309 
 310     } else if (crm_is_true(getenv("PCMK_fail_fast"))) {
 311         crm_err("Rebooting system because of %s", child->name);
 312         pcmk__panic(__func__);
 313 
 314     } else if (child_liveness(child) == pcmk_rc_ok) {
 315         crm_warn("One-off suppressing strict respawning of a child process %s,"
 316                  " appears alright per %s IPC end-point",
 317                  child->name, child->endpoint);
 318 
 319     } else if (child->needs_cluster && !pcmkd_cluster_connected()) {
 320         crm_notice("Not respawning %s subdaemon until cluster returns",
 321                    child->name);
 322         child->needs_retry = true;
 323 
 324     } else {
 325         crm_notice("Respawning %s subdaemon after unexpected exit",
 326                    child->name);
 327         start_child(child);
 328     }
 329 }
 330 
 331 static gboolean
 332 pcmk_shutdown_worker(gpointer user_data)
     
 333 {
 334     static int phase = PCMK__NELEM(pcmk_children) - 1;
 335     static time_t next_log = 0;
 336 
 337     if (phase == PCMK__NELEM(pcmk_children) - 1) {
 338         crm_notice("Shutting down Pacemaker");
 339         pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_SHUTTINGDOWN;
 340     }
 341 
 342     for (; phase >= 0; phase--) {
 343         pcmk_child_t *child = &(pcmk_children[phase]);
 344 
 345         if (child->pid != 0) {
 346             time_t now = time(NULL);
 347 
 348             if (child->respawn) {
 349                 if (child->pid == PCMK__SPECIAL_PID) {
 350                     crm_warn("The process behind %s IPC cannot be"
 351                              " terminated, so either wait the graceful"
 352                              " period of %ld s for its native termination"
 353                              " if it vitally depends on some other daemons"
 354                              " going down in a controlled way already,"
 355                              " or locate and kill the correct %s process"
 356                              " on your own; set PCMK_fail_fast=1 to avoid"
 357                              " this altogether next time around",
 358                              child->name, (long) SHUTDOWN_ESCALATION_PERIOD,
 359                              child->command);
 360                 }
 361                 next_log = now + 30;
 362                 child->respawn = false;
 363                 stop_child(child, SIGTERM);
 364                 if (phase < PCMK_CHILD_CONTROLD) {
 365                     g_timeout_add(SHUTDOWN_ESCALATION_PERIOD,
 366                                   escalate_shutdown, child);
 367                 }
 368 
 369             } else if (now >= next_log) {
 370                 next_log = now + 30;
 371                 crm_notice("Still waiting for %s to terminate "
 372                            CRM_XS " pid=%lld",
 373                            child->name, (long long) child->pid);
 374             }
 375             return TRUE;
 376         }
 377 
 378         
 379         crm_debug("%s confirmed stopped", child->name);
 380         child->pid = 0;
 381     }
 382 
 383     crm_notice("Shutdown complete");
 384     pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_SHUTDOWNCOMPLETE;
 385     if (!fatal_error && running_with_sbd &&
 386         pcmk__get_sbd_sync_resource_startup() &&
 387         !shutdown_complete_state_reported_client_closed) {
 388         crm_notice("Waiting for SBD to pick up shutdown-complete-state.");
 389         return TRUE;
 390     }
 391 
 392     {
 393         const char *delay = pcmk__env_option(PCMK__ENV_SHUTDOWN_DELAY);
 394         if(delay) {
 395             sync();
 396             pcmk__sleep_ms(crm_get_msec(delay));
 397         }
 398     }
 399 
 400     g_main_loop_quit(mainloop);
 401 
 402     if (fatal_error) {
 403         crm_notice("Shutting down and staying down after fatal error");
 404 #ifdef SUPPORT_COROSYNC
 405         pcmkd_shutdown_corosync();
 406 #endif
 407         crm_exit(CRM_EX_FATAL);
 408     }
 409 
 410     return TRUE;
 411 }
 412 
 413 
 414 
 415 
 416 
 417 
 418  
 419 static int
 420 start_child(pcmk_child_t * child)
     
 421 {
 422     uid_t uid = 0;
 423     gid_t gid = 0;
 424     gboolean use_valgrind = FALSE;
 425     gboolean use_callgrind = FALSE;
 426     const char *env_valgrind = getenv("PCMK_valgrind_enabled");
 427     const char *env_callgrind = getenv("PCMK_callgrind_enabled");
 428 
 429     child->active_before_startup = false;
 430     child->check_count = 0;
 431 
 432     if (child->command == NULL) {
 433         crm_info("Nothing to do for child \"%s\"", child->name);
 434         return pcmk_rc_ok;
 435     }
 436 
 437     if (env_callgrind != NULL && crm_is_true(env_callgrind)) {
 438         use_callgrind = TRUE;
 439         use_valgrind = TRUE;
 440 
 441     } else if (env_callgrind != NULL && strstr(env_callgrind, child->name)) {
 442         use_callgrind = TRUE;
 443         use_valgrind = TRUE;
 444 
 445     } else if (env_valgrind != NULL && crm_is_true(env_valgrind)) {
 446         use_valgrind = TRUE;
 447 
 448     } else if (env_valgrind != NULL && strstr(env_valgrind, child->name)) {
 449         use_valgrind = TRUE;
 450     }
 451 
 452     if (use_valgrind && strlen(VALGRIND_BIN) == 0) {
 453         crm_warn("Cannot enable valgrind for %s:"
 454                  " The location of the valgrind binary is unknown", child->name);
 455         use_valgrind = FALSE;
 456     }
 457 
 458     if (child->uid) {
 459         if (crm_user_lookup(child->uid, &uid, &gid) < 0) {
 460             crm_err("Invalid user (%s) for %s: not found", child->uid, child->name);
 461             return EACCES;
 462         }
 463         crm_info("Using uid=%u and group=%u for process %s", uid, gid, child->name);
 464     }
 465 
 466     child->pid = fork();
 467     CRM_ASSERT(child->pid != -1);
 468 
 469     if (child->pid > 0) {
 470         
 471         mainloop_child_add(child->pid, 0, child->name, child, pcmk_child_exit);
 472 
 473         crm_info("Forked child %lld for process %s%s",
 474                  (long long) child->pid, child->name,
 475                  use_valgrind ? " (valgrind enabled: " VALGRIND_BIN ")" : "");
 476         return pcmk_rc_ok;
 477 
 478     } else {
 479         
 480         (void)setsid();
 481 
 482         
 483         opts_vgrind[0] = strdup(VALGRIND_BIN);
 484         if (use_callgrind) {
 485             opts_vgrind[1] = strdup("--tool=callgrind");
 486             opts_vgrind[2] = strdup("--callgrind-out-file=" CRM_STATE_DIR "/callgrind.out.%p");
 487             opts_vgrind[3] = strdup(child->command);
 488             opts_vgrind[4] = NULL;
 489         } else {
 490             opts_vgrind[1] = strdup(child->command);
 491             opts_vgrind[2] = NULL;
 492             opts_vgrind[3] = NULL;
 493             opts_vgrind[4] = NULL;
 494         }
 495         opts_default[0] = strdup(child->command);
 496 
 497         if(gid) {
 498             
 499             if (!need_root_group && (setgid(gid) < 0)) {
 500                 crm_warn("Could not set group to %d: %s", gid, strerror(errno));
 501             }
 502 
 503             
 504 
 505 
 506             if (initgroups(child->uid, gid) < 0) {
 507                 crm_err("Cannot initialize groups for %s: %s (%d)",
 508                         child->uid, pcmk_rc_str(errno), errno);
 509             }
 510         }
 511 
 512         if (uid && setuid(uid) < 0) {
 513             crm_warn("Could not set user to %s (id %d): %s",
 514                      child->uid, uid, strerror(errno));
 515         }
 516 
 517         pcmk__close_fds_in_child(true);
 518 
 519         pcmk__open_devnull(O_RDONLY);   
 520         pcmk__open_devnull(O_WRONLY);   
 521         pcmk__open_devnull(O_WRONLY);   
 522 
 523         if (use_valgrind) {
 524             (void)execvp(VALGRIND_BIN, opts_vgrind);
 525         } else {
 526             (void)execvp(child->command, opts_default);
 527         }
 528         crm_crit("Could not execute %s: %s", child->command, strerror(errno));
 529         crm_exit(CRM_EX_FATAL);
 530     }
 531     return pcmk_rc_ok;          
 532 }
 533 
 534 
 535 
 536 
 537 
 538 
 539 
 540 
 541 
 542 
 543 
 544 
 545 
 546 
 547 
 548 
 549 
 550 
 551 
/*!
 * \internal
 * \brief Determine whether a subdaemon is alive, via its IPC end-point and PID
 *
 * The IPC end-point (if any) is probed first; when that probe is
 * inconclusive, the tracked PID is checked as well.
 *
 * \param[in,out] child  Subdaemon entry to check
 *
 * \return pcmk_rc_ok if the subdaemon is considered alive,
 *         pcmk_rc_ipc_pid_only if only the PID check succeeded,
 *         pcmk_rc_ipc_unresponsive if neither check found it,
 *         pcmk_rc_ipc_unauthorized if the PID check returned EACCES,
 *         EINVAL if the entry has neither an end-point nor a usable PID,
 *         otherwise the code reported by the user lookup, the IPC probe or
 *         the PID check
 * \note Side effects: child->pid may be replaced by the PID reported by the
 *       IPC probe, and the previously tracked PID may be sent SIGKILL when a
 *       different process turns out to own the end-point.
 */
static int
child_liveness(pcmk_child_t *child)
{
    uid_t cl_uid = 0;
    gid_t cl_gid = 0;
    const uid_t root_uid = 0;
    const gid_t root_gid = 0;
    const uid_t *ref_uid;
    const gid_t *ref_gid;
    int rc = pcmk_rc_ipc_unresponsive;
    pid_t ipc_pid = 0;

    if (child->endpoint == NULL
            && (child->pid <= 0 || child->pid == PCMK__SPECIAL_PID)) {
        /* Neither an end-point to probe nor a real PID to look up */
        crm_err("Cannot track child %s for missing both API end-point and PID",
                child->name);
        rc = EINVAL; 

    } else if (child->endpoint != NULL) {
        int legacy_rc = pcmk_ok;

        /* The end-point owner is expected to be root when no user is
         * configured, otherwise whatever pcmk_daemon_user() reports */
        if (child->uid == NULL) {
            ref_uid = &root_uid;
            ref_gid = &root_gid;
        } else {
            ref_uid = &cl_uid;
            ref_gid = &cl_gid;
            legacy_rc = pcmk_daemon_user(&cl_uid, &cl_gid);
        }

        if (legacy_rc < 0) {
            rc = pcmk_legacy2rc(legacy_rc);
            crm_err("Could not find user and group IDs for user %s: %s "
                    CRM_XS " rc=%d", CRM_DAEMON_USER, pcmk_rc_str(rc), rc);
        } else {
            rc = pcmk__ipc_is_authentic_process_active(child->endpoint,
                                                       *ref_uid, *ref_gid,
                                                       &ipc_pid);
            if ((rc == pcmk_rc_ok) || (rc == pcmk_rc_ipc_unresponsive)) {
                if (child->pid <= 0) {
                    /* No PID tracked yet: adopt whatever the probe stored in
                     * ipc_pid (still 0 if it stored nothing) */
                    child->pid = ipc_pid;
                } else if ((ipc_pid != 0) && (child->pid != ipc_pid)) {
                    /* The probe reported a PID other than the tracked one;
                     * let the PID checks below sort it out */
                    rc = pcmk_rc_ipc_unresponsive;
                }
            }
        }
    }

    /* Also reached unchanged when there is no end-point but a real PID,
     * because rc still holds its initial value */
    if (rc == pcmk_rc_ipc_unresponsive) {
        /* NOTE(review): pcmk__pid_active() presumably checks that the PID
         * exists and belongs to the named daemon -- confirm */
        int ret = pcmk__pid_active(child->pid, child->name);

        if (ipc_pid && ((ret != pcmk_rc_ok)
                        || ipc_pid == PCMK__SPECIAL_PID
                        || (pcmk__pid_active(ipc_pid,
                                             child->name) == pcmk_rc_ok))) {
            /* The probe named a PID, and the tracked PID is inactive, or the
             * probed PID is the special PID, or the probed PID is itself
             * active: switch to tracking the probed PID */
            if (ret == pcmk_rc_ok) {
                /* The old tracked PID is still active as well; it is killed
                 * before child->pid is overwritten below */
                stop_child(child, SIGKILL);
            }
            rc = pcmk_rc_ok;
            child->pid = ipc_pid;
        } else if (ret == pcmk_rc_ok) {
            /* The tracked PID is active but the IPC probe did not confirm it */
            rc = pcmk_rc_ipc_pid_only;
        } else if ((child->pid == 0) && (ret == EINVAL)) {
            /* No PID was tracked, so the PID check had nothing to look up */
            rc = pcmk_rc_ipc_unresponsive;
        } else {
            /* Map the remaining PID-check results to return codes */
            switch (ret) {
                case EACCES:
                    rc = pcmk_rc_ipc_unauthorized;
                    break;
                case ESRCH:
                    rc = pcmk_rc_ipc_unresponsive;
                    break;
                default:
                    rc = ret;
                    break;
            }
        }
    }
    return rc;
}
 661 
 662 
 663 
 664 
 665 
 666 
 667 
 668 
 669 
 670 
 671 
 672 
 673 
 674 
 675 
 676 
 677 
 678 
 679 
 680 
 681 
 682 
 683 
 684 
 685 
 686 
 687 
 688 
 689 #define WAIT_TRIES 4  
 690 int
 691 find_and_track_existing_processes(void)
     
 692 {
 693     bool wait_in_progress;
 694     int rc;
 695     size_t i, rounds;
 696 
 697     for (rounds = 1; rounds <= WAIT_TRIES; rounds++) {
 698         wait_in_progress = false;
 699         for (i = 0; i < PCMK__NELEM(pcmk_children); i++) {
 700 
 701             if ((pcmk_children[i].endpoint == NULL)
 702                 || (pcmk_children[i].respawn_count < 0)) {
 703                 continue;
 704             }
 705 
 706             rc = child_liveness(&pcmk_children[i]);
 707             if (rc == pcmk_rc_ipc_unresponsive) {
 708                 
 709 
 710 
 711 
 712                 continue;
 713             }
 714 
 715             pcmk_children[i].respawn_count = rounds;
 716             switch (rc) {
 717                 case pcmk_rc_ok:
 718                     if (pcmk_children[i].pid == PCMK__SPECIAL_PID) {
 719                         if (crm_is_true(getenv("PCMK_fail_fast"))) {
 720                             crm_crit("Cannot reliably track pre-existing"
 721                                      " authentic process behind %s IPC on this"
 722                                      " platform and PCMK_fail_fast requested",
 723                                      pcmk_children[i].endpoint);
 724                             return EOPNOTSUPP;
 725                         } else if (pcmk_children[i].respawn_count == WAIT_TRIES) {
 726                             crm_notice("Assuming pre-existing authentic, though"
 727                                        " on this platform untrackable, process"
 728                                        " behind %s IPC is stable (was in %d"
 729                                        " previous samples) so rather than"
 730                                        " bailing out (PCMK_fail_fast not"
 731                                        " requested), we just switch to a less"
 732                                        " optimal IPC liveness monitoring"
 733                                        " (not very suitable for heavy load)",
 734                                        pcmk_children[i].name, WAIT_TRIES - 1);
 735                             crm_warn("The process behind %s IPC cannot be"
 736                                      " terminated, so the overall shutdown"
 737                                      " will get delayed implicitly (%ld s),"
 738                                      " which serves as a graceful period for"
 739                                      " its native termination if it vitally"
 740                                      " depends on some other daemons going"
 741                                      " down in a controlled way already",
 742                                      pcmk_children[i].name,
 743                                      (long) SHUTDOWN_ESCALATION_PERIOD);
 744                         } else {
 745                             wait_in_progress = true;
 746                             crm_warn("Cannot reliably track pre-existing"
 747                                      " authentic process behind %s IPC on this"
 748                                      " platform, can still disappear in %d"
 749                                      " attempt(s)", pcmk_children[i].endpoint,
 750                                      WAIT_TRIES - pcmk_children[i].respawn_count);
 751                             continue;
 752                         }
 753                     }
 754                     crm_notice("Tracking existing %s process (pid=%lld)",
 755                                pcmk_children[i].name,
 756                                (long long) PCMK__SPECIAL_PID_AS_0(
 757                                                pcmk_children[i].pid));
 758                     pcmk_children[i].respawn_count = -1;  
 759                     pcmk_children[i].active_before_startup = true;
 760                     break;
 761                 case pcmk_rc_ipc_pid_only:
 762                     if (pcmk_children[i].respawn_count == WAIT_TRIES) {
 763                         crm_crit("%s IPC end-point for existing authentic"
 764                                  " process %lld did not (re)appear",
 765                                  pcmk_children[i].endpoint,
 766                                  (long long) PCMK__SPECIAL_PID_AS_0(
 767                                                  pcmk_children[i].pid));
 768                         return rc;
 769                     }
 770                     wait_in_progress = true;
 771                     crm_warn("Cannot find %s IPC end-point for existing"
 772                              " authentic process %lld, can still (re)appear"
 773                              " in %d attempts (?)",
 774                              pcmk_children[i].endpoint,
 775                              (long long) PCMK__SPECIAL_PID_AS_0(
 776                                              pcmk_children[i].pid),
 777                              WAIT_TRIES - pcmk_children[i].respawn_count);
 778                     continue;
 779                 default:
 780                     crm_crit("Checked liveness of %s: %s " CRM_XS " rc=%d",
 781                              pcmk_children[i].name, pcmk_rc_str(rc), rc);
 782                     return rc;
 783             }
 784         }
 785         if (!wait_in_progress) {
 786             break;
 787         }
 788         pcmk__sleep_ms(250); 
 789     }
 790     for (i = 0; i < PCMK__NELEM(pcmk_children); i++) {
 791         pcmk_children[i].respawn_count = 0;  
 792     }
 793 
 794     g_timeout_add_seconds(PCMK_PROCESS_CHECK_INTERVAL, check_next_subdaemon,
 795                           NULL);
 796     return pcmk_rc_ok;
 797 }
 798 
 799 gboolean
 800 init_children_processes(void *user_data)
     
 801 {
 802     if (is_corosync_cluster()) {
 803         
 804 
 805 
 806 
 807         need_root_group = false;
 808     }
 809 
 810     
 811     for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) {
 812         if (pcmk_children[i].pid != 0) {
 813             
 814             continue;
 815         }
 816 
 817         start_child(&(pcmk_children[i]));
 818     }
 819 
 820     
 821 
 822 
 823 
 824 
 825     setenv("PCMK_respawned", "true", 1);
 826     pacemakerd_state = XML_PING_ATTR_PACEMAKERDSTATE_RUNNING;
 827     return TRUE;
 828 }
 829 
 830 void
 831 pcmk_shutdown(int nsig)
     
 832 {
 833     if (shutdown_trigger == NULL) {
 834         shutdown_trigger = mainloop_add_trigger(G_PRIORITY_HIGH, pcmk_shutdown_worker, NULL);
 835     }
 836     mainloop_set_trigger(shutdown_trigger);
 837 }
 838 
 839 void
 840 restart_cluster_subdaemons(void)
     
 841 {
 842     for (int i = 0; i < PCMK__NELEM(pcmk_children); i++) {
 843         if (!pcmk_children[i].needs_retry || pcmk_children[i].pid != 0) {
 844             continue;
 845         }
 846 
 847         crm_notice("Respawning cluster-based subdaemon: %s", pcmk_children[i].name);
 848         if (start_child(&pcmk_children[i])) {
 849             pcmk_children[i].needs_retry = false;
 850         }
 851     }
 852 }
 853 
 854 static gboolean
 855 stop_child(pcmk_child_t * child, int signal)
     
 856 {
 857     if (signal == 0) {
 858         signal = SIGTERM;
 859     }
 860 
 861     
 862 
 863 
 864 
 865     if (child->command == NULL || child->pid == PCMK__SPECIAL_PID) {
 866         crm_debug("Nothing to do for child \"%s\" (process %lld)",
 867                   child->name, (long long) PCMK__SPECIAL_PID_AS_0(child->pid));
 868         return TRUE;
 869     }
 870 
 871     if (child->pid <= 0) {
 872         crm_trace("Client %s not running", child->name);
 873         return TRUE;
 874     }
 875 
 876     errno = 0;
 877     if (kill(child->pid, signal) == 0) {
 878         crm_notice("Stopping %s "CRM_XS" sent signal %d to process %lld",
 879                    child->name, signal, (long long) child->pid);
 880 
 881     } else {
 882         crm_err("Could not stop %s (process %lld) with signal %d: %s",
 883                 child->name, (long long) child->pid, signal, strerror(errno));
 884     }
 885 
 886     return TRUE;
 887 }
 888