root/lib/common/watchdog.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. panic_local_nonroot
  2. panic_local
  3. panic_sbd
  4. pcmk__panic
  5. pcmk__locate_sbd
  6. pcmk__get_sbd_watchdog_timeout
  7. pcmk__get_sbd_sync_resource_startup
  8. pcmk__auto_stonith_watchdog_timeout
  9. pcmk__valid_stonith_watchdog_timeout

   1 /*
   2  * Copyright 2013-2024 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU Lesser General Public License
   7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sched.h>
  13 #include <sys/ioctl.h>
  14 #include <sys/reboot.h>
  15 
  16 #include <sys/types.h>
  17 #include <sys/stat.h>
  18 #include <unistd.h>
  19 #include <ctype.h>
  20 #include <dirent.h>
  21 #include <signal.h>
  22 
  23 static pid_t sbd_pid = 0; // Cached PID of sbd; set by pcmk__locate_sbd(), 0 when not (yet) found
  24 
  25 /*!
  26  * \internal
  27  * \brief Tell pacemakerd to panic the local host
  28  *
  29  * \param[in] ppid  Process ID of parent process
  30  */
  31 static void
  32 panic_local_nonroot(pid_t ppid)
     /* [previous][next][first][last][top][bottom][index][help] */
  33 {
  34     if (ppid > 1) { // pacemakerd is still our parent
  35         crm_emerg("Escalating panic to " PCMK__SERVER_PACEMAKERD "[%lld]",
  36                   (long long) ppid);
  37     } else { // Signal (non-parent) pacemakerd if possible
  38         ppid = pcmk__procfs_pid_of(PCMK__SERVER_PACEMAKERD);
  39         if (ppid > 0) {
  40             union sigval signal_value;
  41 
  42             crm_emerg("Signaling " PCMK__SERVER_PACEMAKERD "[%lld] to panic",
  43                       (long long) ppid);
  44             memset(&signal_value, 0, sizeof(signal_value));
  45             if (sigqueue(ppid, SIGQUIT, signal_value) < 0) {
  46                 crm_emerg("Exiting after signal failure: %s", strerror(errno));
  47             }
  48         } else {
  49             crm_emerg("Exiting with no known " PCMK__SERVER_PACEMAKERD
  50                       "process");
  51         }
  52     }
  53     crm_exit(CRM_EX_PANIC);
  54 }
  55 
  56 /*!
  57  * \internal
  58  * \brief Panic the local host (if root) or tell pacemakerd to do so
  59  */
  60 static void
  61 panic_local(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  62 {
  63     const char *full_panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
  64     const char *panic_action = full_panic_action;
  65     int reboot_cmd = RB_AUTOBOOT; // Default panic action is reboot
  66 
  67     if (geteuid() != 0) { // Non-root caller such as the controller
  68         panic_local_nonroot(getppid());
  69         return;
  70     }
  71 
  72     if (pcmk__starts_with(full_panic_action, "sync-")) {
  73         panic_action += sizeof("sync-") - 1;
  74         sync();
  75     }
  76 
  77     if (pcmk__str_empty(full_panic_action)
  78         || pcmk__str_eq(panic_action, PCMK_VALUE_REBOOT, pcmk__str_none)) {
  79         pcmk__sysrq_trigger('b');
  80 
  81     } else if (pcmk__str_eq(panic_action, PCMK_VALUE_CRASH, pcmk__str_none)) {
  82         pcmk__sysrq_trigger('c');
  83 
  84     } else if (pcmk__str_eq(panic_action, PCMK_VALUE_OFF, pcmk__str_none)) {
  85         pcmk__sysrq_trigger('o');
  86 #ifdef RB_POWER_OFF
  87         reboot_cmd = RB_POWER_OFF;
  88 #elif defined(RB_POWEROFF)
  89         reboot_cmd = RB_POWEROFF;
  90 #endif
  91     } else {
  92         crm_warn("Using default '" PCMK_VALUE_REBOOT "' for local option PCMK_"
  93                  PCMK__ENV_PANIC_ACTION " because '%s' is not a valid value",
  94                  full_panic_action);
  95         pcmk__sysrq_trigger('b');
  96     }
  97 
  98     // sysrq failed or is not supported on this platform, so fall back to reboot
  99     reboot(reboot_cmd);
 100 
 101     // Even reboot failed, nothing left to do but exit
 102     crm_emerg("Exiting after reboot failed: %s", strerror(errno));
 103     if (getppid() > 1) { // pacemakerd is parent process
 104         crm_exit(CRM_EX_PANIC);
 105     } else { // This is pacemakerd, or an orphaned subdaemon
 106         crm_exit(CRM_EX_FATAL);
 107     }
 108 }
 109 
 110 /*!
 111  * \internal
 112  * \brief Tell sbd to kill the local host, then exit
 113  */
 114 static void
 115 panic_sbd(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 116 {
 117     union sigval signal_value;
 118     pid_t ppid = getppid();
 119 
 120     memset(&signal_value, 0, sizeof(signal_value));
 121     /* TODO: Arrange for a slightly less brutal option? */
 122     if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
 123         crm_emerg("Panicking directly because couldn't signal sbd");
 124         panic_local();
 125     }
 126 
 127     if(ppid > 1) {
 128         /* child daemon */
 129         crm_exit(CRM_EX_PANIC);
 130     } else {
 131         /* pacemakerd or orphan child */
 132         crm_exit(CRM_EX_FATAL);
 133     }
 134 }
 135 
 136 /*!
 137  * \internal
 138  * \brief Panic the local host
 139  *
 140  * Panic the local host either by sbd (if running), directly, or by asking
 141  * pacemakerd. If trace logging this function, exit instead.
 142  *
 143  * \param[in] reason  Why panic is needed (for logging only)
 144  */
 145 void
 146 pcmk__panic(const char *reason)
     /* [previous][next][first][last][top][bottom][index][help] */
 147 {
 148     if (pcmk__locate_sbd() > 1) {
 149         crm_emerg("Signaling sbd[%lld] to panic the system: %s",
 150                   (long long) sbd_pid, reason);
 151         panic_sbd();
 152 
 153     } else {
 154         crm_emerg("Panicking the system directly: %s", reason);
 155         panic_local();
 156     }
 157 }
 158 
 159 /*!
 160  * \internal
 161  * \brief Return the process ID of sbd (or 0 if it is not running)
 162  */
 163 pid_t
 164 pcmk__locate_sbd(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 165 {
 166     const char *pidfile = PCMK__RUN_DIR "/sbd.pid";
 167     int rc;
 168 
 169     if(sbd_pid > 1) {
 170         return sbd_pid;
 171     }
 172 
 173     /* Read the pid file */
 174     rc = pcmk__pidfile_matches(pidfile, 0, SBIN_DIR "/sbd", &sbd_pid);
 175     if (rc == pcmk_rc_ok) {
 176         crm_trace("SBD detected at pid %lld (via PID file %s)",
 177                   (long long) sbd_pid, pidfile);
 178     } else {
 179         /* Fall back to /proc for systems that support it */
 180         sbd_pid = pcmk__procfs_pid_of("sbd");
 181 
 182         if (sbd_pid != 0) {
 183             crm_trace("SBD detected at pid %lld (via procfs)",
 184                       (long long) sbd_pid);
 185         }
 186     }
 187 
 188     if(sbd_pid < 0) {
 189         sbd_pid = 0;
 190         crm_trace("SBD not detected");
 191     }
 192 
 193     return sbd_pid;
 194 }
 195 
 196 long
 197 pcmk__get_sbd_watchdog_timeout(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 198 {
 199     static long sbd_timeout = -2;
 200 
 201     if (sbd_timeout == -2) {
 202         sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
 203     }
 204     return sbd_timeout;
 205 }
 206 
 207 bool
 208 pcmk__get_sbd_sync_resource_startup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 209 {
 210     static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
 211     static bool checked_sync_resource_startup = false;
 212 
 213     if (!checked_sync_resource_startup) {
 214         const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
 215 
 216         if (sync_env == NULL) {
 217             crm_trace("Defaulting to %sstart-up synchronization with sbd",
 218                       (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
 219 
 220         } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
 221             crm_warn("Defaulting to %sstart-up synchronization with sbd "
 222                      "because environment value '%s' is invalid",
 223                      (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
 224         }
 225         checked_sync_resource_startup = true;
 226     }
 227     return sync_resource_startup != 0;
 228 }
 229 
 230 long
 231 pcmk__auto_stonith_watchdog_timeout(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 232 {
 233     long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
 234 
 235     return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
 236 }
 237 
 238 bool
 239 pcmk__valid_stonith_watchdog_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
 240 {
 241     /* @COMPAT At a compatibility break, accept either negative values or a
 242      * specific string like "auto" (but not both) to mean "auto-calculate the
 243      * timeout." Reject other values that aren't parsable as timeouts.
 244      */
 245     long st_timeout = value? crm_get_msec(value) : 0;
 246 
 247     if (st_timeout < 0) {
 248         st_timeout = pcmk__auto_stonith_watchdog_timeout();
 249         crm_debug("Using calculated value %ld for "
 250                   PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " (%s)",
 251                   st_timeout, value);
 252     }
 253 
 254     if (st_timeout == 0) {
 255         crm_debug("Watchdog may be enabled but "
 256                   PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " is disabled (%s)",
 257                   value? value : "default");
 258 
 259     } else if (pcmk__locate_sbd() == 0) {
 260         crm_emerg("Shutting down: " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
 261                   " configured (%s) but SBD not active",
 262                   pcmk__s(value, "auto"));
 263         crm_exit(CRM_EX_FATAL);
 264         return false;
 265 
 266     } else {
 267         long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
 268 
 269         if (st_timeout < sbd_timeout) {
 270             crm_emerg("Shutting down: " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
 271                       " (%s) too short (must be >%ldms)",
 272                       value, sbd_timeout);
 273             crm_exit(CRM_EX_FATAL);
 274             return false;
 275         }
 276         crm_info("Watchdog configured with " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
 277                  " %s and SBD timeout %ldms",
 278                  value, sbd_timeout);
 279     }
 280     return true;
 281 }

/* [previous][next][first][last][top][bottom][index][help] */