root/lib/common/watchdog.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. sysrq_trigger
  2. panic_local_nonroot
  3. panic_local
  4. panic_sbd
  5. pcmk__panic
  6. pcmk__locate_sbd
  7. pcmk__get_sbd_watchdog_timeout
  8. pcmk__get_sbd_sync_resource_startup
  9. pcmk__auto_stonith_watchdog_timeout
  10. pcmk__valid_stonith_watchdog_timeout

   1 /*
   2  * Copyright 2013-2024 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU Lesser General Public License
   7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sched.h>
  13 #include <sys/ioctl.h>
  14 #include <sys/reboot.h>
  15 
  16 #include <sys/types.h>
  17 #include <sys/stat.h>
  18 #include <unistd.h>
  19 #include <ctype.h>
  20 #include <dirent.h>
  21 #include <signal.h>
  22 
  23 static pid_t sbd_pid = 0; // Cached PID of sbd, filled in by pcmk__locate_sbd(); 0 while sbd has not been located
  24 
  25 /*!
  26  * \internal
  27  * \brief Trigger a sysrq command if supported on current platform
  28  *
  29  * \param[in] t  Sysrq command to trigger
  30  */
  31 static void
  32 sysrq_trigger(char t)
     /* [previous][next][first][last][top][bottom][index][help] */
  33 {
  34 #if HAVE_LINUX_PROCFS
  35     // Root can always write here, regardless of kernel.sysrq value
  36     FILE *procf = fopen("/proc/sysrq-trigger", "a");
  37 
  38     if (procf == NULL) {
  39         crm_warn("Could not open sysrq-trigger: %s", strerror(errno));
  40     } else {
  41         fprintf(procf, "%c\n", t);
  42         fclose(procf);
  43     }
  44 #endif // HAVE_LINUX_PROCFS
  45 }
  46 
  47 /*!
  48  * \internal
  49  * \brief Tell pacemakerd to panic the local host
  50  *
  51  * \param[in] ppid  Process ID of parent process
  52  */
  53 static void
  54 panic_local_nonroot(pid_t ppid)
     /* [previous][next][first][last][top][bottom][index][help] */
  55 {
  56     if (ppid > 1) { // pacemakerd is still our parent
  57         crm_emerg("Escalating panic to " PCMK__SERVER_PACEMAKERD "[%lld]",
  58                   (long long) ppid);
  59     } else { // Signal (non-parent) pacemakerd if possible
  60 #if HAVE_LINUX_PROCFS
  61         ppid = pcmk__procfs_pid_of(PCMK__SERVER_PACEMAKERD);
  62         if (ppid > 0) {
  63             union sigval signal_value;
  64 
  65             crm_emerg("Signaling " PCMK__SERVER_PACEMAKERD "[%lld] to panic",
  66                       (long long) ppid);
  67             memset(&signal_value, 0, sizeof(signal_value));
  68             if (sigqueue(ppid, SIGQUIT, signal_value) < 0) {
  69                 crm_emerg("Exiting after signal failure: %s", strerror(errno));
  70             }
  71         } else {
  72 #endif
  73             crm_emerg("Exiting with no known " PCMK__SERVER_PACEMAKERD
  74                       "process");
  75 #if HAVE_LINUX_PROCFS
  76         }
  77 #endif
  78     }
  79     crm_exit(CRM_EX_PANIC);
  80 }
  81 
  82 /*!
  83  * \internal
  84  * \brief Panic the local host (if root) or tell pacemakerd to do so
  85  */
  86 static void
  87 panic_local(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  88 {
  89     const char *full_panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
  90     const char *panic_action = full_panic_action;
  91     int reboot_cmd = RB_AUTOBOOT; // Default panic action is reboot
  92 
  93     if (geteuid() != 0) { // Non-root caller such as the controller
  94         panic_local_nonroot(getppid());
  95         return;
  96     }
  97 
  98     if (pcmk__starts_with(full_panic_action, "sync-")) {
  99         panic_action += sizeof("sync-") - 1;
 100         sync();
 101     }
 102 
 103     if (pcmk__str_empty(full_panic_action)
 104         || pcmk__str_eq(panic_action, PCMK_VALUE_REBOOT, pcmk__str_none)) {
 105         sysrq_trigger('b');
 106 
 107     } else if (pcmk__str_eq(panic_action, PCMK_VALUE_CRASH, pcmk__str_none)) {
 108         sysrq_trigger('c');
 109 
 110     } else if (pcmk__str_eq(panic_action, PCMK_VALUE_OFF, pcmk__str_none)) {
 111         sysrq_trigger('o');
 112 #ifdef RB_POWER_OFF
 113         reboot_cmd = RB_POWER_OFF;
 114 #elif defined(RB_POWEROFF)
 115         reboot_cmd = RB_POWEROFF;
 116 #endif
 117     } else {
 118         crm_warn("Using default '" PCMK_VALUE_REBOOT "' for local option PCMK_"
 119                  PCMK__ENV_PANIC_ACTION " because '%s' is not a valid value",
 120                  full_panic_action);
 121         sysrq_trigger('b');
 122     }
 123 
 124     // sysrq failed or is not supported on this platform, so fall back to reboot
 125     reboot(reboot_cmd);
 126 
 127     // Even reboot failed, nothing left to do but exit
 128     crm_emerg("Exiting after reboot failed: %s", strerror(errno));
 129     if (getppid() > 1) { // pacemakerd is parent process
 130         crm_exit(CRM_EX_PANIC);
 131     } else { // This is pacemakerd, or an orphaned subdaemon
 132         crm_exit(CRM_EX_FATAL);
 133     }
 134 }
 135 
 136 /*!
 137  * \internal
 138  * \brief Tell sbd to kill the local host, then exit
 139  */
 140 static void
 141 panic_sbd(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 142 {
 143     union sigval signal_value;
 144     pid_t ppid = getppid();
 145 
 146     memset(&signal_value, 0, sizeof(signal_value));
 147     /* TODO: Arrange for a slightly less brutal option? */
 148     if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
 149         crm_emerg("Panicking directly because couldn't signal sbd");
 150         panic_local();
 151     }
 152 
 153     if(ppid > 1) {
 154         /* child daemon */
 155         crm_exit(CRM_EX_PANIC);
 156     } else {
 157         /* pacemakerd or orphan child */
 158         crm_exit(CRM_EX_FATAL);
 159     }
 160 }
 161 
 162 /*!
 163  * \internal
 164  * \brief Panic the local host
 165  *
 166  * Panic the local host either by sbd (if running), directly, or by asking
 167  * pacemakerd. If trace logging this function, exit instead.
 168  *
 169  * \param[in] reason  Why panic is needed (for logging only)
 170  */
 171 void
 172 pcmk__panic(const char *reason)
     /* [previous][next][first][last][top][bottom][index][help] */
 173 {
 174     if (pcmk__locate_sbd() > 1) {
 175         crm_emerg("Signaling sbd[%lld] to panic the system: %s",
 176                   (long long) sbd_pid, reason);
 177         panic_sbd();
 178 
 179     } else {
 180         crm_emerg("Panicking the system directly: %s", reason);
 181         panic_local();
 182     }
 183 }
 184 
 185 /*!
 186  * \internal
 187  * \brief Return the process ID of sbd (or 0 if it is not running)
 188  */
 189 pid_t
 190 pcmk__locate_sbd(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 191 {
 192     char *pidfile = NULL;
 193     char *sbd_path = NULL;
 194     int rc;
 195 
 196     if(sbd_pid > 1) {
 197         return sbd_pid;
 198     }
 199 
 200     /* Look for the pid file */
 201     pidfile = crm_strdup_printf(PCMK__RUN_DIR "/sbd.pid");
 202     sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
 203 
 204     /* Read the pid file */
 205     rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
 206     if (rc == pcmk_rc_ok) {
 207         crm_trace("SBD detected at pid %lld (via PID file %s)",
 208                   (long long) sbd_pid, pidfile);
 209 
 210 #if HAVE_LINUX_PROCFS
 211     } else {
 212         /* Fall back to /proc for systems that support it */
 213         sbd_pid = pcmk__procfs_pid_of("sbd");
 214         crm_trace("SBD detected at pid %lld (via procfs)",
 215                   (long long) sbd_pid);
 216 #endif // HAVE_LINUX_PROCFS
 217     }
 218 
 219     if(sbd_pid < 0) {
 220         sbd_pid = 0;
 221         crm_trace("SBD not detected");
 222     }
 223 
 224     free(pidfile);
 225     free(sbd_path);
 226 
 227     return sbd_pid;
 228 }
 229 
 230 long
 231 pcmk__get_sbd_watchdog_timeout(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 232 {
 233     static long sbd_timeout = -2;
 234 
 235     if (sbd_timeout == -2) {
 236         sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
 237     }
 238     return sbd_timeout;
 239 }
 240 
 241 bool
 242 pcmk__get_sbd_sync_resource_startup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 243 {
 244     static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
 245     static bool checked_sync_resource_startup = false;
 246 
 247     if (!checked_sync_resource_startup) {
 248         const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
 249 
 250         if (sync_env == NULL) {
 251             crm_trace("Defaulting to %sstart-up synchronization with sbd",
 252                       (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
 253 
 254         } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
 255             crm_warn("Defaulting to %sstart-up synchronization with sbd "
 256                      "because environment value '%s' is invalid",
 257                      (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
 258         }
 259         checked_sync_resource_startup = true;
 260     }
 261     return sync_resource_startup != 0;
 262 }
 263 
 264 long
 265 pcmk__auto_stonith_watchdog_timeout(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 266 {
 267     long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
 268 
 269     return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
 270 }
 271 
 272 bool
 273 pcmk__valid_stonith_watchdog_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
 274 {
 275     /* @COMPAT At a compatibility break, accept either negative values or a
 276      * specific string like "auto" (but not both) to mean "auto-calculate the
 277      * timeout." Reject other values that aren't parsable as timeouts.
 278      */
 279     long st_timeout = value? crm_get_msec(value) : 0;
 280 
 281     if (st_timeout < 0) {
 282         st_timeout = pcmk__auto_stonith_watchdog_timeout();
 283         crm_debug("Using calculated value %ld for "
 284                   PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " (%s)",
 285                   st_timeout, value);
 286     }
 287 
 288     if (st_timeout == 0) {
 289         crm_debug("Watchdog may be enabled but "
 290                   PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " is disabled (%s)",
 291                   value? value : "default");
 292 
 293     } else if (pcmk__locate_sbd() == 0) {
 294         crm_emerg("Shutting down: " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
 295                   " configured (%s) but SBD not active",
 296                   pcmk__s(value, "auto"));
 297         crm_exit(CRM_EX_FATAL);
 298         return false;
 299 
 300     } else {
 301         long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
 302 
 303         if (st_timeout < sbd_timeout) {
 304             crm_emerg("Shutting down: " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
 305                       " (%s) too short (must be >%ldms)",
 306                       value, sbd_timeout);
 307             crm_exit(CRM_EX_FATAL);
 308             return false;
 309         }
 310         crm_info("Watchdog configured with " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
 311                  " %s and SBD timeout %ldms",
 312                  value, sbd_timeout);
 313     }
 314     return true;
 315 }

/* [previous][next][first][last][top][bottom][index][help] */