root/lib/common/watchdog.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. sysrq_trigger
  2. panic_local
  3. panic_sbd
  4. pcmk__panic
  5. pcmk__locate_sbd
  6. pcmk__get_sbd_timeout
  7. pcmk__get_sbd_sync_resource_startup
  8. pcmk__auto_watchdog_timeout
  9. pcmk__valid_sbd_timeout

   1 /*
   2  * Copyright 2013-2023 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU Lesser General Public License
   7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sched.h>
  13 #include <sys/ioctl.h>
  14 #include <sys/reboot.h>
  15 
  16 #include <sys/types.h>
  17 #include <sys/stat.h>
  18 #include <unistd.h>
  19 #include <ctype.h>
  20 #include <dirent.h>
  21 #include <signal.h>
  22 
  23 static pid_t sbd_pid = 0; // Cached PID of sbd as stored by pcmk__locate_sbd(); 0 means not detected
  24 
  25 static void
  26 sysrq_trigger(char t)
     /* [previous][next][first][last][top][bottom][index][help] */
  27 {
  28 #if HAVE_LINUX_PROCFS
  29     FILE *procf;
  30 
  31     // Root can always write here, regardless of kernel.sysrq value
  32     procf = fopen("/proc/sysrq-trigger", "a");
  33     if (!procf) {
  34         crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
  35         return;
  36     }
  37     crm_info("sysrq-trigger: %c", t);
  38     fprintf(procf, "%c\n", t);
  39     fclose(procf);
  40 #endif // HAVE_LINUX_PROCFS
  41     return;
  42 }
  43 
  44 
  45 /*!
  46  * \internal
  47  * \brief Panic the local host (if root) or tell pacemakerd to do so
  48  */
  49 static void
  50 panic_local(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  51 {
  52     int rc = pcmk_ok;
  53     uid_t uid = geteuid();
  54     pid_t ppid = getppid();
  55     const char *panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
  56 
  57     if(uid != 0 && ppid > 1) {
  58         /* We're a non-root pacemaker daemon (pacemaker-based,
  59          * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
  60          * the original pacemakerd parent.
  61          *
  62          * Of these, only the controller is likely to be initiating resets.
  63          */
  64         crm_emerg("Signaling parent %lld to panic", (long long) ppid);
  65         crm_exit(CRM_EX_PANIC);
  66         return;
  67 
  68     } else if (uid != 0) {
  69 #if HAVE_LINUX_PROCFS
  70         /*
  71          * No permissions, and no pacemakerd parent to escalate to.
  72          * Track down the new pacemakerd process and send a signal instead.
  73          */
  74         union sigval signal_value;
  75 
  76         memset(&signal_value, 0, sizeof(signal_value));
  77         ppid = pcmk__procfs_pid_of("pacemakerd");
  78         crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
  79 
  80         if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
  81             crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
  82                        (long long) ppid);
  83         }
  84 #endif // HAVE_LINUX_PROCFS
  85 
  86         /* The best we can do now is die */
  87         crm_exit(CRM_EX_PANIC);
  88         return;
  89     }
  90 
  91     /* We're either pacemakerd, or a pacemaker daemon running as root */
  92 
  93     if (pcmk__str_eq(panic_action, "crash", pcmk__str_casei)) {
  94         sysrq_trigger('c');
  95 
  96     } else if (pcmk__str_eq(panic_action, "sync-crash", pcmk__str_casei)) {
  97         sync();
  98         sysrq_trigger('c');
  99 
 100     } else {
 101         if (pcmk__str_eq(panic_action, "sync-reboot", pcmk__str_casei)) {
 102             sync();
 103         }
 104         sysrq_trigger('b');
 105     }
 106     /* reboot(RB_HALT_SYSTEM); rc = errno; */
 107     reboot(RB_AUTOBOOT);
 108     rc = errno;
 109 
 110     crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
 111               (long long) ppid, pcmk_rc_str(rc), rc);
 112 
 113     if(ppid > 1) {
 114         /* child daemon */
 115         exit(CRM_EX_PANIC);
 116     } else {
 117         /* pacemakerd or orphan child */
 118         exit(CRM_EX_FATAL);
 119     }
 120 }
 121 
 122 /*!
 123  * \internal
 124  * \brief Tell sbd to kill the local host, then exit
 125  */
 126 static void
 127 panic_sbd(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 128 {
 129     union sigval signal_value;
 130     pid_t ppid = getppid();
 131 
 132     crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
 133 
 134     memset(&signal_value, 0, sizeof(signal_value));
 135     /* TODO: Arrange for a slightly less brutal option? */
 136     if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
 137         crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
 138                    (long long) sbd_pid);
 139         panic_local();
 140     }
 141 
 142     if(ppid > 1) {
 143         /* child daemon */
 144         exit(CRM_EX_PANIC);
 145     } else {
 146         /* pacemakerd or orphan child */
 147         exit(CRM_EX_FATAL);
 148     }
 149 }
 150 
 151 /*!
 152  * \internal
 153  * \brief Panic the local host
 154  *
 155  * Panic the local host either by sbd (if running), directly, or by asking
 156  * pacemakerd. If trace logging this function, exit instead.
 157  *
 158  * \param[in] origin   Function caller (for logging only)
 159  */
 160 void
 161 pcmk__panic(const char *origin)
     /* [previous][next][first][last][top][bottom][index][help] */
 162 {
 163     /* Ensure sbd_pid is set */
 164     (void) pcmk__locate_sbd();
 165 
 166     pcmk__if_tracing(
 167         {
 168             // getppid() == 1 means our original parent no longer exists
 169             crm_emerg("Shutting down instead of panicking the node "
 170                       CRM_XS " origin=%s sbd=%lld parent=%d",
 171                       origin, (long long) sbd_pid, getppid());
 172             crm_exit(CRM_EX_FATAL);
 173             return;
 174         },
 175         {}
 176     );
 177 
 178     if(sbd_pid > 1) {
 179         crm_emerg("Signaling sbd[%lld] to panic the system: %s",
 180                   (long long) sbd_pid, origin);
 181         panic_sbd();
 182 
 183     } else {
 184         crm_emerg("Panicking the system directly: %s", origin);
 185         panic_local();
 186     }
 187 }
 188 
 189 /*!
 190  * \internal
 191  * \brief Return the process ID of sbd (or 0 if it is not running)
 192  */
 193 pid_t
 194 pcmk__locate_sbd(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 195 {
 196     char *pidfile = NULL;
 197     char *sbd_path = NULL;
 198     int rc;
 199 
 200     if(sbd_pid > 1) {
 201         return sbd_pid;
 202     }
 203 
 204     /* Look for the pid file */
 205     pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
 206     sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
 207 
 208     /* Read the pid file */
 209     rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
 210     if (rc == pcmk_rc_ok) {
 211         crm_trace("SBD detected at pid %lld (via PID file %s)",
 212                   (long long) sbd_pid, pidfile);
 213 
 214 #if HAVE_LINUX_PROCFS
 215     } else {
 216         /* Fall back to /proc for systems that support it */
 217         sbd_pid = pcmk__procfs_pid_of("sbd");
 218         crm_trace("SBD detected at pid %lld (via procfs)",
 219                   (long long) sbd_pid);
 220 #endif // HAVE_LINUX_PROCFS
 221     }
 222 
 223     if(sbd_pid < 0) {
 224         sbd_pid = 0;
 225         crm_trace("SBD not detected");
 226     }
 227 
 228     free(pidfile);
 229     free(sbd_path);
 230 
 231     return sbd_pid;
 232 }
 233 
 234 long
 235 pcmk__get_sbd_timeout(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 236 {
 237     static long sbd_timeout = -2;
 238 
 239     if (sbd_timeout == -2) {
 240         sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
 241     }
 242     return sbd_timeout;
 243 }
 244 
 245 bool
 246 pcmk__get_sbd_sync_resource_startup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 247 {
 248     static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
 249     static bool checked_sync_resource_startup = false;
 250 
 251     if (!checked_sync_resource_startup) {
 252         const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
 253 
 254         if (sync_env == NULL) {
 255             crm_trace("Defaulting to %sstart-up synchronization with sbd",
 256                       (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
 257 
 258         } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
 259             crm_warn("Defaulting to %sstart-up synchronization with sbd "
 260                      "because environment value '%s' is invalid",
 261                      (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
 262         }
 263         checked_sync_resource_startup = true;
 264     }
 265     return sync_resource_startup != 0;
 266 }
 267 
 268 long
 269 pcmk__auto_watchdog_timeout(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 270 {
 271     long sbd_timeout = pcmk__get_sbd_timeout();
 272 
 273     return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
 274 }
 275 
 276 bool
 277 pcmk__valid_sbd_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
 278 {
 279     long st_timeout = value? crm_get_msec(value) : 0;
 280 
 281     if (st_timeout < 0) {
 282         st_timeout = pcmk__auto_watchdog_timeout();
 283         crm_debug("Using calculated value %ld for stonith-watchdog-timeout (%s)",
 284                   st_timeout, value);
 285     }
 286 
 287     if (st_timeout == 0) {
 288         crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
 289                   value? value : "default");
 290 
 291     } else if (pcmk__locate_sbd() == 0) {
 292         crm_emerg("Shutting down: stonith-watchdog-timeout configured (%s) "
 293                   "but SBD not active", (value? value : "auto"));
 294         crm_exit(CRM_EX_FATAL);
 295         return false;
 296 
 297     } else {
 298         long sbd_timeout = pcmk__get_sbd_timeout();
 299 
 300         if (st_timeout < sbd_timeout) {
 301             crm_emerg("Shutting down: stonith-watchdog-timeout (%s) too short "
 302                       "(must be >%ldms)", value, sbd_timeout);
 303             crm_exit(CRM_EX_FATAL);
 304             return false;
 305         }
 306         crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
 307                  value, sbd_timeout);
 308     }
 309     return true;
 310 }

/* [previous][next][first][last][top][bottom][index][help] */