root/lib/common/watchdog.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. sysrq_trigger
  2. panic_local
  3. panic_sbd
  4. pcmk__panic
  5. pcmk__locate_sbd
  6. pcmk__get_sbd_watchdog_timeout
  7. pcmk__get_sbd_sync_resource_startup
  8. pcmk__auto_stonith_watchdog_timeout
  9. pcmk__valid_stonith_watchdog_timeout

   1 /*
   2  * Copyright 2013-2024 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU Lesser General Public License
   7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sched.h>
  13 #include <sys/ioctl.h>
  14 #include <sys/reboot.h>
  15 
  16 #include <sys/types.h>
  17 #include <sys/stat.h>
  18 #include <unistd.h>
  19 #include <ctype.h>
  20 #include <dirent.h>
  21 #include <signal.h>
  22 
  23 static pid_t sbd_pid = 0;
  24 
  25 static void
  26 sysrq_trigger(char t)
     /* [previous][next][first][last][top][bottom][index][help] */
  27 {
  28 #if HAVE_LINUX_PROCFS
  29     FILE *procf;
  30 
  31     // Root can always write here, regardless of kernel.sysrq value
  32     procf = fopen("/proc/sysrq-trigger", "a");
  33     if (!procf) {
  34         crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
  35         return;
  36     }
  37     crm_info("sysrq-trigger: %c", t);
  38     fprintf(procf, "%c\n", t);
  39     fclose(procf);
  40 #endif // HAVE_LINUX_PROCFS
  41     return;
  42 }
  43 
  44 
  45 /*!
  46  * \internal
  47  * \brief Panic the local host (if root) or tell pacemakerd to do so
  48  */
  49 static void
  50 panic_local(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  51 {
  52     int rc = pcmk_ok;
  53     uid_t uid = geteuid();
  54     pid_t ppid = getppid();
  55     const char *panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
  56 
  57     // Default panic action is to reboot
  58     char sysrq = 'b';
  59     int reboot_cmd = RB_AUTOBOOT;
  60 
  61     if(uid != 0 && ppid > 1) {
  62         /* We're a non-root pacemaker daemon (pacemaker-based,
  63          * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
  64          * the original pacemakerd parent.
  65          *
  66          * Of these, only the controller is likely to be initiating resets.
  67          */
  68         crm_emerg("Signaling parent %lld to panic", (long long) ppid);
  69         crm_exit(CRM_EX_PANIC);
  70         return;
  71 
  72     } else if (uid != 0) {
  73 #if HAVE_LINUX_PROCFS
  74         /*
  75          * No permissions, and no pacemakerd parent to escalate to.
  76          * Track down the new pacemakerd process and send a signal instead.
  77          */
  78         union sigval signal_value;
  79 
  80         memset(&signal_value, 0, sizeof(signal_value));
  81         ppid = pcmk__procfs_pid_of("pacemakerd");
  82         crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
  83 
  84         if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
  85             crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
  86                        (long long) ppid);
  87         }
  88 #endif // HAVE_LINUX_PROCFS
  89 
  90         /* The best we can do now is die */
  91         crm_exit(CRM_EX_PANIC);
  92         return;
  93     }
  94 
  95     /* We're either pacemakerd, or a pacemaker daemon running as root */
  96 
  97     if (pcmk__starts_with(panic_action, "sync-")) {
  98         sync();
  99         panic_action += strlen("sync-");
 100     };
 101 
 102     if (pcmk__str_eq(panic_action, "crash", pcmk__str_casei)) {
 103         sysrq = 'c';
 104 
 105     } else if (pcmk__str_eq(panic_action, "off", pcmk__str_casei)) {
 106         sysrq = 'o';
 107 #ifdef RB_POWER_OFF
 108         reboot_cmd = RB_POWER_OFF;
 109 #elif defined(RB_POWEROFF)
 110         reboot_cmd = RB_POWEROFF;
 111 #endif
 112     }
 113 
 114     sysrq_trigger(sysrq);
 115     reboot(reboot_cmd);
 116     rc = errno;
 117 
 118     crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
 119               (long long) ppid, pcmk_rc_str(rc), rc);
 120 
 121     if(ppid > 1) {
 122         /* child daemon */
 123         crm_exit(CRM_EX_PANIC);
 124     } else {
 125         /* pacemakerd or orphan child */
 126         crm_exit(CRM_EX_FATAL);
 127     }
 128 }
 129 
 130 /*!
 131  * \internal
 132  * \brief Tell sbd to kill the local host, then exit
 133  */
 134 static void
 135 panic_sbd(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 136 {
 137     union sigval signal_value;
 138     pid_t ppid = getppid();
 139 
 140     crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
 141 
 142     memset(&signal_value, 0, sizeof(signal_value));
 143     /* TODO: Arrange for a slightly less brutal option? */
 144     if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
 145         crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
 146                    (long long) sbd_pid);
 147         panic_local();
 148     }
 149 
 150     if(ppid > 1) {
 151         /* child daemon */
 152         crm_exit(CRM_EX_PANIC);
 153     } else {
 154         /* pacemakerd or orphan child */
 155         crm_exit(CRM_EX_FATAL);
 156     }
 157 }
 158 
 159 /*!
 160  * \internal
 161  * \brief Panic the local host
 162  *
 163  * Panic the local host either by sbd (if running), directly, or by asking
 164  * pacemakerd. If trace logging this function, exit instead.
 165  *
 166  * \param[in] origin   Function caller (for logging only)
 167  */
 168 void
 169 pcmk__panic(const char *origin)
     /* [previous][next][first][last][top][bottom][index][help] */
 170 {
 171     /* Ensure sbd_pid is set */
 172     (void) pcmk__locate_sbd();
 173 
 174     pcmk__if_tracing(
 175         {
 176             // getppid() == 1 means our original parent no longer exists
 177             crm_emerg("Shutting down instead of panicking the node "
 178                       CRM_XS " origin=%s sbd=%lld parent=%d",
 179                       origin, (long long) sbd_pid, getppid());
 180             crm_exit(CRM_EX_FATAL);
 181             return;
 182         },
 183         {}
 184     );
 185 
 186     if(sbd_pid > 1) {
 187         crm_emerg("Signaling sbd[%lld] to panic the system: %s",
 188                   (long long) sbd_pid, origin);
 189         panic_sbd();
 190 
 191     } else {
 192         crm_emerg("Panicking the system directly: %s", origin);
 193         panic_local();
 194     }
 195 }
 196 
 197 /*!
 198  * \internal
 199  * \brief Return the process ID of sbd (or 0 if it is not running)
 200  */
 201 pid_t
 202 pcmk__locate_sbd(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 203 {
 204     char *pidfile = NULL;
 205     char *sbd_path = NULL;
 206     int rc;
 207 
 208     if(sbd_pid > 1) {
 209         return sbd_pid;
 210     }
 211 
 212     /* Look for the pid file */
 213     pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
 214     sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
 215 
 216     /* Read the pid file */
 217     rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
 218     if (rc == pcmk_rc_ok) {
 219         crm_trace("SBD detected at pid %lld (via PID file %s)",
 220                   (long long) sbd_pid, pidfile);
 221 
 222 #if HAVE_LINUX_PROCFS
 223     } else {
 224         /* Fall back to /proc for systems that support it */
 225         sbd_pid = pcmk__procfs_pid_of("sbd");
 226         crm_trace("SBD detected at pid %lld (via procfs)",
 227                   (long long) sbd_pid);
 228 #endif // HAVE_LINUX_PROCFS
 229     }
 230 
 231     if(sbd_pid < 0) {
 232         sbd_pid = 0;
 233         crm_trace("SBD not detected");
 234     }
 235 
 236     free(pidfile);
 237     free(sbd_path);
 238 
 239     return sbd_pid;
 240 }
 241 
 242 long
 243 pcmk__get_sbd_watchdog_timeout(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 244 {
 245     static long sbd_timeout = -2;
 246 
 247     if (sbd_timeout == -2) {
 248         sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
 249     }
 250     return sbd_timeout;
 251 }
 252 
 253 bool
 254 pcmk__get_sbd_sync_resource_startup(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 255 {
 256     static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
 257     static bool checked_sync_resource_startup = false;
 258 
 259     if (!checked_sync_resource_startup) {
 260         const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
 261 
 262         if (sync_env == NULL) {
 263             crm_trace("Defaulting to %sstart-up synchronization with sbd",
 264                       (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
 265 
 266         } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
 267             crm_warn("Defaulting to %sstart-up synchronization with sbd "
 268                      "because environment value '%s' is invalid",
 269                      (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
 270         }
 271         checked_sync_resource_startup = true;
 272     }
 273     return sync_resource_startup != 0;
 274 }
 275 
 276 long
 277 pcmk__auto_stonith_watchdog_timeout(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 278 {
 279     long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
 280 
 281     return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
 282 }
 283 
 284 bool
 285 pcmk__valid_stonith_watchdog_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
 286 {
 287     /* @COMPAT At a compatibility break, accept either negative values or a
 288      * specific string like "auto" (but not both) to mean "auto-calculate the
 289      * timeout." Reject other values that aren't parsable as timeouts.
 290      */
 291     long st_timeout = value? crm_get_msec(value) : 0;
 292 
 293     if (st_timeout < 0) {
 294         st_timeout = pcmk__auto_stonith_watchdog_timeout();
 295         crm_debug("Using calculated value %ld for "
 296                   PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " (%s)",
 297                   st_timeout, value);
 298     }
 299 
 300     if (st_timeout == 0) {
 301         crm_debug("Watchdog may be enabled but "
 302                   PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " is disabled (%s)",
 303                   value? value : "default");
 304 
 305     } else if (pcmk__locate_sbd() == 0) {
 306         crm_emerg("Shutting down: " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
 307                   " configured (%s) but SBD not active",
 308                   pcmk__s(value, "auto"));
 309         crm_exit(CRM_EX_FATAL);
 310         return false;
 311 
 312     } else {
 313         long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
 314 
 315         if (st_timeout < sbd_timeout) {
 316             crm_emerg("Shutting down: " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
 317                       " (%s) too short (must be >%ldms)",
 318                       value, sbd_timeout);
 319             crm_exit(CRM_EX_FATAL);
 320             return false;
 321         }
 322         crm_info("Watchdog configured with " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
 323                  " %s and SBD timeout %ldms",
 324                  value, sbd_timeout);
 325     }
 326     return true;
 327 }

/* [previous][next][first][last][top][bottom][index][help] */