root/lib/common/watchdog.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. sysrq_init
  2. sysrq_trigger
  3. pcmk_panic_local
  4. pcmk_panic_sbd
  5. pcmk_panic
  6. pcmk_locate_sbd
  7. crm_get_sbd_timeout
  8. check_sbd_timeout

   1 /*
   2  * Copyright (C) 2013 Lars Marowsky-Bree <lmb@suse.com>
   3  *               2014 Andrew Beekhof <andrew@beekhof.net>
   4  *
   5  * This source code is licensed under the GNU Lesser General Public License
   6  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
   7  */
   8 
   9 #include <crm_internal.h>
  10 
  11 #include <sched.h>
  12 #include <sys/ioctl.h>
  13 #include <sys/reboot.h>
  14 
  15 #include <sys/types.h>
  16 #include <sys/stat.h>
  17 #include <unistd.h>
  18 #include <ctype.h>
  19 #include <dirent.h>
  20 
  21 #ifdef _POSIX_MEMLOCK
  22 #  include <sys/mman.h>
  23 #endif
  24 
  25 static int sbd_pid = 0;
  26 
  27 enum pcmk_panic_flags
  28 {
  29     pcmk_panic_none     = 0x00,
  30     pcmk_panic_delay    = 0x01,
  31     pcmk_panic_kdump    = 0x02,
  32     pcmk_panic_shutdown = 0x04,
  33 };
  34 
  35 #define SYSRQ "/proc/sys/kernel/sysrq"
  36 
  37 void
  38 sysrq_init(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  39 {
  40     static bool need_init = true;
  41     FILE* procf;
  42     int c;
  43 
  44     if(need_init) {
  45         need_init = false;
  46     } else {
  47         return;
  48     }
  49 
  50     procf = fopen(SYSRQ, "r");
  51     if (!procf) {
  52         crm_perror(LOG_ERR, "Cannot open "SYSRQ" for read");
  53         return;
  54     }
  55     if (fscanf(procf, "%d", &c) != 1) {
  56         crm_perror(LOG_ERR, "Parsing "SYSRQ" failed");
  57         c = 0;
  58     }
  59     fclose(procf);
  60     if (c == 1)
  61         return;
  62 
  63     /* 8 for debugging dumps of processes, 128 for reboot/poweroff */
  64     c |= 136;
  65     procf = fopen(SYSRQ, "w");
  66     if (!procf) {
  67         crm_perror(LOG_ERR, "Cannot write to "SYSRQ);
  68         return;
  69     }
  70     fprintf(procf, "%d", c);
  71     fclose(procf);
  72     return;
  73 }
  74 
  75 static void
  76 sysrq_trigger(char t)
     /* [previous][next][first][last][top][bottom][index][help] */
  77 {
  78     FILE *procf;
  79 
  80     sysrq_init();
  81 
  82     procf = fopen("/proc/sysrq-trigger", "a");
  83     if (!procf) {
  84         crm_perror(LOG_ERR, "Opening sysrq-trigger failed");
  85         return;
  86     }
  87     crm_info("sysrq-trigger: %c", t);
  88     fprintf(procf, "%c\n", t);
  89     fclose(procf);
  90     return;
  91 }
  92 
  93 
  94 static void
  95 pcmk_panic_local(void) 
     /* [previous][next][first][last][top][bottom][index][help] */
  96 {
  97     int rc = pcmk_ok;
  98     uid_t uid = geteuid();
  99     pid_t ppid = getppid();
 100 
 101     if(uid != 0 && ppid > 1) {
 102         /* We're a non-root pacemaker daemon (cib, crmd, pengine,
 103          * attrd, etc) with the original pacemakerd parent
 104          *
 105          * Of these, only crmd is likely to be initiating resets
 106          */
 107         do_crm_log_always(LOG_EMERG, "Signaling parent %d to panic", ppid);
 108         crm_exit(pcmk_err_panic);
 109         return;
 110 
 111     } else if (uid != 0) {
 112         /*
 113          * No permissions and no pacemakerd parent to escalate to
 114          * Track down the new pacakerd process and send a signal instead
 115          */
 116         union sigval signal_value;
 117 
 118         memset(&signal_value, 0, sizeof(signal_value));
 119         ppid = crm_procfs_pid_of("pacemakerd");
 120         do_crm_log_always(LOG_EMERG, "Signaling pacemakerd(%d) to panic", ppid);
 121 
 122         if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
 123             crm_perror(LOG_EMERG, "Cannot signal pacemakerd(%d) to panic", ppid);
 124         }
 125         /* The best we can do now is die */
 126         crm_exit(pcmk_err_panic);
 127         return;
 128     }
 129 
 130     /* We're either pacemakerd, or a pacemaker daemon running as root */
 131 
 132     if (safe_str_eq("crash", getenv("PCMK_panic_action"))) {
 133         sysrq_trigger('c');
 134     } else {
 135         sysrq_trigger('b');
 136     }
 137     /* reboot(RB_HALT_SYSTEM); rc = errno; */
 138     reboot(RB_AUTOBOOT);
 139     rc = errno;
 140 
 141     do_crm_log_always(LOG_EMERG, "Reboot failed, escalating to %d: %s (%d)", ppid, pcmk_strerror(rc), rc);
 142 
 143     if(ppid > 1) {
 144         /* child daemon */
 145         exit(pcmk_err_panic);
 146     } else {
 147         /* pacemakerd or orphan child */
 148         exit(DAEMON_RESPAWN_STOP);
 149     }
 150 }
 151 
 152 static void
 153 pcmk_panic_sbd(void) 
     /* [previous][next][first][last][top][bottom][index][help] */
 154 {
 155     union sigval signal_value;
 156     pid_t ppid = getppid();
 157 
 158     do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic", sbd_pid);
 159 
 160     memset(&signal_value, 0, sizeof(signal_value));
 161     /* TODO: Arrange for a slightly less brutal option? */
 162     if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
 163         crm_perror(LOG_EMERG, "Cannot signal SBD(%d) to terminate", sbd_pid);
 164         pcmk_panic_local();
 165     }
 166 
 167     if(ppid > 1) {
 168         /* child daemon */
 169         exit(pcmk_err_panic);
 170     } else {
 171         /* pacemakerd or orphan child */
 172         exit(DAEMON_RESPAWN_STOP);
 173     }
 174 }
 175 
 176 void
 177 pcmk_panic(const char *origin) 
     /* [previous][next][first][last][top][bottom][index][help] */
 178 {
 179     static struct qb_log_callsite *panic_cs = NULL;
 180 
 181     if (panic_cs == NULL) {
 182         panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay", LOG_TRACE, __LINE__, crm_trace_nonlog);
 183     }
 184 
 185     /* Ensure sbd_pid is set */
 186     (void)pcmk_locate_sbd();
 187 
 188     if (panic_cs && panic_cs->targets) {
 189         /* getppid() == 1 means our original parent no longer exists */
 190         do_crm_log_always(LOG_EMERG,
 191                           "Shutting down instead of panicking the node: origin=%s, sbd=%d, parent=%d",
 192                           origin, sbd_pid, getppid());
 193         crm_exit(DAEMON_RESPAWN_STOP);
 194         return;
 195     }
 196 
 197     if(sbd_pid > 1) {
 198         do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic the system: %s", sbd_pid, origin);
 199         pcmk_panic_sbd();
 200 
 201     } else {
 202         do_crm_log_always(LOG_EMERG, "Panicking the system directly: %s", origin);
 203         pcmk_panic_local();
 204     }
 205 }
 206 
 207 pid_t
 208 pcmk_locate_sbd(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 209 {
 210     char *pidfile = NULL;
 211     char *sbd_path = NULL;
 212 
 213     if(sbd_pid > 1) {
 214         return sbd_pid;
 215     }
 216 
 217     /* Look for the pid file */
 218     pidfile = crm_strdup_printf("%s/sbd.pid", HA_STATE_DIR);
 219     sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
 220 
 221     /* Read the pid file */
 222     CRM_ASSERT(pidfile);
 223 
 224     sbd_pid = crm_pidfile_inuse(pidfile, 0, sbd_path);
 225     if(sbd_pid > 0) {
 226         crm_trace("SBD detected at pid=%d (file)", sbd_pid);
 227 
 228     } else {
 229         /* Fall back to /proc for systems that support it */
 230         sbd_pid = crm_procfs_pid_of("sbd");
 231         crm_trace("SBD detected at pid=%d (proc)", sbd_pid);
 232     }
 233 
 234     if(sbd_pid < 0) {
 235         sbd_pid = 0;
 236         crm_trace("SBD not detected");
 237     }
 238 
 239     free(pidfile);
 240     free(sbd_path);
 241 
 242     return sbd_pid;
 243 }
 244 
 245 long
 246 crm_get_sbd_timeout(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 247 {
 248     const char *env_value = getenv("SBD_WATCHDOG_TIMEOUT");
 249     long sbd_timeout = crm_get_msec(env_value);
 250 
 251     return sbd_timeout;
 252 }
 253 
 254 gboolean
 255 check_sbd_timeout(const char *value)
     /* [previous][next][first][last][top][bottom][index][help] */
 256 {
 257     long st_timeout = value? crm_get_msec(value) : 0;
 258 
 259     if (st_timeout <= 0) {
 260         crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
 261                   value? value : "default");
 262 
 263     } else if (pcmk_locate_sbd() == 0) {
 264         do_crm_log_always(LOG_EMERG,
 265                           "Shutting down: stonith-watchdog-timeout configured (%s) but SBD not active",
 266                           value);
 267         crm_exit(DAEMON_RESPAWN_STOP);
 268         return FALSE;
 269 
 270     } else {
 271         long sbd_timeout = crm_get_sbd_timeout();
 272 
 273         if (st_timeout < sbd_timeout) {
 274             do_crm_log_always(LOG_EMERG,
 275                               "Shutting down: stonith-watchdog-timeout (%s) too short (must be >%ldms)",
 276                               value, sbd_timeout);
 277             crm_exit(DAEMON_RESPAWN_STOP);
 278             return FALSE;
 279         }
 280         crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
 281                  value, sbd_timeout);
 282     }
 283     return TRUE;
 284 }

/* [previous][next][first][last][top][bottom][index][help] */