pacemaker  2.0.2-debe490
Scalable High-Availability cluster resource manager
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
watchdog.c
Go to the documentation of this file.
1 /*
2  * Copyright 2013 Lars Marowsky-Bree <lmb@suse.com>
3  * 2014-2018 Andrew Beekhof <andrew@beekhof.net>
4  *
5  * This source code is licensed under the GNU Lesser General Public License
6  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
7  */
8 
9 #include <crm_internal.h>
10 
11 #include <sched.h>
12 #include <sys/ioctl.h>
13 #include <sys/reboot.h>
14 
15 #include <sys/types.h>
16 #include <sys/stat.h>
17 #include <unistd.h>
18 #include <ctype.h>
19 #include <dirent.h>
20 #include <signal.h>
21 
22 #ifdef _POSIX_MEMLOCK
23 # include <sys/mman.h>
24 #endif
25 
/* PID of the sbd daemon as last resolved by pcmk_locate_sbd(); stays 0 until
 * sbd has been found, and is reset to 0 when a lookup yields a negative value.
 */
26 static int sbd_pid = 0;
27 
29 {
34 };
35 
/*
 * Send one command character to the kernel through /proc/sysrq-trigger.
 *
 * Compiled to a no-op unless SUPPORT_PROCFS is enabled. Every failure is only
 * logged: the callers in this file always follow up with reboot(), so there is
 * nothing more useful to do with an error here.
 *
 * t: command character to write (this file passes 'c' or 'b')
 */
static void
sysrq_trigger(char t)
{
#if SUPPORT_PROCFS
    FILE *procf;

    // Root can always write here, regardless of kernel.sysrq value
    procf = fopen("/proc/sysrq-trigger", "a");
    if (!procf) {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
        return;
    }
    crm_info("sysrq-trigger: %c", t);

    if (fprintf(procf, "%c\n", t) < 0) {
        crm_perror(LOG_WARNING, "Writing sysrq-trigger failed");
    }

    /* stdio buffers the write, so a rejected write typically only surfaces
     * when the stream is flushed by fclose()
     */
    if (fclose(procf) != 0) {
        crm_perror(LOG_WARNING, "Closing sysrq-trigger failed");
    }
#else
    (void) t; // nothing to do without procfs support
#endif // SUPPORT_PROCFS
}
54 
55 
/*
 * Panic the local node directly, or hand the job to a process that can.
 *
 * Three cases, chosen from the effective UID and the parent PID:
 *  - not root, parent PID > 1: log that the parent is being signalled, then
 *    leave (see the review note at the gap below);
 *  - not root, no usable parent: when SUPPORT_PROCFS is enabled, look up
 *    "pacemakerd" with crm_procfs_pid_of() and queue SIGQUIT to it, then leave;
 *  - otherwise: trigger sysrq 'c' if the PCMK_panic_action environment
 *    variable equals "crash", else sysrq 'b', then call reboot(RB_AUTOBOOT).
 *    If reboot() returns, log the errno value and exit with CRM_EX_PANIC
 *    (parent PID > 1) or CRM_EX_FATAL.
 */
56 static void
57 pcmk_panic_local(void)
58 {
59  int rc = pcmk_ok;
60  uid_t uid = geteuid();
61  pid_t ppid = getppid();
62 
63  if(uid != 0 && ppid > 1) {
64  /* We're a non-root pacemaker daemon (pacemaker-based,
65  * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
66  * the original pacemakerd parent.
67  *
68  * Of these, only the controller is likely to be initiating resets.
69  */
70  do_crm_log_always(LOG_EMERG, "Signaling parent %d to panic", ppid);
/* NOTE(review): original line 71 is missing from this listing (the embedded
 * numbering jumps from 70 to 72). The page's cross-reference index lists
 * crm_exit(), which appears in no visible line, so the dropped line is
 * presumably a crm_exit(...) call -- confirm against the upstream file.
 */
72  return;
73 
74  } else if (uid != 0) {
75 #if SUPPORT_PROCFS
76  /*
77  * No permissions, and no pacemakerd parent to escalate to.
78  * Track down the new pacemakerd process and send a signal instead.
79  */
80  union sigval signal_value;
81 
82  memset(&signal_value, 0, sizeof(signal_value));
83  ppid = crm_procfs_pid_of("pacemakerd");
84  do_crm_log_always(LOG_EMERG, "Signaling pacemakerd(%d) to panic", ppid);
85 
86  if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
87  crm_perror(LOG_EMERG, "Cannot signal pacemakerd(%d) to panic", ppid);
88  }
89 #endif // SUPPORT_PROCFS
90 
91  /* The best we can do now is die */
/* NOTE(review): original line 92 is missing from this listing (numbering
 * jumps from 91 to 93); presumably another crm_exit(...) call -- confirm
 * against the upstream file.
 */
93  return;
94  }
95 
96  /* We're either pacemakerd, or a pacemaker daemon running as root */
97 
98  if (safe_str_eq("crash", getenv("PCMK_panic_action"))) {
99  sysrq_trigger('c');
100  } else {
101  sysrq_trigger('b');
102  }
103  /* reboot(RB_HALT_SYSTEM); rc = errno; */
104  reboot(RB_AUTOBOOT);
/* Getting past reboot() means it returned, i.e. it failed; capture errno
 * before any later call can overwrite it.
 */
105  rc = errno;
106 
107  do_crm_log_always(LOG_EMERG, "Reboot failed, escalating to %d: %s (%d)", ppid, pcmk_strerror(rc), rc);
108 
109  if(ppid > 1) {
110  /* child daemon */
111  exit(CRM_EX_PANIC);
112  } else {
113  /* pacemakerd or orphan child */
114  exit(CRM_EX_FATAL);
115  }
116 }
117 
118 static void
119 pcmk_panic_sbd(void)
120 {
121  union sigval signal_value;
122  pid_t ppid = getppid();
123 
124  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic", sbd_pid);
125 
126  memset(&signal_value, 0, sizeof(signal_value));
127  /* TODO: Arrange for a slightly less brutal option? */
128  if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
129  crm_perror(LOG_EMERG, "Cannot signal SBD(%d) to terminate", sbd_pid);
130  pcmk_panic_local();
131  }
132 
133  if(ppid > 1) {
134  /* child daemon */
135  exit(CRM_EX_PANIC);
136  } else {
137  /* pacemakerd or orphan child */
138  exit(CRM_EX_FATAL);
139  }
140 }
141 
/*
 * Panic the node, preferring sbd when it is running.
 *
 * First refreshes sbd_pid through pcmk_locate_sbd(). If the "panic-delay"
 * log callsite has any targets set, the node is not panicked: the function
 * logs "Shutting down instead of panicking the node" and leaves (see the
 * review note at the gap below). Otherwise it calls pcmk_panic_sbd() when
 * sbd_pid > 1, or pcmk_panic_local() when it is not.
 *
 * origin: free-form text identifying the caller; only used in log messages
 */
142 void
143 pcmk_panic(const char *origin)
144 {
/* Looked up once and kept for the lifetime of the process */
145  static struct qb_log_callsite *panic_cs = NULL;
146 
147  if (panic_cs == NULL) {
148  panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay", LOG_TRACE, __LINE__, crm_trace_nonlog);
149  }
150 
151  /* Ensure sbd_pid is set */
152  (void)pcmk_locate_sbd();
153 
/* NOTE(review): presumably `targets` is non-zero only when tracing has been
 * enabled for the "panic-delay" callsite -- confirm against libqb.
 */
154  if (panic_cs && panic_cs->targets) {
155  /* getppid() == 1 means our original parent no longer exists */
156  do_crm_log_always(LOG_EMERG,
157  "Shutting down instead of panicking the node: origin=%s, sbd=%d, parent=%d",
158  origin, sbd_pid, getppid());
/* NOTE(review): original line 159 is missing from this listing (numbering
 * jumps from 158 to 160). The cross-reference index lists crm_exit(), which
 * appears in no visible line, so a crm_exit(...) call presumably belongs
 * here -- confirm against the upstream file.
 */
160  return;
161  }
162 
163  if(sbd_pid > 1) {
164  do_crm_log_always(LOG_EMERG, "Signaling sbd(%d) to panic the system: %s", sbd_pid, origin);
165  pcmk_panic_sbd();
166 
167  } else {
168  do_crm_log_always(LOG_EMERG, "Panicking the system directly: %s", origin);
169  pcmk_panic_local();
170  }
171 }
172 
173 pid_t
175 {
176  char *pidfile = NULL;
177  char *sbd_path = NULL;
178 
179  if(sbd_pid > 1) {
180  return sbd_pid;
181  }
182 
183  /* Look for the pid file */
184  pidfile = crm_strdup_printf("%s/sbd.pid", HA_STATE_DIR);
185  sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
186 
187  /* Read the pid file */
188  CRM_ASSERT(pidfile);
189 
190  sbd_pid = crm_pidfile_inuse(pidfile, 0, sbd_path);
191  if(sbd_pid > 0) {
192  crm_trace("SBD detected at pid=%d (file)", sbd_pid);
193 
194 #if SUPPORT_PROCFS
195  } else {
196  /* Fall back to /proc for systems that support it */
197  sbd_pid = crm_procfs_pid_of("sbd");
198  crm_trace("SBD detected at pid=%d (proc)", sbd_pid);
199 #endif // SUPPORT_PROCFS
200  }
201 
202  if(sbd_pid < 0) {
203  sbd_pid = 0;
204  crm_trace("SBD not detected");
205  }
206 
207  free(pidfile);
208  free(sbd_path);
209 
210  return sbd_pid;
211 }
212 
/*
 * Return the SBD watchdog timeout taken from the environment.
 *
 * The SBD_WATCHDOG_TIMEOUT environment variable is parsed with crm_get_msec()
 * on the first call and the result is cached in a function-local static for
 * later calls.
 */
long
crm_get_sbd_timeout(void)
{
    /* -2 is the "environment not read yet" marker */
    static long cached_timeout = -2;

    if (cached_timeout != -2) {
        return cached_timeout;
    }

    cached_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
    return cached_timeout;
}
223 
/*
 * Compute the automatic watchdog timeout: twice the value reported by
 * crm_get_sbd_timeout(), or 0 when that value is zero or negative.
 */
long
crm_auto_watchdog_timeout(void)
{
    long base = crm_get_sbd_timeout();

    if (base > 0) {
        return 2 * base;
    }
    return 0;
}
231 
/*
 * Validate a stonith-watchdog-timeout setting against the SBD configuration.
 *
 * value: textual timeout, parsed with crm_get_msec(); NULL is treated as 0.
 *        A negative parse result is replaced by crm_auto_watchdog_timeout().
 *
 * Returns TRUE when the resulting timeout is 0, or when sbd is located and
 * the timeout is not smaller than crm_get_sbd_timeout(). Returns FALSE when
 * a non-zero timeout is configured but pcmk_locate_sbd() finds no sbd, or
 * when the timeout is smaller than the SBD timeout.
 */
232 gboolean
233 check_sbd_timeout(const char *value)
234 {
235  long st_timeout = value? crm_get_msec(value) : 0;
236 
237  if (st_timeout < 0) {
238  st_timeout = crm_auto_watchdog_timeout();
239  crm_debug("Using calculated value %ld for stonith-watchdog-timeout (%s)",
240  st_timeout, value);
241  }
242 
243  if (st_timeout == 0) {
244  crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
245  value? value : "default");
246 
247  } else if (pcmk_locate_sbd() == 0) {
248  do_crm_log_always(LOG_EMERG,
249  "Shutting down: stonith-watchdog-timeout configured (%s) but SBD not active",
250  (value? value : "auto"));
/* NOTE(review): original line 251 is missing from this listing (numbering
 * jumps from 250 to 252). The cross-reference index lists crm_exit(), which
 * appears in no visible line, so a crm_exit(...) call presumably belongs
 * here -- confirm against the upstream file.
 */
252  return FALSE;
253 
254  } else {
255  long sbd_timeout = crm_get_sbd_timeout();
256 
/* NOTE(review): the message below says "must be >" but this comparison
 * accepts a timeout equal to the SBD timeout -- confirm which is intended.
 */
257  if (st_timeout < sbd_timeout) {
258  do_crm_log_always(LOG_EMERG,
259  "Shutting down: stonith-watchdog-timeout (%s) too short (must be >%ldms)",
260  value, sbd_timeout);
/* NOTE(review): original line 261 is missing from this listing (numbering
 * jumps from 260 to 262); presumably another crm_exit(...) call -- confirm
 * against the upstream file.
 */
262  return FALSE;
263  }
264  crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
265  value, sbd_timeout);
266  }
267  return TRUE;
268 }
#define LOG_TRACE
Definition: logging.h:26
const char * pcmk_strerror(int rc)
Definition: results.c:188
long crm_get_sbd_timeout(void)
Definition: watchdog.c:214
_Noreturn crm_exit_t crm_exit(crm_exit_t rc)
Definition: results.c:476
gboolean check_sbd_timeout(const char *value)
Definition: watchdog.c:233
long long crm_get_msec(const char *input)
Definition: utils.c:567
unsigned int crm_trace_nonlog
Definition: logging.c:39
pcmk_panic_flags
Definition: watchdog.c:28
long crm_auto_watchdog_timeout(void)
Definition: watchdog.c:225
#define crm_debug(fmt, args...)
Definition: logging.h:245
#define crm_trace(fmt, args...)
Definition: logging.h:246
void pcmk_panic(const char *origin)
Definition: watchdog.c:143
#define HA_STATE_DIR
Definition: config.h:455
long crm_pidfile_inuse(const char *filename, long mypid, const char *daemon)
Definition: pid.c:141
#define do_crm_log_always(level, fmt, args...)
Log a message using constant severity.
Definition: logging.h:205
#define SBIN_DIR
Definition: config.h:550
#define crm_perror(level, fmt, args...)
Log a system error message.
Definition: logging.h:218
#define CRM_ASSERT(expr)
Definition: results.h:42
#define pcmk_ok
Definition: results.h:57
pid_t pcmk_locate_sbd(void)
Definition: watchdog.c:174
#define safe_str_eq(a, b)
Definition: util.h:59
int crm_procfs_pid_of(const char *name)
Definition: procfs.c:110
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
#define crm_info(fmt, args...)
Definition: logging.h:243