pacemaker  3.0.0-d8340737c4
Scalable High-Availability cluster resource manager
watchdog.c
Go to the documentation of this file.
/*
 * Copyright 2013-2024 the Pacemaker project contributors
 *
 * The version control history for this file may have further details.
 *
 * This source code is licensed under the GNU Lesser General Public License
 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
 */
9 
#include <crm_internal.h>

#include <sched.h>
#include <sys/ioctl.h>
#include <sys/reboot.h>

#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <ctype.h>
#include <dirent.h>
#include <limits.h>
#include <signal.h>
22 
/* Cached process ID of the sbd daemon. pcmk__locate_sbd() fills it in and
 * treats any value <= 1 as "not located yet", so 0 means unknown.
 */
static pid_t sbd_pid = 0;
24 
/*!
 * \internal
 * \brief Send a single command character to the kernel's sysrq trigger
 *
 * Writes \p t followed by a newline to /proc/sysrq-trigger. Failures are only
 * logged, so callers must have their own fallback. Without Linux procfs
 * support this function does nothing.
 *
 * \param[in] t  Sysrq command character to write
 */
static void
sysrq_trigger(char t)
{
#if HAVE_LINUX_PROCFS
    // Root can always write here, regardless of kernel.sysrq value
    FILE *procf = fopen("/proc/sysrq-trigger", "a");

    if (procf == NULL) {
        crm_warn("Could not open sysrq-trigger: %s", strerror(errno));
        return;
    }

    if (fprintf(procf, "%c\n", t) < 0) {
        // Log before fclose() so errno still belongs to the failed write
        crm_warn("Could not write '%c' to sysrq-trigger: %s",
                 t, strerror(errno));
        fclose(procf);
        return;
    }

    /* The stream is buffered, so the command may not reach the kernel until
     * the stream is flushed by fclose(); a rejected write is reported there.
     */
    if (fclose(procf) != 0) {
        crm_warn("Could not write '%c' to sysrq-trigger: %s",
                 t, strerror(errno));
    }
#else
    (void) t; // Nothing to do on platforms without procfs
#endif // HAVE_LINUX_PROCFS
}
46 
53 static void
54 panic_local_nonroot(pid_t ppid)
55 {
56  if (ppid > 1) { // pacemakerd is still our parent
57  crm_emerg("Escalating panic to " PCMK__SERVER_PACEMAKERD "[%lld]",
58  (long long) ppid);
59  } else { // Signal (non-parent) pacemakerd if possible
60 #if HAVE_LINUX_PROCFS
62  if (ppid > 0) {
63  union sigval signal_value;
64 
65  crm_emerg("Signaling " PCMK__SERVER_PACEMAKERD "[%lld] to panic",
66  (long long) ppid);
67  memset(&signal_value, 0, sizeof(signal_value));
68  if (sigqueue(ppid, SIGQUIT, signal_value) < 0) {
69  crm_emerg("Exiting after signal failure: %s", strerror(errno));
70  }
71  } else {
72 #endif
73  crm_emerg("Exiting with no known " PCMK__SERVER_PACEMAKERD
74  "process");
75 #if HAVE_LINUX_PROCFS
76  }
77 #endif
78  }
80 }
81 
86 static void
87 panic_local(void)
88 {
89  const char *full_panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
90  const char *panic_action = full_panic_action;
91  int reboot_cmd = RB_AUTOBOOT; // Default panic action is reboot
92 
93  if (geteuid() != 0) { // Non-root caller such as the controller
94  panic_local_nonroot(getppid());
95  return;
96  }
97 
98  if (pcmk__starts_with(full_panic_action, "sync-")) {
99  panic_action += sizeof("sync-") - 1;
100  sync();
101  }
102 
103  if (pcmk__str_empty(full_panic_action)
104  || pcmk__str_eq(panic_action, PCMK_VALUE_REBOOT, pcmk__str_none)) {
105  sysrq_trigger('b');
106 
107  } else if (pcmk__str_eq(panic_action, PCMK_VALUE_CRASH, pcmk__str_none)) {
108  sysrq_trigger('c');
109 
110  } else if (pcmk__str_eq(panic_action, PCMK_VALUE_OFF, pcmk__str_none)) {
111  sysrq_trigger('o');
112 #ifdef RB_POWER_OFF
113  reboot_cmd = RB_POWER_OFF;
114 #elif defined(RB_POWEROFF)
115  reboot_cmd = RB_POWEROFF;
116 #endif
117  } else {
118  crm_warn("Using default '" PCMK_VALUE_REBOOT "' for local option PCMK_"
119  PCMK__ENV_PANIC_ACTION " because '%s' is not a valid value",
120  full_panic_action);
121  sysrq_trigger('b');
122  }
123 
124  // sysrq failed or is not supported on this platform, so fall back to reboot
125  reboot(reboot_cmd);
126 
127  // Even reboot failed, nothing left to do but exit
128  crm_emerg("Exiting after reboot failed: %s", strerror(errno));
129  if (getppid() > 1) { // pacemakerd is parent process
131  } else { // This is pacemakerd, or an orphaned subdaemon
133  }
134 }
135 
140 static void
141 panic_sbd(void)
142 {
143  union sigval signal_value;
144  pid_t ppid = getppid();
145 
146  memset(&signal_value, 0, sizeof(signal_value));
147  /* TODO: Arrange for a slightly less brutal option? */
148  if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
149  crm_emerg("Panicking directly because couldn't signal sbd");
150  panic_local();
151  }
152 
153  if(ppid > 1) {
154  /* child daemon */
156  } else {
157  /* pacemakerd or orphan child */
159  }
160 }
161 
171 void
172 pcmk__panic(const char *reason)
173 {
174  if (pcmk__locate_sbd() > 1) {
175  crm_emerg("Signaling sbd[%lld] to panic the system: %s",
176  (long long) sbd_pid, reason);
177  panic_sbd();
178 
179  } else {
180  crm_emerg("Panicking the system directly: %s", reason);
181  panic_local();
182  }
183 }
184 
189 pid_t
191 {
192  char *pidfile = NULL;
193  char *sbd_path = NULL;
194  int rc;
195 
196  if(sbd_pid > 1) {
197  return sbd_pid;
198  }
199 
200  /* Look for the pid file */
201  pidfile = crm_strdup_printf(PCMK__RUN_DIR "/sbd.pid");
202  sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
203 
204  /* Read the pid file */
205  rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
206  if (rc == pcmk_rc_ok) {
207  crm_trace("SBD detected at pid %lld (via PID file %s)",
208  (long long) sbd_pid, pidfile);
209 
210 #if HAVE_LINUX_PROCFS
211  } else {
212  /* Fall back to /proc for systems that support it */
213  sbd_pid = pcmk__procfs_pid_of("sbd");
214  crm_trace("SBD detected at pid %lld (via procfs)",
215  (long long) sbd_pid);
216 #endif // HAVE_LINUX_PROCFS
217  }
218 
219  if(sbd_pid < 0) {
220  sbd_pid = 0;
221  crm_trace("SBD not detected");
222  }
223 
224  free(pidfile);
225  free(sbd_path);
226 
227  return sbd_pid;
228 }
229 
/*!
 * \internal
 * \brief Get the SBD watchdog timeout from the environment
 *
 * Parses the SBD_WATCHDOG_TIMEOUT environment variable with crm_get_msec() on
 * the first call and returns the cached result afterwards.
 *
 * \return Parsed timeout in milliseconds, limited to the range of long
 *         (whatever crm_get_msec() reports for an unset or unparsable value
 *         is returned unchanged)
 */
long
pcmk__get_sbd_watchdog_timeout(void)
{
    static long sbd_timeout = 0;
    static bool parsed = false;

    if (!parsed) {
        long long msec = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));

        // crm_get_msec() returns long long; avoid a lossy narrowing to long
        if (msec > LONG_MAX) {
            msec = LONG_MAX;
        } else if (msec < LONG_MIN) {
            msec = LONG_MIN;
        }
        sbd_timeout = (long) msec;
        parsed = true;
    }
    return sbd_timeout;
}
240 
241 bool
243 {
244  static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
245  static bool checked_sync_resource_startup = false;
246 
247  if (!checked_sync_resource_startup) {
248  const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
249 
250  if (sync_env == NULL) {
251  crm_trace("Defaulting to %sstart-up synchronization with sbd",
252  (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
253 
254  } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
255  crm_warn("Defaulting to %sstart-up synchronization with sbd "
256  "because environment value '%s' is invalid",
257  (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
258  }
259  checked_sync_resource_startup = true;
260  }
261  return sync_resource_startup != 0;
262 }
263 
/*!
 * \internal
 * \brief Calculate a watchdog fencing timeout from the SBD watchdog timeout
 *
 * \return Twice the SBD watchdog timeout (saturating at LONG_MAX), or 0 if
 *         the SBD watchdog timeout is not positive
 */
long
pcmk__auto_stonith_watchdog_timeout(void)
{
    long sbd_timeout = pcmk__get_sbd_watchdog_timeout();

    if (sbd_timeout <= 0) {
        return 0;
    }

    // Doubling a value above LONG_MAX / 2 would overflow a signed long
    if (sbd_timeout > (LONG_MAX / 2)) {
        return LONG_MAX;
    }
    return 2 * sbd_timeout;
}
271 
272 bool
274 {
275  /* @COMPAT At a compatibility break, accept either negative values or a
276  * specific string like "auto" (but not both) to mean "auto-calculate the
277  * timeout." Reject other values that aren't parsable as timeouts.
278  */
279  long st_timeout = value? crm_get_msec(value) : 0;
280 
281  if (st_timeout < 0) {
283  crm_debug("Using calculated value %ld for "
285  st_timeout, value);
286  }
287 
288  if (st_timeout == 0) {
289  crm_debug("Watchdog may be enabled but "
290  PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " is disabled (%s)",
291  value? value : "default");
292 
293  } else if (pcmk__locate_sbd() == 0) {
295  " configured (%s) but SBD not active",
296  pcmk__s(value, "auto"));
298  return false;
299 
300  } else {
301  long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
302 
303  if (st_timeout < sbd_timeout) {
305  " (%s) too short (must be >%ldms)",
306  value, sbd_timeout);
308  return false;
309  }
310  crm_info("Watchdog configured with " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
311  " %s and SBD timeout %ldms",
312  value, sbd_timeout);
313  }
314  return true;
315 }
_Noreturn crm_exit_t crm_exit(crm_exit_t rc)
Definition: results.c:1044
bool pcmk__get_sbd_sync_resource_startup(void)
Definition: watchdog.c:242
Panic the local host.
Definition: results.h:263
const char * pcmk__env_option(const char *option)
Definition: options.c:1075
#define crm_warn(fmt, args...)
Definition: logging.h:362
#define crm_emerg(fmt, args...)
Definition: logging.h:355
#define crm_debug(fmt, args...)
Definition: logging.h:370
#define PCMK__ENV_PANIC_ACTION
#define crm_trace(fmt, args...)
Definition: logging.h:372
long pcmk__get_sbd_watchdog_timeout(void)
Definition: watchdog.c:231
#define PCMK_VALUE_REBOOT
Definition: options.h:199
bool pcmk__valid_stonith_watchdog_timeout(const char *value)
Definition: watchdog.c:273
void pcmk__panic(const char *reason)
Definition: watchdog.c:172
long long crm_get_msec(const char *input)
Parse a time+units string and return milliseconds equivalent.
Definition: strings.c:351
pid_t pcmk__procfs_pid_of(const char *name)
Definition: procfs.c:107
#define PCMK__SBD_SYNC_DEFAULT
Definition: config.h:526
#define SBIN_DIR
Definition: config.h:535
#define PCMK__RUN_DIR
Definition: config.h:523
pid_t pcmk__locate_sbd(void)
Definition: watchdog.c:190
long pcmk__auto_stonith_watchdog_timeout(void)
Definition: watchdog.c:265
#define PCMK_VALUE_OFF
Definition: options.h:184
int pcmk__pidfile_matches(const char *filename, pid_t expected_pid, const char *expected_name, pid_t *pid)
Definition: pid.c:168
int crm_str_to_boolean(const char *s, int *ret)
Definition: strings.c:498
bool pcmk__starts_with(const char *str, const char *prefix)
Check whether a string starts with a certain sequence.
Definition: strings.c:558
#define PCMK__SERVER_PACEMAKERD
#define PCMK_VALUE_CRASH
Definition: options.h:139
#define crm_info(fmt, args...)
Definition: logging.h:367
Do not respawn.
Definition: results.h:262
#define PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
Definition: options.h:68
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1