pacemaker  2.1.8-3980678f03
Scalable High-Availability cluster resource manager
watchdog.c
Go to the documentation of this file.
1 /*
2  * Copyright 2013-2024 the Pacemaker project contributors
3  *
4  * The version control history for this file may have further details.
5  *
6  * This source code is licensed under the GNU Lesser General Public License
7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8  */
9 
10 #include <crm_internal.h>
11 
12 #include <sched.h>
13 #include <sys/ioctl.h>
14 #include <sys/reboot.h>
15 
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <unistd.h>
19 #include <ctype.h>
20 #include <dirent.h>
21 #include <signal.h>
22 
/* Cached PID of the sbd daemon for this process. It starts at 0, values
 * greater than 1 are treated as "already located" by the lookup code in this
 * file, and negative lookup results are reset to 0.
 */
23 static pid_t sbd_pid = 0;
24 
/*
 * Write one SysRq command character, followed by a newline, to
 * /proc/sysrq-trigger.
 *
 * t: the command character to send.
 *
 * The whole body is compiled only when HAVE_LINUX_PROCFS is true; otherwise
 * the function does nothing. If the trigger file cannot be opened, a warning
 * is logged and nothing is written.
 */
static void
sysrq_trigger(char t)
{
#if HAVE_LINUX_PROCFS
    // Root can always write here, regardless of kernel.sysrq value
    FILE *trigger = fopen("/proc/sysrq-trigger", "a");

    if (trigger != NULL) {
        crm_info("sysrq-trigger: %c", t);
        fprintf(trigger, "%c\n", t);
        fclose(trigger);
    } else {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
    }
#endif // HAVE_LINUX_PROCFS
}
43 
44 
/*
 * Panic the local node without the help of sbd.
 *
 * When running as root (effective UID 0), this optionally calls sync() (if the
 * configured panic action starts with "sync-"), sends a SysRq command via
 * sysrq_trigger(), and calls reboot(). The default is SysRq 'b' with
 * RB_AUTOBOOT; the action "crash" selects SysRq 'c', and "off" selects
 * SysRq 'o' plus a power-off reboot command where one is defined.
 *
 * When not running as root, the function only logs and, if built with
 * HAVE_LINUX_PROCFS and orphaned from its parent, queues SIGQUIT to the
 * pacemakerd process found through procfs.
 *
 * NOTE(review): this listing skips embedded source lines 69, 91, 123 and 126,
 * so any statements that belong there are not visible here (the neighbouring
 * comments suggest process exits). Confirm against the real file before
 * relying on the control flow shown below.
 */
49 static void
50 panic_local(void)
51 {
52  int rc = pcmk_ok;
53  uid_t uid = geteuid();
54  pid_t ppid = getppid();
55  const char *panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
56 
57  // Default panic action is to reboot
58  char sysrq = 'b';
59  int reboot_cmd = RB_AUTOBOOT;
60 
61  if(uid != 0 && ppid > 1) {
62  /* We're a non-root pacemaker daemon (pacemaker-based,
63  * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
64  * the original pacemakerd parent.
65  *
66  * Of these, only the controller is likely to be initiating resets.
67  */
68  crm_emerg("Signaling parent %lld to panic", (long long) ppid);
 // NOTE(review): embedded line 69 is missing from this listing
70  return;
71 
72  } else if (uid != 0) {
73 #if HAVE_LINUX_PROCFS
74  /*
75  * No permissions, and no pacemakerd parent to escalate to.
76  * Track down the new pacemakerd process and send a signal instead.
77  */
78  union sigval signal_value;
79 
80  memset(&signal_value, 0, sizeof(signal_value));
81  ppid = pcmk__procfs_pid_of("pacemakerd");
82  crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
83 
 // Only signal when the lookup produced a PID greater than 1
84  if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
85  crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
86  (long long) ppid);
87  }
88 #endif // HAVE_LINUX_PROCFS
89 
90  /* The best we can do now is die */
 // NOTE(review): embedded line 91 is missing from this listing
92  return;
93  }
94 
95  /* We're either pacemakerd, or a pacemaker daemon running as root */
96 
 // A "sync-" prefix requests a sync() first; the remainder of the string is
 // then interpreted as the actual action. The ';' after the closing brace
 // is an empty statement.
97  if (pcmk__starts_with(panic_action, "sync-")) {
98  sync();
99  panic_action += strlen("sync-");
100  };
101 
102  if (pcmk__str_eq(panic_action, "crash", pcmk__str_casei)) {
103  sysrq = 'c';
104 
105  } else if (pcmk__str_eq(panic_action, "off", pcmk__str_casei)) {
106  sysrq = 'o';
 // Use whichever spelling of the power-off constant the platform defines;
 // if neither exists, reboot_cmd keeps its RB_AUTOBOOT default
107 #ifdef RB_POWER_OFF
108  reboot_cmd = RB_POWER_OFF;
109 #elif defined(RB_POWEROFF)
110  reboot_cmd = RB_POWEROFF;
111 #endif
112  }
113 
114  sysrq_trigger(sysrq);
115  reboot(reboot_cmd);
 // Execution continuing past reboot() is logged below as a failure; errno
 // is captured right away, before any later call can overwrite it
116  rc = errno;
117 
118  crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
119  (long long) ppid, pcmk_rc_str(rc), rc);
120 
 // NOTE(review): both branches are empty in this listing because embedded
 // lines 123 and 126 are missing
121  if(ppid > 1) {
122  /* child daemon */
124  } else {
125  /* pacemakerd or orphan child */
127  }
128 }
129 
/*
 * Ask sbd to panic the node by queueing SIGKILL to the cached sbd_pid.
 *
 * If the signal cannot be queued, the failure is logged and panic_local() is
 * used as a fallback.
 *
 * NOTE(review): embedded source lines 152 and 155 are missing from this
 * listing, so both branches of the final if/else appear empty here. Confirm
 * the real statements against the original file.
 */
134 static void
135 panic_sbd(void)
136 {
137  union sigval signal_value;
138  pid_t ppid = getppid();
139 
140  crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
141 
 // The signal carries no payload, so the value is simply zeroed
142  memset(&signal_value, 0, sizeof(signal_value));
143  /* TODO: Arrange for a slightly less brutal option? */
144  if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
145  crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
146  (long long) sbd_pid);
147  panic_local();
148  }
149 
150  if(ppid > 1) {
151  /* child daemon */
153  } else {
154  /* pacemakerd or orphan child */
156  }
157 }
158 
/*
 * Panic the local node, preferring sbd when it is running.
 *
 * origin: text identifying the caller; it is only used in log messages here.
 *
 * After refreshing sbd_pid via pcmk__locate_sbd(), this calls panic_sbd() when
 * sbd_pid is greater than 1 and panic_local() otherwise.
 *
 * NOTE(review): embedded source lines 174 and 180 are missing from this
 * listing. The two brace blocks followed by ");" look like the arguments of a
 * macro invocation whose opening line was lost. The cross-reference index at
 * the end of this listing mentions pcmk__if_tracing(if_action, else_action),
 * which is presumably that macro; confirm against the original file.
 */
168 void
169 pcmk__panic(const char *origin)
170 {
171  /* Ensure sbd_pid is set */
172  (void) pcmk__locate_sbd();
173 
175  {
176  // getppid() == 1 means our original parent no longer exists
177  crm_emerg("Shutting down instead of panicking the node "
178  CRM_XS " origin=%s sbd=%lld parent=%d",
179  origin, (long long) sbd_pid, getppid());
181  return;
182  },
183  {}
184  );
185 
186  if(sbd_pid > 1) {
187  crm_emerg("Signaling sbd[%lld] to panic the system: %s",
188  (long long) sbd_pid, origin);
189  panic_sbd();
190 
191  } else {
192  crm_emerg("Panicking the system directly: %s", origin);
193  panic_local();
194  }
195 }
196 
/*
 * Find the PID of the sbd daemon and cache it in sbd_pid.
 *
 * A cached value greater than 1 is returned immediately. Otherwise the PID
 * file PCMK_RUN_DIR "/sbd.pid" is checked against the expected executable
 * path SBIN_DIR "/sbd" using pcmk__pidfile_matches(). If that does not report
 * pcmk_rc_ok and the build has HAVE_LINUX_PROCFS, procfs is searched for a
 * process named "sbd". A negative result is normalised to 0.
 *
 * Returns the value left in sbd_pid.
 *
 * NOTE(review): the declarator line (embedded line 202) is missing from this
 * listing. The cross-reference index at the end of the listing gives it as
 * "pid_t pcmk__locate_sbd(void)", defined at watchdog.c:202.
 */
201 pid_t
203 {
204  char *pidfile = NULL;
205  char *sbd_path = NULL;
206  int rc;
207 
208  if(sbd_pid > 1) {
209  return sbd_pid;
210  }
211 
212  /* Look for the pid file */
213  pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
214  sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
215 
216  /* Read the pid file */
217  rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
218  if (rc == pcmk_rc_ok) {
219  crm_trace("SBD detected at pid %lld (via PID file %s)",
220  (long long) sbd_pid, pidfile);
221 
222 #if HAVE_LINUX_PROCFS
223  } else {
224  /* Fall back to /proc for systems that support it */
225  sbd_pid = pcmk__procfs_pid_of("sbd");
 // This trace is emitted whatever value the procfs lookup returned
226  crm_trace("SBD detected at pid %lld (via procfs)",
227  (long long) sbd_pid);
228 #endif // HAVE_LINUX_PROCFS
229  }
230 
231  if(sbd_pid < 0) {
232  sbd_pid = 0;
233  crm_trace("SBD not detected");
234  }
235 
 // Both strings were allocated above and are no longer needed
236  free(pidfile);
237  free(sbd_path);
238 
239  return sbd_pid;
240 }
241 
/*
 * Return the sbd watchdog timeout taken from the SBD_WATCHDOG_TIMEOUT
 * environment variable, as parsed by crm_get_msec().
 *
 * The environment is read on the first call only; the result is kept in a
 * function-local static and returned unchanged afterwards.
 *
 * NOTE(review): the declarator line (embedded line 243) is missing from this
 * listing. The cross-reference index at the end of the listing gives it as
 * "long pcmk__get_sbd_watchdog_timeout(void)", defined at watchdog.c:243.
 * That index also shows crm_get_msec() returning long long, so the assignment
 * below narrows to long.
 */
242 long
244 {
 // -2 marks "environment not read yet"
245  static long sbd_timeout = -2;
246 
247  if (sbd_timeout == -2) {
248  sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
249  }
250  return sbd_timeout;
251 }
252 
/*
 * Report whether start-up should be synchronized with sbd.
 *
 * The SBD_SYNC_RESOURCE_STARTUP environment variable is examined once. If it
 * is unset, PCMK__SBD_SYNC_DEFAULT is used. If it is set, it is handed to
 * crm_str_to_boolean(); when that call returns a negative value, a warning
 * about falling back to the default is logged. The outcome is cached in
 * function-local statics for later calls.
 *
 * Returns true when the resulting setting is nonzero.
 *
 * NOTE(review): the declarator line (embedded line 254) is missing from this
 * listing. The cross-reference index at the end of the listing gives it as
 * "bool pcmk__get_sbd_sync_resource_startup(void)", defined at watchdog.c:254.
 */
253 bool
255 {
256  static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
257  static bool checked_sync_resource_startup = false;
258 
259  if (!checked_sync_resource_startup) {
260  const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
261 
262  if (sync_env == NULL) {
263  crm_trace("Defaulting to %sstart-up synchronization with sbd",
264  (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
265 
 // NOTE(review): the warning text assumes a failed parse leaves
 // sync_resource_startup at its default; verify in crm_str_to_boolean()
266  } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
267  crm_warn("Defaulting to %sstart-up synchronization with sbd "
268  "because environment value '%s' is invalid",
269  (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
270  }
271  checked_sync_resource_startup = true;
272  }
273  return sync_resource_startup != 0;
274 }
275 
/*
 * Compute an automatic value from the sbd watchdog timeout.
 *
 * Returns 0 when pcmk__get_sbd_watchdog_timeout() yields zero or a negative
 * value, and twice that timeout otherwise.
 *
 * NOTE(review): the declarator line (embedded line 277) is missing from this
 * listing. The cross-reference index at the end of the listing gives it as
 * "long pcmk__auto_stonith_watchdog_timeout(void)", defined at watchdog.c:277.
 */
276 long
278 {
279  long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
280 
281  return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
282 }
283 
/*
 * Check a configured stonith watchdog timeout value against the sbd setup.
 *
 * value: the configured timeout string; NULL is treated as a timeout of 0.
 *
 * As far as this listing shows, the function returns true when the parsed
 * timeout is 0, and false when sbd cannot be located or when the timeout is
 * smaller than the sbd watchdog timeout; otherwise it logs the configuration
 * and returns true.
 *
 * NOTE(review): the declarator line (embedded line 285) and embedded lines
 * 294, 296, 306, 309, 316 and 319 are missing from this listing, so several
 * statements below are fragments. The cross-reference index at the end of the
 * listing gives the declarator as
 * "bool pcmk__valid_stonith_watchdog_timeout(const char *value)", defined at
 * watchdog.c:285. Confirm the missing statements against the original file.
 */
284 bool
286 {
287  /* @COMPAT At a compatibility break, accept either negative values or a
288  * specific string like "auto" (but not both) to mean "auto-calculate the
289  * timeout." Reject other values that aren't parsable as timeouts.
290  */
291  long st_timeout = value? crm_get_msec(value) : 0;
292 
293  if (st_timeout < 0) {
295  crm_debug("Using calculated value %ld for "
297  st_timeout, value);
298  }
299 
300  if (st_timeout == 0) {
301  crm_debug("Watchdog may be enabled but "
302  PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " is disabled (%s)",
303  value? value : "default");
304 
305  } else if (pcmk__locate_sbd() == 0) {
307  " configured (%s) but SBD not active",
308  pcmk__s(value, "auto"));
310  return false;
311 
312  } else {
313  long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
314 
 // NOTE(review): the visible message text says "must be >", but this
 // comparison rejects only values strictly below sbd_timeout
315  if (st_timeout < sbd_timeout) {
317  " (%s) too short (must be >%ldms)",
318  value, sbd_timeout);
320  return false;
321  }
322  crm_info("Watchdog configured with " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
323  " %s and SBD timeout %ldms",
324  value, sbd_timeout);
325  }
326  return true;
327 }
#define pcmk__if_tracing(if_action, else_action)
_Noreturn crm_exit_t crm_exit(crm_exit_t rc)
Definition: results.c:936
void pcmk__panic(const char *origin)
Definition: watchdog.c:169
long long crm_get_msec(const char *input)
Parse a time+units string and return milliseconds equivalent.
Definition: strings.c:356
bool pcmk__get_sbd_sync_resource_startup(void)
Definition: watchdog.c:254
Panic the local host.
Definition: results.h:287
const char * pcmk_rc_str(int rc)
Get a user-friendly description of a return code.
Definition: results.c:501
const char * pcmk__env_option(const char *option)
Definition: options.c:1088
#define crm_warn(fmt, args...)
Definition: logging.h:394
#define crm_emerg(fmt, args...)
Definition: logging.h:387
#define crm_debug(fmt, args...)
Definition: logging.h:402
#define PCMK__ENV_PANIC_ACTION
#define crm_trace(fmt, args...)
Definition: logging.h:404
#define PCMK_RUN_DIR
Definition: config.h:544
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1
long pcmk__get_sbd_watchdog_timeout(void)
Definition: watchdog.c:243
bool pcmk__valid_stonith_watchdog_timeout(const char *value)
Definition: watchdog.c:285
pid_t pcmk__procfs_pid_of(const char *name)
Definition: procfs.c:111
#define CRM_XS
Definition: logging.h:56
int crm_str_to_boolean(const char *s, int *ret)
Definition: strings.c:496
#define PCMK__SBD_SYNC_DEFAULT
Definition: config.h:574
#define SBIN_DIR
Definition: config.h:583
#define crm_perror(level, fmt, args...)
Send a system error message to both the log and stderr.
Definition: logging.h:331
pid_t pcmk__locate_sbd(void)
Definition: watchdog.c:202
long pcmk__auto_stonith_watchdog_timeout(void)
Definition: watchdog.c:277
int pcmk__pidfile_matches(const char *filename, pid_t expected_pid, const char *expected_name, pid_t *pid)
Definition: pid.c:172
#define pcmk_ok
Definition: results.h:69
bool pcmk__starts_with(const char *str, const char *prefix)
Check whether a string starts with a certain sequence.
Definition: strings.c:556
#define crm_info(fmt, args...)
Definition: logging.h:399
Do not respawn.
Definition: results.h:286
#define PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
Definition: options.h:68