pacemaker  2.1.7-0f7f88312f
Scalable High-Availability cluster resource manager
watchdog.c
Go to the documentation of this file.
1 /*
2  * Copyright 2013-2023 the Pacemaker project contributors
3  *
4  * The version control history for this file may have further details.
5  *
6  * This source code is licensed under the GNU Lesser General Public License
7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8  */
9 
10 #include <crm_internal.h>
11 
12 #include <sched.h>
13 #include <sys/ioctl.h>
14 #include <sys/reboot.h>
15 
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <unistd.h>
19 #include <ctype.h>
20 #include <dirent.h>
21 #include <signal.h>
22 
/* PID of the sbd daemon as last determined by pcmk__locate_sbd().
 * Starts at 0; pcmk__locate_sbd() resets negative lookup results to 0, and
 * only values greater than 1 are treated as a usable cached PID.
 */
static pid_t sbd_pid = 0;
24 
/*
 * Write one command character to the kernel's SysRq trigger file.
 *
 * Callers in this file pass 'c' for the "crash" panic actions and 'b'
 * otherwise. This is best-effort: every failure is logged as a warning and
 * the function returns, so the caller can fall back to other means.
 *
 * t: SysRq command character to write
 *
 * When built without HAVE_LINUX_PROCFS this function does nothing.
 */
static void
sysrq_trigger(char t)
{
#if HAVE_LINUX_PROCFS
    FILE *procf = NULL;

    // Root can always write here, regardless of kernel.sysrq value
    procf = fopen("/proc/sysrq-trigger", "a");
    if (procf == NULL) {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
        return;
    }

    crm_info("sysrq-trigger: %c", t);
    if (fprintf(procf, "%c\n", t) < 0) {
        crm_perror(LOG_WARNING, "Writing to sysrq-trigger failed");
    }

    // stdio buffers the write, so a failure may only surface at close time
    if (fclose(procf) != 0) {
        crm_perror(LOG_WARNING, "Closing sysrq-trigger failed");
    }
#else
    (void) t;   // nothing to trigger without Linux procfs
#endif // HAVE_LINUX_PROCFS
}
43 
44 
49 static void
50 panic_local(void)
51 {
52  int rc = pcmk_ok;
53  uid_t uid = geteuid();
54  pid_t ppid = getppid();
55  const char *panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
56 
57  if(uid != 0 && ppid > 1) {
58  /* We're a non-root pacemaker daemon (pacemaker-based,
59  * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
60  * the original pacemakerd parent.
61  *
62  * Of these, only the controller is likely to be initiating resets.
63  */
64  crm_emerg("Signaling parent %lld to panic", (long long) ppid);
66  return;
67 
68  } else if (uid != 0) {
69 #if HAVE_LINUX_PROCFS
70  /*
71  * No permissions, and no pacemakerd parent to escalate to.
72  * Track down the new pacemakerd process and send a signal instead.
73  */
74  union sigval signal_value;
75 
76  memset(&signal_value, 0, sizeof(signal_value));
77  ppid = pcmk__procfs_pid_of("pacemakerd");
78  crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
79 
80  if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
81  crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
82  (long long) ppid);
83  }
84 #endif // HAVE_LINUX_PROCFS
85 
86  /* The best we can do now is die */
88  return;
89  }
90 
91  /* We're either pacemakerd, or a pacemaker daemon running as root */
92 
93  if (pcmk__str_eq(panic_action, "crash", pcmk__str_casei)) {
94  sysrq_trigger('c');
95 
96  } else if (pcmk__str_eq(panic_action, "sync-crash", pcmk__str_casei)) {
97  sync();
98  sysrq_trigger('c');
99 
100  } else {
101  if (pcmk__str_eq(panic_action, "sync-reboot", pcmk__str_casei)) {
102  sync();
103  }
104  sysrq_trigger('b');
105  }
106  /* reboot(RB_HALT_SYSTEM); rc = errno; */
107  reboot(RB_AUTOBOOT);
108  rc = errno;
109 
110  crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
111  (long long) ppid, pcmk_rc_str(rc), rc);
112 
113  if(ppid > 1) {
114  /* child daemon */
115  exit(CRM_EX_PANIC);
116  } else {
117  /* pacemakerd or orphan child */
118  exit(CRM_EX_FATAL);
119  }
120 }
121 
126 static void
127 panic_sbd(void)
128 {
129  union sigval signal_value;
130  pid_t ppid = getppid();
131 
132  crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
133 
134  memset(&signal_value, 0, sizeof(signal_value));
135  /* TODO: Arrange for a slightly less brutal option? */
136  if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
137  crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
138  (long long) sbd_pid);
139  panic_local();
140  }
141 
142  if(ppid > 1) {
143  /* child daemon */
144  exit(CRM_EX_PANIC);
145  } else {
146  /* pacemakerd or orphan child */
147  exit(CRM_EX_FATAL);
148  }
149 }
150 
160 void
161 pcmk__panic(const char *origin)
162 {
163  /* Ensure sbd_pid is set */
164  (void) pcmk__locate_sbd();
165 
167  {
168  // getppid() == 1 means our original parent no longer exists
169  crm_emerg("Shutting down instead of panicking the node "
170  CRM_XS " origin=%s sbd=%lld parent=%d",
171  origin, (long long) sbd_pid, getppid());
173  return;
174  },
175  {}
176  );
177 
178  if(sbd_pid > 1) {
179  crm_emerg("Signaling sbd[%lld] to panic the system: %s",
180  (long long) sbd_pid, origin);
181  panic_sbd();
182 
183  } else {
184  crm_emerg("Panicking the system directly: %s", origin);
185  panic_local();
186  }
187 }
188 
193 pid_t
195 {
196  char *pidfile = NULL;
197  char *sbd_path = NULL;
198  int rc;
199 
200  if(sbd_pid > 1) {
201  return sbd_pid;
202  }
203 
204  /* Look for the pid file */
205  pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
206  sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
207 
208  /* Read the pid file */
209  rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
210  if (rc == pcmk_rc_ok) {
211  crm_trace("SBD detected at pid %lld (via PID file %s)",
212  (long long) sbd_pid, pidfile);
213 
214 #if HAVE_LINUX_PROCFS
215  } else {
216  /* Fall back to /proc for systems that support it */
217  sbd_pid = pcmk__procfs_pid_of("sbd");
218  crm_trace("SBD detected at pid %lld (via procfs)",
219  (long long) sbd_pid);
220 #endif // HAVE_LINUX_PROCFS
221  }
222 
223  if(sbd_pid < 0) {
224  sbd_pid = 0;
225  crm_trace("SBD not detected");
226  }
227 
228  free(pidfile);
229  free(sbd_path);
230 
231  return sbd_pid;
232 }
233 
/*
 * Get the sbd watchdog timeout from the SBD_WATCHDOG_TIMEOUT environment
 * variable.
 *
 * The variable is parsed with crm_get_msec() on the first call only; later
 * calls return the cached result.
 *
 * Returns whatever crm_get_msec() produced for the variable's value (a
 * millisecond count on success).
 * NOTE(review): the result for an unset or unparsable variable depends on
 * crm_get_msec(), defined elsewhere; callers in this file treat values <= 0
 * as "no usable timeout".
 */
long
pcmk__get_sbd_timeout(void)
{
    static bool checked_sbd_timeout = false;
    static long sbd_timeout = 0;

    /* A separate flag (rather than a magic "unset" value) guarantees a single
     * parse no matter what crm_get_msec() returns.
     */
    if (!checked_sbd_timeout) {
        sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
        checked_sbd_timeout = true;
    }
    return sbd_timeout;
}
244 
245 bool
247 {
248  static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
249  static bool checked_sync_resource_startup = false;
250 
251  if (!checked_sync_resource_startup) {
252  const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
253 
254  if (sync_env == NULL) {
255  crm_trace("Defaulting to %sstart-up synchronization with sbd",
256  (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
257 
258  } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
259  crm_warn("Defaulting to %sstart-up synchronization with sbd "
260  "because environment value '%s' is invalid",
261  (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
262  }
263  checked_sync_resource_startup = true;
264  }
265  return sync_resource_startup != 0;
266 }
267 
/*
 * Calculate the automatic watchdog timeout: twice the sbd timeout reported by
 * pcmk__get_sbd_timeout().
 *
 * Returns the doubled timeout, or 0 if the sbd timeout is zero or negative.
 */
long
pcmk__auto_watchdog_timeout(void)
{
    long sbd_timeout = pcmk__get_sbd_timeout();

    return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
}
275 
276 bool
277 pcmk__valid_sbd_timeout(const char *value)
278 {
279  long st_timeout = value? crm_get_msec(value) : 0;
280 
281  if (st_timeout < 0) {
282  st_timeout = pcmk__auto_watchdog_timeout();
283  crm_debug("Using calculated value %ld for stonith-watchdog-timeout (%s)",
284  st_timeout, value);
285  }
286 
287  if (st_timeout == 0) {
288  crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
289  value? value : "default");
290 
291  } else if (pcmk__locate_sbd() == 0) {
292  crm_emerg("Shutting down: stonith-watchdog-timeout configured (%s) "
293  "but SBD not active", (value? value : "auto"));
295  return false;
296 
297  } else {
298  long sbd_timeout = pcmk__get_sbd_timeout();
299 
300  if (st_timeout < sbd_timeout) {
301  crm_emerg("Shutting down: stonith-watchdog-timeout (%s) too short "
302  "(must be >%ldms)", value, sbd_timeout);
304  return false;
305  }
306  crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
307  value, sbd_timeout);
308  }
309  return true;
310 }
#define pcmk__if_tracing(if_action, else_action)
_Noreturn crm_exit_t crm_exit(crm_exit_t rc)
Definition: results.c:936
void pcmk__panic(const char *origin)
Definition: watchdog.c:161
long long crm_get_msec(const char *input)
Parse a time+units string and return milliseconds equivalent.
Definition: strings.c:364
bool pcmk__get_sbd_sync_resource_startup(void)
Definition: watchdog.c:246
CRM_EX_PANIC
Panic the local host.
Definition: results.h:272
bool pcmk__valid_sbd_timeout(const char *value)
Definition: watchdog.c:277
long pcmk__get_sbd_timeout(void)
Definition: watchdog.c:235
const char * pcmk_rc_str(int rc)
Get a user-friendly description of a return code.
Definition: results.c:501
const char * pcmk__env_option(const char *option)
Definition: options.c:58
#define crm_warn(fmt, args...)
Definition: logging.h:382
#define crm_emerg(fmt, args...)
Definition: logging.h:379
#define crm_debug(fmt, args...)
Definition: logging.h:386
#define PCMK__ENV_PANIC_ACTION
#define crm_trace(fmt, args...)
Definition: logging.h:387
#define PCMK_RUN_DIR
Definition: config.h:544
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1, 2)
pid_t pcmk__procfs_pid_of(const char *name)
Definition: procfs.c:111
#define CRM_XS
Definition: logging.h:56
int crm_str_to_boolean(const char *s, int *ret)
Definition: strings.c:424
#define PCMK__SBD_SYNC_DEFAULT
Definition: config.h:571
#define SBIN_DIR
Definition: config.h:577
#define crm_perror(level, fmt, args...)
Send a system error message to both the log and stderr.
Definition: logging.h:323
pid_t pcmk__locate_sbd(void)
Definition: watchdog.c:194
int pcmk__pidfile_matches(const char *filename, pid_t expected_pid, const char *expected_name, pid_t *pid)
Definition: pid.c:172
long pcmk__auto_watchdog_timeout(void)
Definition: watchdog.c:269
#define pcmk_ok
Definition: results.h:68
#define crm_info(fmt, args...)
Definition: logging.h:384
CRM_EX_FATAL
Do not respawn.
Definition: results.h:271