/* pacemaker 3.0.1-16e74fc4da: Scalable High-Availability cluster resource manager
 * Source listing of watchdog.c
 */
/*
 * Copyright 2013-2024 the Pacemaker project contributors
 *
 * The version control history for this file may have further details.
 *
 * This source code is licensed under the GNU Lesser General Public License
 * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
 */

10#include <crm_internal.h>
11
12#include <sched.h>
13#include <sys/ioctl.h>
14#include <sys/reboot.h>
15
16#include <sys/types.h>
17#include <sys/stat.h>
18#include <unistd.h>
19#include <ctype.h>
20#include <dirent.h>
21#include <signal.h>
22
23static pid_t sbd_pid = 0;
24
31static void
32panic_local_nonroot(pid_t ppid)
33{
34 if (ppid > 1) { // pacemakerd is still our parent
35 crm_emerg("Escalating panic to " PCMK__SERVER_PACEMAKERD "[%lld]",
36 (long long) ppid);
37 } else { // Signal (non-parent) pacemakerd if possible
39 if (ppid > 0) {
40 union sigval signal_value;
41
42 crm_emerg("Signaling " PCMK__SERVER_PACEMAKERD "[%lld] to panic",
43 (long long) ppid);
44 memset(&signal_value, 0, sizeof(signal_value));
45 if (sigqueue(ppid, SIGQUIT, signal_value) < 0) {
46 crm_emerg("Exiting after signal failure: %s", strerror(errno));
47 }
48 } else {
49 crm_emerg("Exiting with no known " PCMK__SERVER_PACEMAKERD
50 "process");
51 }
52 }
54}
55
60static void
61panic_local(void)
62{
63 const char *full_panic_action = pcmk__env_option(PCMK__ENV_PANIC_ACTION);
64 const char *panic_action = full_panic_action;
65 int reboot_cmd = RB_AUTOBOOT; // Default panic action is reboot
66
67 if (geteuid() != 0) { // Non-root caller such as the controller
68 panic_local_nonroot(getppid());
69 return;
70 }
71
72 if (pcmk__starts_with(full_panic_action, "sync-")) {
73 panic_action += sizeof("sync-") - 1;
74 sync();
75 }
76
77 if (pcmk__str_empty(full_panic_action)
78 || pcmk__str_eq(panic_action, PCMK_VALUE_REBOOT, pcmk__str_none)) {
80
81 } else if (pcmk__str_eq(panic_action, PCMK_VALUE_CRASH, pcmk__str_none)) {
83
84 } else if (pcmk__str_eq(panic_action, PCMK_VALUE_OFF, pcmk__str_none)) {
86#ifdef RB_POWER_OFF
87 reboot_cmd = RB_POWER_OFF;
88#elif defined(RB_POWEROFF)
89 reboot_cmd = RB_POWEROFF;
90#endif
91 } else {
92 crm_warn("Using default '" PCMK_VALUE_REBOOT "' for local option PCMK_"
93 PCMK__ENV_PANIC_ACTION " because '%s' is not a valid value",
94 full_panic_action);
96 }
97
98 // sysrq failed or is not supported on this platform, so fall back to reboot
99 reboot(reboot_cmd);
100
101 // Even reboot failed, nothing left to do but exit
102 crm_emerg("Exiting after reboot failed: %s", strerror(errno));
103 if (getppid() > 1) { // pacemakerd is parent process
105 } else { // This is pacemakerd, or an orphaned subdaemon
107 }
108}
109
114static void
115panic_sbd(void)
116{
117 union sigval signal_value;
118 pid_t ppid = getppid();
119
120 memset(&signal_value, 0, sizeof(signal_value));
121 /* TODO: Arrange for a slightly less brutal option? */
122 if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
123 crm_emerg("Panicking directly because couldn't signal sbd");
124 panic_local();
125 }
126
127 if(ppid > 1) {
128 /* child daemon */
130 } else {
131 /* pacemakerd or orphan child */
133 }
134}
135
/*!
 * \internal
 * \brief Panic the local host, via sbd if it is running, else directly
 *
 * \param[in] reason  Why the host is being panicked (used for logging only;
 *                    a NULL value is logged as a placeholder)
 */
void
pcmk__panic(const char *reason)
{
    // pcmk__locate_sbd() returns (and caches in sbd_pid) the sbd PID, or 0
    const pid_t pid = pcmk__locate_sbd();

    if (reason == NULL) {
        // Passing NULL for %s is undefined behavior in printf-style formatting
        reason = "no reason given";
    }

    if (pid > 1) {
        crm_emerg("Signaling sbd[%lld] to panic the system: %s",
                  (long long) pid, reason);
        panic_sbd();

    } else {
        crm_emerg("Panicking the system directly: %s", reason);
        panic_local();
    }
}

163pid_t
165{
166 const char *pidfile = PCMK__RUN_DIR "/sbd.pid";
167 int rc;
168
169 if(sbd_pid > 1) {
170 return sbd_pid;
171 }
172
173 /* Read the pid file */
174 rc = pcmk__pidfile_matches(pidfile, 0, SBIN_DIR "/sbd", &sbd_pid);
175 if (rc == pcmk_rc_ok) {
176 crm_trace("SBD detected at pid %lld (via PID file %s)",
177 (long long) sbd_pid, pidfile);
178 } else {
179 /* Fall back to /proc for systems that support it */
180 sbd_pid = pcmk__procfs_pid_of("sbd");
181
182 if (sbd_pid != 0) {
183 crm_trace("SBD detected at pid %lld (via procfs)",
184 (long long) sbd_pid);
185 }
186 }
187
188 if(sbd_pid < 0) {
189 sbd_pid = 0;
190 crm_trace("SBD not detected");
191 }
192
193 return sbd_pid;
194}
195
/*!
 * \internal
 * \brief Get the sbd watchdog timeout from the environment
 *
 * Parses the SBD_WATCHDOG_TIMEOUT environment variable with crm_get_msec() on
 * first use and caches the result for later calls.
 *
 * \return Parsed timeout in milliseconds (whatever crm_get_msec() yields for
 *         the variable, including for an unset or unparsable value)
 */
long
pcmk__get_sbd_watchdog_timeout(void)
{
    // -2 marks "not yet looked up"
    static long sbd_timeout = -2;

    if (sbd_timeout == -2) {
        sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
    }
    return sbd_timeout;
}

207bool
209{
210 static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
211 static bool checked_sync_resource_startup = false;
212
213 if (!checked_sync_resource_startup) {
214 const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
215
216 if (sync_env == NULL) {
217 crm_trace("Defaulting to %sstart-up synchronization with sbd",
218 (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
219
220 } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
221 crm_warn("Defaulting to %sstart-up synchronization with sbd "
222 "because environment value '%s' is invalid",
223 (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
224 }
225 checked_sync_resource_startup = true;
226 }
227 return sync_resource_startup != 0;
228}
229
/*!
 * \internal
 * \brief Calculate a stonith watchdog timeout from the sbd watchdog timeout
 *
 * \return Twice the sbd watchdog timeout in milliseconds, or 0 if that
 *         timeout is zero or negative
 */
long
pcmk__auto_stonith_watchdog_timeout(void)
{
    long sbd_timeout = pcmk__get_sbd_watchdog_timeout();

    return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
}

238bool
240{
241 /* @COMPAT At a compatibility break, accept either negative values or a
242 * specific string like "auto" (but not both) to mean "auto-calculate the
243 * timeout." Reject other values that aren't parsable as timeouts.
244 */
245 long st_timeout = value? crm_get_msec(value) : 0;
246
247 if (st_timeout < 0) {
249 crm_debug("Using calculated value %ld for "
251 st_timeout, value);
252 }
253
254 if (st_timeout == 0) {
255 crm_debug("Watchdog may be enabled but "
256 PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " is disabled (%s)",
257 value? value : "default");
258
259 } else if (pcmk__locate_sbd() == 0) {
261 " configured (%s) but SBD not active",
262 pcmk__s(value, "auto"));
264 return false;
265
266 } else {
267 long sbd_timeout = pcmk__get_sbd_watchdog_timeout();
268
269 if (st_timeout < sbd_timeout) {
271 " (%s) too short (must be >%ldms)",
272 value, sbd_timeout);
274 return false;
275 }
276 crm_info("Watchdog configured with " PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
277 " %s and SBD timeout %ldms",
278 value, sbd_timeout);
279 }
280 return true;
281}
void pcmk__sysrq_trigger(char t)
Definition procfs.c:282
int pcmk__pidfile_matches(const char *filename, pid_t expected_pid, const char *expected_name, pid_t *pid)
Definition pid.c:168
pid_t pcmk__procfs_pid_of(const char *name)
Definition procfs.c:127
#define SBIN_DIR
Definition config.h:526
#define PCMK__SBD_SYNC_DEFAULT
Definition config.h:517
#define PCMK__RUN_DIR
Definition config.h:514
#define crm_info(fmt, args...)
Definition logging.h:365
#define crm_warn(fmt, args...)
Definition logging.h:360
#define crm_debug(fmt, args...)
Definition logging.h:368
#define crm_trace(fmt, args...)
Definition logging.h:370
#define crm_emerg(fmt, args...)
Definition logging.h:353
#define PCMK_VALUE_OFF
Definition options.h:185
#define PCMK_VALUE_REBOOT
Definition options.h:200
#define PCMK_VALUE_CRASH
Definition options.h:140
#define PCMK_OPT_STONITH_WATCHDOG_TIMEOUT
Definition options.h:69
#define PCMK__ENV_PANIC_ACTION
const char * pcmk__env_option(const char *option)
Definition options.c:1085
@ CRM_EX_PANIC
Panic the local host.
Definition results.h:265
@ CRM_EX_FATAL
Do not respawn.
Definition results.h:264
_Noreturn crm_exit_t crm_exit(crm_exit_t rc)
Definition results.c:1058
@ pcmk_rc_ok
Definition results.h:159
#define PCMK__SERVER_PACEMAKERD
long long crm_get_msec(const char *input)
Parse a time+units string and return milliseconds equivalent.
Definition strings.c:351
int crm_str_to_boolean(const char *s, int *ret)
Definition strings.c:498
bool pcmk__starts_with(const char *str, const char *prefix)
Check whether a string starts with a certain sequence.
Definition strings.c:558
@ pcmk__str_none
void pcmk__panic(const char *reason)
Definition watchdog.c:146
pid_t pcmk__locate_sbd(void)
Definition watchdog.c:164
long pcmk__get_sbd_watchdog_timeout(void)
Definition watchdog.c:197
bool pcmk__valid_stonith_watchdog_timeout(const char *value)
Definition watchdog.c:239
long pcmk__auto_stonith_watchdog_timeout(void)
Definition watchdog.c:231
bool pcmk__get_sbd_sync_resource_startup(void)
Definition watchdog.c:208