pacemaker  2.1.0-7c3f660
Scalable High-Availability cluster resource manager
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
watchdog.c
Go to the documentation of this file.
1 /*
2  * Copyright 2013-2020 the Pacemaker project contributors
3  *
4  * The version control history for this file may have further details.
5  *
6  * This source code is licensed under the GNU Lesser General Public License
7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8  */
9 
10 #include <crm_internal.h>
11 
12 #include <sched.h>
13 #include <sys/ioctl.h>
14 #include <sys/reboot.h>
15 
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <unistd.h>
19 #include <ctype.h>
20 #include <dirent.h>
21 #include <signal.h>
22 
23 #ifdef _POSIX_MEMLOCK
24 # include <sys/mman.h>
25 #endif
26 
/* PID of the sbd daemon as last determined by pcmk__locate_sbd();
 * 0 means sbd has not been detected (or has not been looked for yet).
 */
27 static pid_t sbd_pid = 0;
28 
/*!
 * \internal
 * \brief Write one command character to the kernel's sysrq trigger file
 *
 * This is a no-op when the build does not have SUPPORT_PROCFS enabled.
 * A failure to open the trigger file is logged as a warning and otherwise
 * ignored.
 *
 * \param[in] t  Command character to write to /proc/sysrq-trigger
 */
static void
sysrq_trigger(char t)
{
#if SUPPORT_PROCFS
    // Root can always write here, regardless of kernel.sysrq value
    FILE *trigger = fopen("/proc/sysrq-trigger", "a");

    if (trigger != NULL) {
        crm_info("sysrq-trigger: %c", t);
        fprintf(trigger, "%c\n", t);
        fclose(trigger);

    } else {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
    }
#endif // SUPPORT_PROCFS
}
47 
48 
53 static void
54 panic_local(void)
55 {
56  int rc = pcmk_ok;
57  uid_t uid = geteuid();
58  pid_t ppid = getppid();
59 
60  if(uid != 0 && ppid > 1) {
61  /* We're a non-root pacemaker daemon (pacemaker-based,
62  * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
63  * the original pacemakerd parent.
64  *
65  * Of these, only the controller is likely to be initiating resets.
66  */
67  crm_emerg("Signaling parent %lld to panic", (long long) ppid);
69  return;
70 
71  } else if (uid != 0) {
72 #if SUPPORT_PROCFS
73  /*
74  * No permissions, and no pacemakerd parent to escalate to.
75  * Track down the new pacemakerd process and send a signal instead.
76  */
77  union sigval signal_value;
78 
79  memset(&signal_value, 0, sizeof(signal_value));
80  ppid = pcmk__procfs_pid_of("pacemakerd");
81  crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
82 
83  if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
84  crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
85  (long long) ppid);
86  }
87 #endif // SUPPORT_PROCFS
88 
89  /* The best we can do now is die */
91  return;
92  }
93 
94  /* We're either pacemakerd, or a pacemaker daemon running as root */
95 
96  if (pcmk__str_eq("crash", getenv("PCMK_panic_action"), pcmk__str_casei)) {
97  sysrq_trigger('c');
98  } else if (pcmk__str_eq("sync-crash", getenv("PCMK_panic_action"), pcmk__str_casei)) {
99  sync();
100  sysrq_trigger('c');
101  } else {
102  if (pcmk__str_eq("sync-reboot", getenv("PCMK_panic_action"), pcmk__str_casei)) {
103  sync();
104  }
105  sysrq_trigger('b');
106  }
107  /* reboot(RB_HALT_SYSTEM); rc = errno; */
108  reboot(RB_AUTOBOOT);
109  rc = errno;
110 
111  crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
112  (long long) ppid, pcmk_rc_str(rc), rc);
113 
114  if(ppid > 1) {
115  /* child daemon */
116  exit(CRM_EX_PANIC);
117  } else {
118  /* pacemakerd or orphan child */
119  exit(CRM_EX_FATAL);
120  }
121 }
122 
127 static void
128 panic_sbd(void)
129 {
130  union sigval signal_value;
131  pid_t ppid = getppid();
132 
133  crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
134 
135  memset(&signal_value, 0, sizeof(signal_value));
136  /* TODO: Arrange for a slightly less brutal option? */
137  if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
138  crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
139  (long long) sbd_pid);
140  panic_local();
141  }
142 
143  if(ppid > 1) {
144  /* child daemon */
145  exit(CRM_EX_PANIC);
146  } else {
147  /* pacemakerd or orphan child */
148  exit(CRM_EX_FATAL);
149  }
150 }
151 
161 void
162 pcmk__panic(const char *origin)
163 {
164  static struct qb_log_callsite *panic_cs = NULL;
165 
166  if (panic_cs == NULL) {
167  panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay",
168  LOG_TRACE, __LINE__, crm_trace_nonlog);
169  }
170 
171  /* Ensure sbd_pid is set */
172  (void) pcmk__locate_sbd();
173 
174  if (panic_cs && panic_cs->targets) {
175  /* getppid() == 1 means our original parent no longer exists */
176  crm_emerg("Shutting down instead of panicking the node "
177  CRM_XS " origin=%s sbd=%lld parent=%d",
178  origin, (long long) sbd_pid, getppid());
180  return;
181  }
182 
183  if(sbd_pid > 1) {
184  crm_emerg("Signaling sbd[%lld] to panic the system: %s",
185  (long long) sbd_pid, origin);
186  panic_sbd();
187 
188  } else {
189  crm_emerg("Panicking the system directly: %s", origin);
190  panic_local();
191  }
192 }
193 
198 pid_t
200 {
201  char *pidfile = NULL;
202  char *sbd_path = NULL;
203  int rc;
204 
205  if(sbd_pid > 1) {
206  return sbd_pid;
207  }
208 
209  /* Look for the pid file */
210  pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
211  sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
212 
213  /* Read the pid file */
214  rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
215  if (rc == pcmk_rc_ok) {
216  crm_trace("SBD detected at pid %lld (via PID file %s)",
217  (long long) sbd_pid, pidfile);
218 
219 #if SUPPORT_PROCFS
220  } else {
221  /* Fall back to /proc for systems that support it */
222  sbd_pid = pcmk__procfs_pid_of("sbd");
223  crm_trace("SBD detected at pid %lld (via procfs)",
224  (long long) sbd_pid);
225 #endif // SUPPORT_PROCFS
226  }
227 
228  if(sbd_pid < 0) {
229  sbd_pid = 0;
230  crm_trace("SBD not detected");
231  }
232 
233  free(pidfile);
234  free(sbd_path);
235 
236  return sbd_pid;
237 }
238 
/*!
 * \internal
 * \brief Get the sbd watchdog timeout from the environment
 *
 * The SBD_WATCHDOG_TIMEOUT environment variable is parsed with
 * crm_get_msec() on the first call and the result is cached in a static
 * variable for later calls.
 *
 * \return Parsed value of SBD_WATCHDOG_TIMEOUT, as returned by crm_get_msec()
 */
long
pcmk__get_sbd_timeout(void)
{
    // -2 marks "not parsed yet"
    static long sbd_timeout = -2;

    if (sbd_timeout == -2) {
        sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
    }
    return sbd_timeout;
}
249 
250 bool
252 {
253  static int sync_resource_startup = PCMK__SBD_SYNC_DEFAULT;
254  static bool checked_sync_resource_startup = false;
255 
256  if (!checked_sync_resource_startup) {
257  const char *sync_env = getenv("SBD_SYNC_RESOURCE_STARTUP");
258 
259  if (sync_env == NULL) {
260  crm_trace("Defaulting to %sstart-up synchronization with sbd",
261  (PCMK__SBD_SYNC_DEFAULT? "" : "no "));
262 
263  } else if (crm_str_to_boolean(sync_env, &sync_resource_startup) < 0) {
264  crm_warn("Defaulting to %sstart-up synchronization with sbd "
265  "because environment value '%s' is invalid",
266  (PCMK__SBD_SYNC_DEFAULT? "" : "no "), sync_env);
267  }
268  checked_sync_resource_startup = true;
269  }
270  return sync_resource_startup != 0;
271 }
272 
/*!
 * \internal
 * \brief Calculate a stonith-watchdog-timeout from the sbd watchdog timeout
 *
 * \return Twice the value of pcmk__get_sbd_timeout(), or 0 if that value is
 *         not positive
 */
long
pcmk__auto_watchdog_timeout(void)
{
    long sbd_timeout = pcmk__get_sbd_timeout();

    return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
}
280 
281 bool
282 pcmk__valid_sbd_timeout(const char *value)
283 {
284  long st_timeout = value? crm_get_msec(value) : 0;
285 
286  if (st_timeout < 0) {
287  st_timeout = pcmk__auto_watchdog_timeout();
288  crm_debug("Using calculated value %ld for stonith-watchdog-timeout (%s)",
289  st_timeout, value);
290  }
291 
292  if (st_timeout == 0) {
293  crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
294  value? value : "default");
295 
296  } else if (pcmk__locate_sbd() == 0) {
297  crm_emerg("Shutting down: stonith-watchdog-timeout configured (%s) "
298  "but SBD not active", (value? value : "auto"));
300  return false;
301 
302  } else {
303  long sbd_timeout = pcmk__get_sbd_timeout();
304 
305  if (st_timeout < sbd_timeout) {
306  crm_emerg("Shutting down: stonith-watchdog-timeout (%s) too short "
307  "(must be >%ldms)", value, sbd_timeout);
309  return false;
310  }
311  crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
312  value, sbd_timeout);
313  }
314  return true;
315 }
#define LOG_TRACE
Definition: logging.h:36
_Noreturn crm_exit_t crm_exit(crm_exit_t rc)
Definition: results.c:759
long long crm_get_msec(const char *input)
Parse a time+units string and return milliseconds equivalent.
Definition: strings.c:363
bool pcmk__valid_sbd_timeout(const char *value)
Definition: watchdog.c:282
unsigned int crm_trace_nonlog
Definition: logging.c:46
const char * pcmk_rc_str(int rc)
Get a user-friendly description of a return code.
Definition: results.c:420
#define crm_warn(fmt, args...)
Definition: logging.h:351
#define crm_emerg(fmt, args...)
Definition: logging.h:348
int rc
Definition: pcmk_fence.c:35
#define crm_debug(fmt, args...)
Definition: logging.h:355
long pcmk__auto_watchdog_timeout(void)
Definition: watchdog.c:274
#define crm_trace(fmt, args...)
Definition: logging.h:356
#define PCMK_RUN_DIR
Definition: config.h:505
char * crm_strdup_printf(char const *format,...) G_GNUC_PRINTF(1
pid_t pcmk__locate_sbd(void)
Definition: watchdog.c:199
pid_t pcmk__procfs_pid_of(const char *name)
Definition: procfs.c:111
#define CRM_XS
Definition: logging.h:54
int crm_str_to_boolean(const char *s, int *ret)
Definition: strings.c:426
#define PCMK__SBD_SYNC_DEFAULT
Definition: config.h:520
#define SBIN_DIR
Definition: config.h:526
bool pcmk__get_sbd_sync_resource_startup(void)
Definition: watchdog.c:251
#define crm_perror(level, fmt, args...)
Send a system error message to both the log and stderr.
Definition: logging.h:301
int pcmk__pidfile_matches(const char *filename, pid_t expected_pid, const char *expected_name, pid_t *pid)
Definition: pid.c:177
#define pcmk_ok
Definition: results.h:67
void pcmk__panic(const char *origin)
Definition: watchdog.c:162
#define crm_info(fmt, args...)
Definition: logging.h:353
long pcmk__get_sbd_timeout(void)
Definition: watchdog.c:240