pacemaker  2.0.4-2deceaa
Scalable High-Availability cluster resource manager
 All Data Structures Files Functions Variables Typedefs Enumerations Enumerator Macros Groups Pages
watchdog.c
Go to the documentation of this file.
1 /*
2  * Copyright 2013-2020 the Pacemaker project contributors
3  *
4  * The version control history for this file may have further details.
5  *
6  * This source code is licensed under the GNU Lesser General Public License
7  * version 2.1 or later (LGPLv2.1+) WITHOUT ANY WARRANTY.
8  */
9 
10 #include <crm_internal.h>
11 
12 #include <sched.h>
13 #include <sys/ioctl.h>
14 #include <sys/reboot.h>
15 
16 #include <sys/types.h>
17 #include <sys/stat.h>
18 #include <unistd.h>
19 #include <ctype.h>
20 #include <dirent.h>
21 #include <signal.h>
22 
23 #ifdef _POSIX_MEMLOCK
24 # include <sys/mman.h>
25 #endif
26 
/* Cached PID of the sbd daemon, filled in by pcmk_locate_sbd().
 * 0 means sbd has not been located (pcmk_locate_sbd() also resets negative
 * lookup results to 0).
 */
static pid_t sbd_pid = 0;
28 
30 {
35 };
36 
/*!
 * \internal
 * \brief Write one command character to the kernel's SysRq trigger file
 *
 * This is a no-op unless the build sets SUPPORT_PROCFS to a nonzero value.
 * Any failure to open, write or close the file is logged as a warning and
 * otherwise ignored, so callers can fall through to their next escalation
 * step.
 *
 * \param[in] t  SysRq command character (callers in this file pass 'c' when
 *               a crash was requested and 'b' otherwise)
 */
static void
sysrq_trigger(char t)
{
#if SUPPORT_PROCFS
    FILE *procf;

    // Root can always write here, regardless of kernel.sysrq value
    procf = fopen("/proc/sysrq-trigger", "a");
    if (procf == NULL) {
        crm_perror(LOG_WARNING, "Opening sysrq-trigger failed");
        return;
    }
    crm_info("sysrq-trigger: %c", t);

    if (fprintf(procf, "%c\n", t) < 0) {
        crm_perror(LOG_WARNING, "Writing sysrq-trigger failed");
    }

    /* The stream is buffered, so a rejected write may only surface when
     * fclose() flushes it
     */
    if (fclose(procf) != 0) {
        crm_perror(LOG_WARNING, "Closing sysrq-trigger failed");
    }
#endif // SUPPORT_PROCFS
}
55 
56 
57 static void
58 pcmk_panic_local(void)
59 {
60  int rc = pcmk_ok;
61  uid_t uid = geteuid();
62  pid_t ppid = getppid();
63 
64  if(uid != 0 && ppid > 1) {
65  /* We're a non-root pacemaker daemon (pacemaker-based,
66  * pacemaker-controld, pacemaker-schedulerd, pacemaker-attrd, etc.) with
67  * the original pacemakerd parent.
68  *
69  * Of these, only the controller is likely to be initiating resets.
70  */
71  crm_emerg("Signaling parent %lld to panic", (long long) ppid);
73  return;
74 
75  } else if (uid != 0) {
76 #if SUPPORT_PROCFS
77  /*
78  * No permissions, and no pacemakerd parent to escalate to.
79  * Track down the new pacemakerd process and send a signal instead.
80  */
81  union sigval signal_value;
82 
83  memset(&signal_value, 0, sizeof(signal_value));
84  ppid = pcmk__procfs_pid_of("pacemakerd");
85  crm_emerg("Signaling pacemakerd[%lld] to panic", (long long) ppid);
86 
87  if(ppid > 1 && sigqueue(ppid, SIGQUIT, signal_value) < 0) {
88  crm_perror(LOG_EMERG, "Cannot signal pacemakerd[%lld] to panic",
89  (long long) ppid);
90  }
91 #endif // SUPPORT_PROCFS
92 
93  /* The best we can do now is die */
95  return;
96  }
97 
98  /* We're either pacemakerd, or a pacemaker daemon running as root */
99 
100  if (safe_str_eq("crash", getenv("PCMK_panic_action"))) {
101  sysrq_trigger('c');
102  } else {
103  sysrq_trigger('b');
104  }
105  /* reboot(RB_HALT_SYSTEM); rc = errno; */
106  reboot(RB_AUTOBOOT);
107  rc = errno;
108 
109  crm_emerg("Reboot failed, escalating to parent %lld: %s " CRM_XS " rc=%d",
110  (long long) ppid, pcmk_rc_str(rc), rc);
111 
112  if(ppid > 1) {
113  /* child daemon */
114  exit(CRM_EX_PANIC);
115  } else {
116  /* pacemakerd or orphan child */
117  exit(CRM_EX_FATAL);
118  }
119 }
120 
121 static void
122 pcmk_panic_sbd(void)
123 {
124  union sigval signal_value;
125  pid_t ppid = getppid();
126 
127  crm_emerg("Signaling sbd[%lld] to panic", (long long) sbd_pid);
128 
129  memset(&signal_value, 0, sizeof(signal_value));
130  /* TODO: Arrange for a slightly less brutal option? */
131  if(sigqueue(sbd_pid, SIGKILL, signal_value) < 0) {
132  crm_perror(LOG_EMERG, "Cannot signal sbd[%lld] to terminate",
133  (long long) sbd_pid);
134  pcmk_panic_local();
135  }
136 
137  if(ppid > 1) {
138  /* child daemon */
139  exit(CRM_EX_PANIC);
140  } else {
141  /* pacemakerd or orphan child */
142  exit(CRM_EX_FATAL);
143  }
144 }
145 
146 void
147 pcmk_panic(const char *origin)
148 {
149  static struct qb_log_callsite *panic_cs = NULL;
150 
151  if (panic_cs == NULL) {
152  panic_cs = qb_log_callsite_get(__func__, __FILE__, "panic-delay", LOG_TRACE, __LINE__, crm_trace_nonlog);
153  }
154 
155  /* Ensure sbd_pid is set */
156  (void)pcmk_locate_sbd();
157 
158  if (panic_cs && panic_cs->targets) {
159  /* getppid() == 1 means our original parent no longer exists */
160  crm_emerg("Shutting down instead of panicking the node "
161  CRM_XS " origin=%s sbd=%lld parent=%d",
162  origin, (long long) sbd_pid, getppid());
164  return;
165  }
166 
167  if(sbd_pid > 1) {
168  crm_emerg("Signaling sbd[%lld] to panic the system: %s",
169  (long long) sbd_pid, origin);
170  pcmk_panic_sbd();
171 
172  } else {
173  crm_emerg("Panicking the system directly: %s", origin);
174  pcmk_panic_local();
175  }
176 }
177 
178 pid_t
180 {
181  char *pidfile = NULL;
182  char *sbd_path = NULL;
183  int rc;
184 
185  if(sbd_pid > 1) {
186  return sbd_pid;
187  }
188 
189  /* Look for the pid file */
190  pidfile = crm_strdup_printf(PCMK_RUN_DIR "/sbd.pid");
191  sbd_path = crm_strdup_printf("%s/sbd", SBIN_DIR);
192 
193  /* Read the pid file */
194  rc = pcmk__pidfile_matches(pidfile, 0, sbd_path, &sbd_pid);
195  if (rc == pcmk_rc_ok) {
196  crm_trace("SBD detected at pid %lld (via PID file %s)",
197  (long long) sbd_pid, pidfile);
198 
199 #if SUPPORT_PROCFS
200  } else {
201  /* Fall back to /proc for systems that support it */
202  sbd_pid = pcmk__procfs_pid_of("sbd");
203  crm_trace("SBD detected at pid %lld (via procfs)",
204  (long long) sbd_pid);
205 #endif // SUPPORT_PROCFS
206  }
207 
208  if(sbd_pid < 0) {
209  sbd_pid = 0;
210  crm_trace("SBD not detected");
211  }
212 
213  free(pidfile);
214  free(sbd_path);
215 
216  return sbd_pid;
217 }
218 
/*!
 * \internal
 * \brief Get the sbd watchdog timeout from the environment
 *
 * The SBD_WATCHDOG_TIMEOUT environment variable is parsed with
 * crm_get_msec() on the first call and the result is reused afterwards.
 *
 * \return Whatever crm_get_msec() returned for SBD_WATCHDOG_TIMEOUT
 */
long
pcmk__get_sbd_timeout(void)
{
    /* -2 marks "not read yet"; any other value is the cached result */
    static long sbd_timeout = -2;

    if (sbd_timeout == -2) {
        sbd_timeout = crm_get_msec(getenv("SBD_WATCHDOG_TIMEOUT"));
    }
    return sbd_timeout;
}
229 
/*!
 * \internal
 * \brief Calculate a stonith-watchdog-timeout from the sbd timeout
 *
 * \return Twice the value of pcmk__get_sbd_timeout(), or 0 if that value is
 *         not positive
 */
long
pcmk__auto_watchdog_timeout(void)
{
    long sbd_timeout = pcmk__get_sbd_timeout();

    return (sbd_timeout <= 0)? 0 : (2 * sbd_timeout);
}
237 
238 bool
239 pcmk__valid_sbd_timeout(const char *value)
240 {
241  long st_timeout = value? crm_get_msec(value) : 0;
242 
243  if (st_timeout < 0) {
244  st_timeout = pcmk__auto_watchdog_timeout();
245  crm_debug("Using calculated value %ld for stonith-watchdog-timeout (%s)",
246  st_timeout, value);
247  }
248 
249  if (st_timeout == 0) {
250  crm_debug("Watchdog may be enabled but stonith-watchdog-timeout is disabled (%s)",
251  value? value : "default");
252 
253  } else if (pcmk_locate_sbd() == 0) {
254  crm_emerg("Shutting down: stonith-watchdog-timeout configured (%s) "
255  "but SBD not active", (value? value : "auto"));
257  return false;
258 
259  } else {
260  long sbd_timeout = pcmk__get_sbd_timeout();
261 
262  if (st_timeout < sbd_timeout) {
263  crm_emerg("Shutting down: stonith-watchdog-timeout (%s) too short "
264  "(must be >%ldms)", value, sbd_timeout);
266  return false;
267  }
268  crm_info("Watchdog configured with stonith-watchdog-timeout %s and SBD timeout %ldms",
269  value, sbd_timeout);
270  }
271  return true;
272 }
#define LOG_TRACE
Definition: logging.h:36
_Noreturn crm_exit_t crm_exit(crm_exit_t rc)
Definition: results.c:751
long long crm_get_msec(const char *input)
Parse a time+units string and return milliseconds equivalent.
Definition: strings.c:211
bool pcmk__valid_sbd_timeout(const char *value)
Definition: watchdog.c:239
unsigned int crm_trace_nonlog
Definition: logging.c:39
const char * pcmk_rc_str(int rc)
Get a user-friendly description of a return code.
Definition: results.c:413
pcmk_panic_flags
Definition: watchdog.c:29
#define crm_emerg(fmt, args...)
Definition: logging.h:361
int rc
Definition: pcmk_fence.c:34
#define crm_debug(fmt, args...)
Definition: logging.h:368
long pcmk__auto_watchdog_timeout(void)
Definition: watchdog.c:231
#define crm_trace(fmt, args...)
Definition: logging.h:369
#define PCMK_RUN_DIR
Definition: config.h:536
void pcmk_panic(const char *origin)
Definition: watchdog.c:147
pid_t pcmk__procfs_pid_of(const char *name)
Definition: procfs.c:111
#define CRM_XS
Definition: logging.h:54
#define SBIN_DIR
Definition: config.h:548
#define crm_perror(level, fmt, args...)
Send a system error message to both the log and stderr.
Definition: logging.h:314
int pcmk__pidfile_matches(const char *filename, pid_t expected_pid, const char *expected_name, pid_t *pid)
Definition: pid.c:172
#define pcmk_ok
Definition: results.h:67
pid_t pcmk_locate_sbd(void)
Definition: watchdog.c:179
#define safe_str_eq(a, b)
Definition: util.h:65
char * crm_strdup_printf(char const *format,...) __attribute__((__format__(__printf__
#define crm_info(fmt, args...)
Definition: logging.h:366
long pcmk__get_sbd_timeout(void)
Definition: watchdog.c:220