/*
 * Copyright 2009-2024 the Pacemaker project contributors
 *
 * This source code is licensed under the GNU General Public License version 2
 * or later (GPLv2+) WITHOUT ANY WARRANTY.
 */
7
#include <stdbool.h>                // bool
#include <stdint.h>                 // uint32_t, uint64_t
#include <time.h>                   // time_t
#include <libxml/tree.h>            // xmlNode

#include <crm/common/mainloop.h>
#include <crm/cluster.h>
#include <crm/stonith-ng.h>
#include <crm/fencing/internal.h>
15
16 /*!
17 * \internal
18 * \brief Check whether target has already been fenced recently
19 *
20 * \param[in] tolerance Number of seconds to look back in time
21 * \param[in] target Name of node to search for
22 * \param[in] action Action we want to match
23 *
24 * \return TRUE if an equivalent fencing operation took place in the last
25 * \p tolerance seconds, FALSE otherwise
26 */
27 gboolean stonith_check_fence_tolerance(int tolerance, const char *target, const char *action);
28
29 typedef struct stonith_device_s {
30 char *id;
31 char *agent;
32 char *namespace;
33
34 /*! list of actions that must execute on the target node. Used for unfencing */
35 GString *on_target_actions;
36 GList *targets;
37 time_t targets_age;
38 gboolean has_attr_map;
39
40 // Whether target's nodeid should be passed as a parameter to the agent
41 gboolean include_nodeid;
42
43 /* whether the cluster should automatically unfence nodes with the device */
44 gboolean automatic_unfencing;
45 guint priority;
46
47 uint32_t flags; // Group of enum st_device_flags
48
49 GHashTable *params;
50 GHashTable *aliases;
51 GList *pending_ops;
52 mainloop_timer_t *timer;
53 crm_trigger_t *work;
54 xmlNode *agent_metadata;
55
56 /*! A verified device is one that has contacted the
57 * agent successfully to perform a monitor operation */
58 gboolean verified;
59
60 gboolean cib_registered;
61 gboolean api_registered;
62 gboolean dirty;
63 } stonith_device_t;
64
/* These values are used to index certain arrays by "phase". Usually an
 * operation has only one "phase", so phase is always zero. However, some
 * reboots are remapped to "off" then "on", in which case "reboot" will be
 * phase 0, "off" will be phase 1 and "on" will be phase 2.
 */
enum st_remap_phase {
    st_phase_requested = 0,
    st_phase_off       = 1,
    st_phase_on        = 2,
    st_phase_max       = 3  // Number of phases (usable as an array size)
};
76
77 typedef struct remote_fencing_op_s {
78 /* The unique id associated with this operation */
79 char *id;
80 /*! The node this operation will fence */
81 char *target;
82 /*! The fencing action to perform on the target. (reboot, on, off) */
83 char *action;
84
85 /*! When was the fencing action recorded (seconds since epoch) */
86 time_t created;
87
88 /*! Marks if the final notifications have been sent to local stonith clients. */
89 gboolean notify_sent;
90 /*! The number of query replies received */
91 guint replies;
92 /*! The number of query replies expected */
93 guint replies_expected;
94 /*! Does this node own control of this operation */
95 gboolean owner;
96 /*! After query is complete, This the high level timer that expires the entire operation */
97 guint op_timer_total;
98 /*! This timer expires the current fencing request. Many fencing
99 * requests may exist in a single operation */
100 guint op_timer_one;
101 /*! This timer expires the query request sent out to determine
102 * what nodes are contain what devices, and who those devices can fence */
103 guint query_timer;
104 /*! This is the default timeout to use for each fencing device if no
105 * custom timeout is received in the query. */
106 gint base_timeout;
107 /*! This is the calculated total timeout an operation can take before
108 * expiring. This is calculated by adding together all the timeout
109 * values associated with the devices this fencing operation may call */
110 gint total_timeout;
111
112 /*!
113 * Fencing delay (in seconds) requested by API client (used by controller to
114 * implement \c PCMK_OPT_PRIORITY_FENCING_DELAY). A value of -1 means
115 * disable all configured delays.
116 */
117 int client_delay;
118
119 /*! Delegate is the node being asked to perform a fencing action
120 * on behalf of the node that owns the remote operation. Some operations
121 * will involve multiple delegates. This value represents the final delegate
122 * that is used. */
123 char *delegate;
124 /*! The point at which the remote operation completed */
125 time_t completed;
126 //! Group of enum stonith_call_options associated with this operation
127 uint32_t call_options;
128
129 /*! The current state of the remote operation. This indicates
130 * what stage the op is in, query, exec, done, duplicate, failed. */
131 enum op_state state;
132 /*! The node that owns the remote operation */
133 char *originator;
134 /*! The local client id that initiated the fencing request */
135 char *client_id;
136 /*! The client's call_id that initiated the fencing request */
137 int client_callid;
138 /*! The name of client that initiated the fencing request */
139 char *client_name;
140 /*! List of the received query results for all the nodes in the cpg group */
141 GList *query_results;
142 /*! The original request that initiated the remote stonith operation */
143 xmlNode *request;
144
145 /*! The current topology level being executed */
146 guint level;
147 /*! The current operation phase being executed */
148 enum st_remap_phase phase;
149
150 /*! Devices with automatic unfencing (always run if "on" requested, never if remapped) */
151 GList *automatic_list;
152 /*! List of all devices at the currently executing topology level */
153 GList *devices_list;
154 /*! Current entry in the topology device list */
155 GList *devices;
156
157 /*! List of duplicate operations attached to this operation. Once this operation
158 * completes, the duplicate operations will be closed out as well. */
159 GList *duplicates;
160
161 /*! The point at which the remote operation completed(nsec) */
162 long long completed_nsec;
163
164 /*! The (potentially intermediate) result of the operation */
165 pcmk__action_result_t result;
166 } remote_fencing_op_t;
167
168 void fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged);
169
// Fencer-specific client flags (each non-zero value is a distinct bit)
enum st_client_flags {
    st_callback_unknown               = UINT64_C(0),
    st_callback_notify_fence          = (UINT64_C(1) << 0),
    st_callback_device_add            = (UINT64_C(1) << 2),
    st_callback_device_del            = (UINT64_C(1) << 4),
    st_callback_notify_history        = (UINT64_C(1) << 5),
    st_callback_notify_history_synced = (UINT64_C(1) << 6)
};
179
// How the user specified the target of a topology level
enum fenced_target_by {
    fenced_target_by_unknown = -1,  // Invalid or not yet parsed
    fenced_target_by_name,          // By target name
    fenced_target_by_pattern,       // By a pattern matching target names
    fenced_target_by_attribute,     // By a node attribute/value on target
};
187
188 /*
189 * Complex fencing requirements are specified via fencing topologies.
190 * A topology consists of levels; each level is a list of fencing devices.
191 * Topologies are stored in a hash table by node name. When a node needs to be
192 * fenced, if it has an entry in the topology table, the levels are tried
193 * sequentially, and the devices in each level are tried sequentially.
194 * Fencing is considered successful as soon as any level succeeds;
195 * a level is considered successful if all its devices succeed.
196 * Essentially, all devices at a given level are "and-ed" and the
197 * levels are "or-ed".
198 *
199 * This structure is used for the topology table entries.
200 * Topology levels start from 1, so levels[0] is unused and always NULL.
201 */
202 typedef struct stonith_topology_s {
203 enum fenced_target_by kind; // How target was specified
204
205 /*! Node name regex or attribute name=value for which topology applies */
206 char *target;
207 char *target_value;
208 char *target_pattern;
209 char *target_attribute;
210
211 /*! Names of fencing devices at each topology level */
212 GList *levels[ST__LEVEL_COUNT];
213
214 } stonith_topology_t;
215
216 void stonith_shutdown(int nsig);
217
218 void init_device_list(void);
219 void free_device_list(void);
220 void init_topology_list(void);
221 void free_topology_list(void);
222 void free_stonith_remote_op_list(void);
223 void init_stonith_remote_op_hash_table(GHashTable **table);
224 void free_metadata_cache(void);
225 void fenced_unregister_handlers(void);
226
227 uint64_t get_stonith_flag(const char *name);
228
229 void stonith_command(pcmk__client_t *client, uint32_t id, uint32_t flags,
230 xmlNode *op_request, const char *remote_peer);
231
232 int stonith_device_register(xmlNode *msg, gboolean from_cib);
233
234 void stonith_device_remove(const char *id, bool from_cib);
235
236 char *stonith_level_key(const xmlNode *msg, enum fenced_target_by);
237 void fenced_register_level(xmlNode *msg, char **desc,
238 pcmk__action_result_t *result);
239 void fenced_unregister_level(xmlNode *msg, char **desc,
240 pcmk__action_result_t *result);
241
242 stonith_topology_t *find_topology_for_host(const char *host);
243
244 void do_local_reply(const xmlNode *notify_src, pcmk__client_t *client,
245 int call_options);
246
247 xmlNode *fenced_construct_reply(const xmlNode *request, xmlNode *data,
248 const pcmk__action_result_t *result);
249
250 void
251 do_stonith_async_timeout_update(const char *client, const char *call_id, int timeout);
252
253 void fenced_send_notification(const char *type,
254 const pcmk__action_result_t *result,
255 xmlNode *data);
256 void fenced_send_config_notification(const char *op,
257 const pcmk__action_result_t *result,
258 const char *desc);
259
260 remote_fencing_op_t *initiate_remote_stonith_op(const pcmk__client_t *client,
261 xmlNode *request,
262 gboolean manual_ack);
263
264 void fenced_process_fencing_reply(xmlNode *msg);
265
266 int process_remote_stonith_query(xmlNode * msg);
267
268 void *create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer);
269
270 void stonith_fence_history(xmlNode *msg, xmlNode **output,
271 const char *remote_peer, int options);
272
273 void stonith_fence_history_trim(void);
274
275 bool fencing_peer_active(pcmk__node_status_t *peer);
276
277 void set_fencing_completed(remote_fencing_op_t * op);
278
279 int fenced_handle_manual_confirmation(const pcmk__client_t *client,
280 xmlNode *msg);
281
282 const char *fenced_device_reboot_action(const char *device_id);
283 bool fenced_device_supports_on(const char *device_id);
284
285 gboolean node_has_attr(const char *node, const char *name, const char *value);
286
287 gboolean node_does_watchdog_fencing(const char *node);
288
289 void fencing_topology_init(void);
290 void setup_cib(void);
291 void fenced_cib_cleanup(void);
292
293 int fenced_scheduler_init(void);
294 void fenced_set_local_node(const char *node_name);
295 const char *fenced_get_local_node(void);
296 void fenced_scheduler_cleanup(void);
297 void fenced_scheduler_run(xmlNode *cib);
298
299 static inline void
300 fenced_set_protocol_error(pcmk__action_result_t *result)
/* ![[previous]](../icons/n_left.png)
![[next]](../icons/right.png)
![[first]](../icons/n_first.png)
![[last]](../icons/last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
301 {
302 pcmk__set_result(result, CRM_EX_PROTOCOL, PCMK_EXEC_INVALID,
303 "Fencer API request missing required information (bug?)");
304 }
305
306 /*!
307 * \internal
308 * \brief Get the device flag to use with a given action when searching devices
309 *
310 * \param[in] action Action to check
311 *
312 * \return st_device_supports_on if \p action is "on", otherwise
313 * st_device_supports_none
314 */
315 static inline uint32_t
316 fenced_support_flag(const char *action)
/* ![[previous]](../icons/left.png)
![[next]](../icons/n_right.png)
![[first]](../icons/first.png)
![[last]](../icons/n_last.png)
![[top]](../icons/top.png)
![[bottom]](../icons/bottom.png)
![[index]](../icons/index.png)
*/
317 {
318 if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none)) {
319 return st_device_supports_on;
320 }
321 return st_device_supports_none;
322 }
323
324 extern GHashTable *device_list;
325 extern GHashTable *topology;
326 extern long long stonith_watchdog_timeout_ms;
327 extern GList *stonith_watchdog_targets;
328 extern GHashTable *stonith_remote_op_list;
329 extern crm_exit_t exit_code;
330 extern gboolean stonith_shutdown_flag;