This source file includes the following definitions:
- sort_strings
- free_remote_query
- free_stonith_remote_op_list
- count_peer_device
- count_peer_devices
- find_peer_device
- grab_peer_device
- clear_remote_op_timers
- free_remote_op
- init_stonith_remote_op_hash_table
- op_requested_action
- op_phase_off
- op_phase_on
- undo_op_remap
- fencing_result2xml
- fenced_broadcast_op_result
- handle_local_reply_and_notify
- finalize_op_duplicates
- delegate_from_xml
- finalize_op
- remote_op_watchdog_done
- remote_op_timeout_one
- finalize_timed_out_op
- remote_op_timeout
- remote_op_query_timeout
- topology_is_empty
- add_required_device
- remove_required_device
- set_op_device_list
- topology_matches
- find_topology_for_host
- advance_topology_level
- merge_duplicates
- fencing_active_peers
- fenced_handle_manual_confirmation
- create_remote_stonith_op
- initiate_remote_stonith_op
- is_watchdog_fencing
- find_best_peer
- stonith_choose_peer
- valid_fencing_timeout
- get_device_timeout
- add_device_timeout
- get_peer_timeout
- get_op_total_timeout
- report_timeout_period
- advance_topology_device_in_level
- check_watchdog_fencing_and_wait
- request_peer_fencing
- sort_peers
- all_topology_devices_found
- parse_action_specific
- add_device_properties
- add_result
- process_remote_stonith_query
- fenced_process_fencing_reply
- stonith_check_fence_tolerance
1
2
3
4
5
6
7
8
9
10 #include <crm_internal.h>
11
12 #include <sys/param.h>
13 #include <stdio.h>
14 #include <sys/types.h>
15 #include <sys/wait.h>
16 #include <sys/stat.h>
17 #include <unistd.h>
18 #include <sys/utsname.h>
19
20 #include <stdlib.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <ctype.h>
24 #include <regex.h>
25
26 #include <libxml/tree.h>
27
28 #include <crm/crm.h>
29 #include <crm/common/ipc.h>
30 #include <crm/common/ipc_internal.h>
31 #include <crm/cluster/internal.h>
32
33 #include <crm/stonith-ng.h>
34 #include <crm/fencing/internal.h>
35 #include <crm/common/xml.h>
36 #include <crm/common/xml_internal.h>
37
38 #include <crm/common/util.h>
39 #include <pacemaker-fenced.h>
40
/* Scaling factor applied to an estimated total operation timeout; in this
 * file it multiplies the result of get_op_total_timeout() when a duplicate
 * operation is merged (see merge_duplicates()).
 */
#define TIMEOUT_MULTIPLY_FACTOR 1.2
42
43
44
45
46
47
48
/* Per-device information stored in a peer's "devices" table
 * (see peer_device_info_t). The arrays are indexed by operation phase.
 */
typedef struct device_properties_s {
    /* Whether the device counts as "verified"; consulted when callers ask
     * for verified devices only (count_peer_device(), grab_peer_device()) */
    gboolean verified;

    /* Set to TRUE for a phase once grab_peer_device() has claimed the device
     * in that phase; such devices are skipped by later lookups */
    gboolean executed[st_phase_max];

    /* Devices flagged here for a phase are rejected by find_peer_device() */
    gboolean disallowed[st_phase_max];

    /* Per-phase timeout value (usage is outside this part of the file;
     * presumably an action-specific timeout -- TODO confirm) */
    int custom_action_timeout[st_phase_max];

    /* Per-phase delay values (usage is outside this part of the file;
     * presumably maximum and base fencing delays -- TODO confirm) */
    int delay_max[st_phase_max];

    int delay_base[st_phase_max];

    /* Bit flags tested with pcmk_is_set() against a required support flag */
    uint32_t device_support_flags;
} device_properties_t;
68
/* One peer's entry in an operation's query_results list
 * (freed by free_remote_query()).
 */
typedef struct {
    /* Heap-allocated peer name; released with free() */
    char *host;

    /* NOTE(review): not used in the visible part of the file; presumably
     * marks peers that have already been asked to fence -- confirm */
    gboolean tried;

    /* NOTE(review): not used in the visible part of the file; presumably the
     * number of devices the peer reported -- confirm */
    int ndevices;

    /* Table looked up by device ID, yielding device_properties_t entries */
    GHashTable *devices;
} peer_device_info_t;
79
/* Table of remote fencing operations. It is created on demand by
 * init_stonith_remote_op_hash_table() with free_remote_op() as the value
 * destructor, and create_remote_stonith_op() inserts operations keyed by
 * their ID.
 */
GHashTable *stonith_remote_op_list = NULL;

/* Defined outside this file */
extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
                                  int call_options);

/* Forward declarations for functions used before their definitions */
static void request_peer_fencing(remote_fencing_op_t *op,
                                 peer_device_info_t *peer);
static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup);
static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
static int get_op_total_timeout(const remote_fencing_op_t *op,
                                const peer_device_info_t *chosen_peer);
91
92 static gint
93 sort_strings(gconstpointer a, gconstpointer b)
94 {
95 return strcmp(a, b);
96 }
97
98 static void
99 free_remote_query(gpointer data)
100 {
101 if (data != NULL) {
102 peer_device_info_t *peer = data;
103
104 g_hash_table_destroy(peer->devices);
105 free(peer->host);
106 free(peer);
107 }
108 }
109
110 void
111 free_stonith_remote_op_list(void)
112 {
113 if (stonith_remote_op_list != NULL) {
114 g_hash_table_destroy(stonith_remote_op_list);
115 stonith_remote_op_list = NULL;
116 }
117 }
118
/* User data handed to count_peer_device() while iterating a peer's devices */
struct peer_count_data {
    const remote_fencing_op_t *op;  /* Operation whose phase is examined */
    gboolean verified_only;         /* If set, count only verified devices */
    uint32_t support_action_only;   /* Required support flag, or fenced_df_none for no restriction */
    int count;                      /* Running total of matching devices */
};
125
126
127
128
129
130
131
132
133
134 static void
135 count_peer_device(gpointer key, gpointer value, gpointer user_data)
136 {
137 device_properties_t *props = (device_properties_t*)value;
138 struct peer_count_data *data = user_data;
139
140 if (!props->executed[data->op->phase]
141 && (!data->verified_only || props->verified)
142 && ((data->support_action_only == fenced_df_none)
143 || pcmk_is_set(props->device_support_flags,
144 data->support_action_only))) {
145 ++(data->count);
146 }
147 }
148
149
150
151
152
153
154
155
156
157
158
159
160 static int
161 count_peer_devices(const remote_fencing_op_t *op,
162 const peer_device_info_t *peer, gboolean verified_only, uint32_t support_on_action_only)
163 {
164 struct peer_count_data data;
165
166 data.op = op;
167 data.verified_only = verified_only;
168 data.support_action_only = support_on_action_only;
169 data.count = 0;
170 if (peer) {
171 g_hash_table_foreach(peer->devices, count_peer_device, &data);
172 }
173 return data.count;
174 }
175
176
177
178
179
180
181
182
183
184
185
186 static device_properties_t *
187 find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer,
188 const char *device, uint32_t support_action_only)
189 {
190 device_properties_t *props = g_hash_table_lookup(peer->devices, device);
191
192 if (props == NULL) {
193 return NULL;
194 }
195 if ((support_action_only != fenced_df_none)
196 && !pcmk_is_set(props->device_support_flags, support_action_only)) {
197 return NULL;
198 }
199 if (props->executed[op->phase] || props->disallowed[op->phase]) {
200 return NULL;
201 }
202 return props;
203 }
204
205
206
207
208
209
210
211
212
213
214
215
216 static gboolean
217 grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer,
218 const char *device, gboolean verified_devices_only)
219 {
220 device_properties_t *props = find_peer_device(op, peer, device,
221 fenced_support_flag(op->action));
222
223 if ((props == NULL) || (verified_devices_only && !props->verified)) {
224 return FALSE;
225 }
226
227 crm_trace("Removing %s from %s (%d remaining)",
228 device, peer->host,
229 count_peer_devices(op, peer, FALSE, fenced_df_none));
230 props->executed[op->phase] = TRUE;
231 return TRUE;
232 }
233
234 static void
235 clear_remote_op_timers(remote_fencing_op_t * op)
236 {
237 if (op->query_timer) {
238 g_source_remove(op->query_timer);
239 op->query_timer = 0;
240 }
241 if (op->op_timer_total) {
242 g_source_remove(op->op_timer_total);
243 op->op_timer_total = 0;
244 }
245 if (op->op_timer_one) {
246 g_source_remove(op->op_timer_one);
247 op->op_timer_one = 0;
248 }
249 }
250
251 static void
252 free_remote_op(gpointer data)
253 {
254 remote_fencing_op_t *op = data;
255
256 crm_log_xml_debug(op->request, "Destroying");
257
258 clear_remote_op_timers(op);
259
260 free(op->id);
261 free(op->action);
262 free(op->delegate);
263 free(op->target);
264 free(op->client_id);
265 free(op->client_name);
266 free(op->originator);
267
268 if (op->query_results) {
269 g_list_free_full(op->query_results, free_remote_query);
270 }
271 if (op->request) {
272 pcmk__xml_free(op->request);
273 op->request = NULL;
274 }
275 if (op->devices_list) {
276 g_list_free_full(op->devices_list, free);
277 op->devices_list = NULL;
278 }
279 g_list_free_full(op->automatic_list, free);
280 g_list_free(op->duplicates);
281
282 pcmk__reset_result(&op->result);
283 free(op);
284 }
285
286 void
287 init_stonith_remote_op_hash_table(GHashTable **table)
288 {
289 if (*table == NULL) {
290 *table = pcmk__strkey_table(NULL, free_remote_op);
291 }
292 }
293
294
295
296
297
298
299
300
301
302 static const char *
303 op_requested_action(const remote_fencing_op_t *op)
304 {
305 return ((op->phase > st_phase_requested)? PCMK_ACTION_REBOOT : op->action);
306 }
307
308
309
310
311
312
313
314 static void
315 op_phase_off(remote_fencing_op_t *op)
316 {
317 crm_info("Remapping multiple-device reboot targeting %s to 'off' "
318 QB_XS " id=%.8s", op->target, op->id);
319 op->phase = st_phase_off;
320
321
322
323
324 strcpy(op->action, PCMK_ACTION_OFF);
325 }
326
327
328
329
330
331
332
333 static void
334 op_phase_on(remote_fencing_op_t *op)
335 {
336 GList *iter = NULL;
337
338 crm_info("Remapped 'off' targeting %s complete, "
339 "remapping to 'on' for %s " QB_XS " id=%.8s",
340 op->target, op->client_name, op->id);
341 op->phase = st_phase_on;
342 strcpy(op->action, PCMK_ACTION_ON);
343
344
345
346
347 for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
348 GList *match = g_list_find_custom(op->devices_list, iter->data,
349 sort_strings);
350
351 if (match) {
352 op->devices_list = g_list_remove(op->devices_list, match->data);
353 }
354 }
355 g_list_free_full(op->automatic_list, free);
356 op->automatic_list = NULL;
357
358
359 op->devices = op->devices_list;
360 }
361
362
363
364
365
366
367
368 static void
369 undo_op_remap(remote_fencing_op_t *op)
370 {
371 if (op->phase > 0) {
372 crm_info("Undoing remap of reboot targeting %s for %s "
373 QB_XS " id=%.8s", op->target, op->client_name, op->id);
374 op->phase = st_phase_requested;
375 strcpy(op->action, PCMK_ACTION_REBOOT);
376 }
377 }
378
379
380
381
382
383
384
385
386
387
388
389 static xmlNode *
390 fencing_result2xml(xmlNode *parent, const remote_fencing_op_t *op)
391 {
392 xmlNode *notify_data = pcmk__xe_create(parent, PCMK__XE_ST_NOTIFY_FENCE);
393
394 crm_xml_add_int(notify_data, PCMK_XA_STATE, op->state);
395 crm_xml_add(notify_data, PCMK__XA_ST_TARGET, op->target);
396 crm_xml_add(notify_data, PCMK__XA_ST_DEVICE_ACTION, op->action);
397 crm_xml_add(notify_data, PCMK__XA_ST_DELEGATE, op->delegate);
398 crm_xml_add(notify_data, PCMK__XA_ST_REMOTE_OP, op->id);
399 crm_xml_add(notify_data, PCMK__XA_ST_ORIGIN, op->originator);
400 crm_xml_add(notify_data, PCMK__XA_ST_CLIENTID, op->client_id);
401 crm_xml_add(notify_data, PCMK__XA_ST_CLIENTNAME, op->client_name);
402
403 return notify_data;
404 }
405
406
407
408
409
410
411
412
413 void
414 fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged)
415 {
416 static int count = 0;
417 xmlNode *bcast = pcmk__xe_create(NULL, PCMK__XE_ST_REPLY);
418 xmlNode *wrapper = NULL;
419 xmlNode *notify_data = NULL;
420
421 count++;
422 crm_trace("Broadcasting result to peers");
423 crm_xml_add(bcast, PCMK__XA_T, PCMK__VALUE_ST_NOTIFY);
424 crm_xml_add(bcast, PCMK__XA_SUBT, PCMK__VALUE_BROADCAST);
425 crm_xml_add(bcast, PCMK__XA_ST_OP, STONITH_OP_NOTIFY);
426 crm_xml_add_int(bcast, PCMK_XA_COUNT, count);
427
428 if (op_merged) {
429 pcmk__xe_set_bool_attr(bcast, PCMK__XA_ST_OP_MERGED, true);
430 }
431
432 wrapper = pcmk__xe_create(bcast, PCMK__XE_ST_CALLDATA);
433 notify_data = fencing_result2xml(wrapper, op);
434 stonith__xe_set_result(notify_data, &op->result);
435
436 pcmk__cluster_send_message(NULL, pcmk_ipc_fenced, bcast);
437 pcmk__xml_free(bcast);
438
439 return;
440 }
441
442
443
444
445
446
447
448
449 static void
450 handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data)
451 {
452 xmlNode *notify_data = NULL;
453 xmlNode *reply = NULL;
454 pcmk__client_t *client = NULL;
455
456 if (op->notify_sent == TRUE) {
457
458 return;
459 }
460
461
462 crm_xml_add_int(data, PCMK_XA_STATE, op->state);
463 crm_xml_add(data, PCMK__XA_ST_TARGET, op->target);
464 crm_xml_add(data, PCMK__XA_ST_OP, op->action);
465
466 reply = fenced_construct_reply(op->request, data, &op->result);
467 crm_xml_add(reply, PCMK__XA_ST_DELEGATE, op->delegate);
468
469
470 client = pcmk__find_client_by_id(op->client_id);
471 if (client == NULL) {
472 crm_trace("Skipping reply to %s: no longer a client", op->client_id);
473 } else {
474 do_local_reply(reply, client, op->call_options);
475 }
476
477
478 notify_data = fencing_result2xml(NULL, op);
479 fenced_send_notification(PCMK__VALUE_ST_NOTIFY_FENCE, &op->result,
480 notify_data);
481 pcmk__xml_free(notify_data);
482 fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL);
483
484
485 op->notify_sent = TRUE;
486 pcmk__xml_free(reply);
487 }
488
489
490
491
492
493
494
495
496 static void
497 finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data)
498 {
499 for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) {
500 remote_fencing_op_t *other = iter->data;
501
502 if (other->state == st_duplicate) {
503 other->state = op->state;
504 crm_debug("Performing duplicate notification for %s@%s: %s "
505 QB_XS " id=%.8s",
506 other->client_name, other->originator,
507 pcmk_exec_status_str(op->result.execution_status),
508 other->id);
509 pcmk__copy_result(&op->result, &other->result);
510 finalize_op(other, data, true);
511
512 } else {
513
514 crm_err("Skipping duplicate notification for %s@%s "
515 QB_XS " state=%s id=%.8s",
516 other->client_name, other->originator,
517 stonith__op_state_text(other->state), other->id);
518 }
519 }
520 }
521
522 static char *
523 delegate_from_xml(xmlNode *xml)
524 {
525 xmlNode *match = pcmk__xpath_find_one(xml->doc,
526 "//*[@" PCMK__XA_ST_DELEGATE "]",
527 LOG_NEVER);
528
529 if (match == NULL) {
530 return crm_element_value_copy(xml, PCMK__XA_SRC);
531 } else {
532 return crm_element_value_copy(match, PCMK__XA_ST_DELEGATE);
533 }
534 }
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552 static void
553 finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup)
554 {
555 int level = LOG_ERR;
556 const char *subt = NULL;
557 xmlNode *local_data = NULL;
558 gboolean op_merged = FALSE;
559
560 CRM_CHECK((op != NULL), return);
561
562
563 clear_remote_op_timers(op);
564
565 if (op->notify_sent) {
566
567 crm_notice("Operation '%s'%s%s by %s for %s@%s%s: "
568 "Result arrived too late " QB_XS " id=%.8s",
569 op->action, (op->target? " targeting " : ""),
570 (op->target? op->target : ""),
571 (op->delegate? op->delegate : "unknown node"),
572 op->client_name, op->originator,
573 (op_merged? " (merged)" : ""),
574 op->id);
575 return;
576 }
577
578 set_fencing_completed(op);
579 undo_op_remap(op);
580
581 if (data == NULL) {
582 data = pcmk__xe_create(NULL, "remote-op");
583 local_data = data;
584
585 } else if (op->delegate == NULL) {
586 switch (op->result.execution_status) {
587 case PCMK_EXEC_NO_FENCE_DEVICE:
588 break;
589
590 case PCMK_EXEC_INVALID:
591 if (op->result.exit_status != CRM_EX_EXPIRED) {
592 op->delegate = delegate_from_xml(data);
593 }
594 break;
595
596 default:
597 op->delegate = delegate_from_xml(data);
598 break;
599 }
600 }
601
602 if (dup || (crm_element_value(data, PCMK__XA_ST_OP_MERGED) != NULL)) {
603 op_merged = true;
604 }
605
606
607
608
609 subt = crm_element_value(data, PCMK__XA_SUBT);
610 if (!dup && !pcmk__str_eq(subt, PCMK__VALUE_BROADCAST, pcmk__str_none)) {
611
612 fenced_broadcast_op_result(op, op_merged);
613 pcmk__xml_free(local_data);
614 return;
615 }
616
617 if (pcmk__result_ok(&op->result) || dup
618 || !pcmk__str_eq(op->originator, fenced_get_local_node(),
619 pcmk__str_casei)) {
620 level = LOG_NOTICE;
621 }
622 do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) "
623 QB_XS " id=%.8s", op->action, (op->target? " targeting " : ""),
624 (op->target? op->target : ""),
625 (op->delegate? op->delegate : "unknown node"),
626 op->client_name, op->originator,
627 (op_merged? " (merged)" : ""),
628 crm_exit_str(op->result.exit_status),
629 pcmk_exec_status_str(op->result.execution_status),
630 ((op->result.exit_reason == NULL)? "" : ": "),
631 ((op->result.exit_reason == NULL)? "" : op->result.exit_reason),
632 op->id);
633
634 handle_local_reply_and_notify(op, data);
635
636 if (!dup) {
637 finalize_op_duplicates(op, data);
638 }
639
640
641
642
643 if (op->query_results) {
644 g_list_free_full(op->query_results, free_remote_query);
645 op->query_results = NULL;
646 }
647 if (op->request) {
648 pcmk__xml_free(op->request);
649 op->request = NULL;
650 }
651
652 pcmk__xml_free(local_data);
653 }
654
655
656
657
658
659
660
661
662
663 static gboolean
664 remote_op_watchdog_done(gpointer userdata)
665 {
666 remote_fencing_op_t *op = userdata;
667
668 op->op_timer_one = 0;
669
670 crm_notice("Self-fencing (%s) by %s for %s assumed complete "
671 QB_XS " id=%.8s",
672 op->action, op->target, op->client_name, op->id);
673 op->state = st_done;
674 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
675 finalize_op(op, NULL, false);
676 return G_SOURCE_REMOVE;
677 }
678
679 static gboolean
680 remote_op_timeout_one(gpointer userdata)
681 {
682 remote_fencing_op_t *op = userdata;
683
684 op->op_timer_one = 0;
685
686 crm_notice("Peer's '%s' action targeting %s for client %s timed out " QB_XS
687 " id=%.8s", op->action, op->target, op->client_name, op->id);
688 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
689 "Peer did not return fence result within timeout");
690
691
692 if (op->client_delay > 0) {
693 op->client_delay = 0;
694 crm_trace("Try another device for '%s' action targeting %s "
695 "for client %s without delay " QB_XS " id=%.8s",
696 op->action, op->target, op->client_name, op->id);
697 }
698
699
700 request_peer_fencing(op, NULL);
701 return G_SOURCE_REMOVE;
702 }
703
704
705
706
707
708
709
710
711 static void
712 finalize_timed_out_op(remote_fencing_op_t *op, const char *reason)
713 {
714 crm_debug("Action '%s' targeting %s for client %s timed out "
715 QB_XS " id=%.8s",
716 op->action, op->target, op->client_name, op->id);
717
718 if (op->phase == st_phase_on) {
719
720
721
722
723 op->state = st_done;
724 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
725 } else {
726 op->state = st_failed;
727 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason);
728 }
729 finalize_op(op, NULL, false);
730 }
731
732
733
734
735
736
737
738
739
740 static gboolean
741 remote_op_timeout(gpointer userdata)
742 {
743 remote_fencing_op_t *op = userdata;
744
745 op->op_timer_total = 0;
746
747 if (op->state == st_done) {
748 crm_debug("Action '%s' targeting %s for client %s already completed "
749 QB_XS " id=%.8s",
750 op->action, op->target, op->client_name, op->id);
751 } else {
752 finalize_timed_out_op(userdata, "Fencing did not complete within a "
753 "total timeout based on the "
754 "configured timeout and retries for "
755 "any devices attempted");
756 }
757 return G_SOURCE_REMOVE;
758 }
759
760 static gboolean
761 remote_op_query_timeout(gpointer data)
762 {
763 remote_fencing_op_t *op = data;
764
765 op->query_timer = 0;
766
767 if (op->state == st_done) {
768 crm_debug("Operation %.8s targeting %s already completed",
769 op->id, op->target);
770 } else if (op->state == st_exec) {
771 crm_debug("Operation %.8s targeting %s already in progress",
772 op->id, op->target);
773 } else if (op->query_results) {
774
775 crm_debug("Query %.8s targeting %s complete (state=%s)",
776 op->id, op->target, stonith__op_state_text(op->state));
777 request_peer_fencing(op, NULL);
778 } else {
779 crm_debug("Query %.8s targeting %s timed out (state=%s)",
780 op->id, op->target, stonith__op_state_text(op->state));
781 finalize_timed_out_op(op, "No capable peers replied to device query "
782 "within timeout");
783 }
784
785 return G_SOURCE_REMOVE;
786 }
787
788 static gboolean
789 topology_is_empty(stonith_topology_t *tp)
790 {
791 int i;
792
793 if (tp == NULL) {
794 return TRUE;
795 }
796
797 for (i = 0; i < ST__LEVEL_COUNT; i++) {
798 if (tp->levels[i] != NULL) {
799 return FALSE;
800 }
801 }
802 return TRUE;
803 }
804
805
806
807
808
809
810
811
812 static void
813 add_required_device(remote_fencing_op_t *op, const char *device)
814 {
815 GList *match = g_list_find_custom(op->automatic_list, device,
816 sort_strings);
817
818 if (!match) {
819 op->automatic_list = g_list_prepend(op->automatic_list,
820 pcmk__str_copy(device));
821 }
822 }
823
824
825
826
827
828
829
830
831 static void
832 remove_required_device(remote_fencing_op_t *op, const char *device)
833 {
834 GList *match = g_list_find_custom(op->automatic_list, device,
835 sort_strings);
836
837 if (match) {
838 op->automatic_list = g_list_remove(op->automatic_list, match->data);
839 }
840 }
841
842
843 static void
844 set_op_device_list(remote_fencing_op_t * op, GList *devices)
845 {
846 GList *lpc = NULL;
847
848 if (op->devices_list) {
849 g_list_free_full(op->devices_list, free);
850 op->devices_list = NULL;
851 }
852 for (lpc = devices; lpc != NULL; lpc = lpc->next) {
853 const char *device = lpc->data;
854
855 op->devices_list = g_list_append(op->devices_list,
856 pcmk__str_copy(device));
857 }
858 op->devices = op->devices_list;
859 }
860
861
862
863
864
865
866
867
868
869
870 static gboolean
871 topology_matches(const stonith_topology_t *tp, const char *node)
872 {
873 regex_t r_patt;
874
875 CRM_CHECK(node && tp && tp->target, return FALSE);
876 switch (tp->kind) {
877 case fenced_target_by_attribute:
878
879
880
881
882
883
884 if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
885 crm_notice("Matched %s with %s by attribute", node, tp->target);
886 return TRUE;
887 }
888 break;
889
890 case fenced_target_by_pattern:
891
892
893
894 if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
895 crm_info("Bad regex '%s' for fencing level", tp->target);
896 } else {
897 int status = regexec(&r_patt, node, 0, NULL, 0);
898
899 regfree(&r_patt);
900 if (status == 0) {
901 crm_notice("Matched %s with %s by name", node, tp->target);
902 return TRUE;
903 }
904 }
905 break;
906
907 case fenced_target_by_name:
908 crm_trace("Testing %s against %s", node, tp->target);
909 return pcmk__str_eq(tp->target, node, pcmk__str_casei);
910
911 default:
912 break;
913 }
914 crm_trace("No match for %s with %s", node, tp->target);
915 return FALSE;
916 }
917
918 stonith_topology_t *
919 find_topology_for_host(const char *host)
920 {
921 GHashTableIter tIter;
922 stonith_topology_t *tp = g_hash_table_lookup(topology, host);
923
924 if(tp != NULL) {
925 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
926 return tp;
927 }
928
929 g_hash_table_iter_init(&tIter, topology);
930 while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
931 if (topology_matches(tp, host)) {
932 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
933 return tp;
934 }
935 }
936
937 crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
938 return NULL;
939 }
940
941
942
943
944
945
946
947
948
949
950
951
952 static int
953 advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
954 {
955 stonith_topology_t *tp = NULL;
956
957 if (op->target) {
958 tp = find_topology_for_host(op->target);
959 }
960 if (topology_is_empty(tp)) {
961 return empty_ok? pcmk_rc_ok : ENODEV;
962 }
963
964 pcmk__assert(tp->levels != NULL);
965
966 stonith__set_call_options(op->call_options, op->id, st_opt_topology);
967
968
969 undo_op_remap(op);
970
971 do {
972 op->level++;
973
974 } while (op->level < ST__LEVEL_COUNT && tp->levels[op->level] == NULL);
975
976 if (op->level < ST__LEVEL_COUNT) {
977 crm_trace("Attempting fencing level %d targeting %s (%d devices) "
978 "for client %s@%s (id=%.8s)",
979 op->level, op->target, g_list_length(tp->levels[op->level]),
980 op->client_name, op->originator, op->id);
981 set_op_device_list(op, tp->levels[op->level]);
982
983
984 if ((op->level > 1) && (op->client_delay > 0)) {
985 op->client_delay = 0;
986 }
987
988 if ((g_list_next(op->devices_list) != NULL)
989 && pcmk__str_eq(op->action, PCMK_ACTION_REBOOT, pcmk__str_none)) {
990
991
992
993
994
995 op_phase_off(op);
996 }
997 return pcmk_rc_ok;
998 }
999
1000 crm_info("All %sfencing options targeting %s for client %s@%s failed "
1001 QB_XS " id=%.8s",
1002 (stonith_watchdog_timeout_ms > 0)?"non-watchdog ":"",
1003 op->target, op->client_name, op->originator, op->id);
1004 return ENODEV;
1005 }
1006
1007
1008
1009
1010
1011
1012
1013 static void
1014 merge_duplicates(remote_fencing_op_t *op)
1015 {
1016 GHashTableIter iter;
1017 remote_fencing_op_t *other = NULL;
1018
1019 time_t now = time(NULL);
1020
1021 g_hash_table_iter_init(&iter, stonith_remote_op_list);
1022 while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
1023 const char *other_action = op_requested_action(other);
1024 pcmk__node_status_t *node = NULL;
1025
1026 if (!strcmp(op->id, other->id)) {
1027 continue;
1028 }
1029 if (other->state > st_exec) {
1030 crm_trace("%.8s not duplicate of %.8s: not in progress",
1031 op->id, other->id);
1032 continue;
1033 }
1034 if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
1035 crm_trace("%.8s not duplicate of %.8s: node %s vs. %s",
1036 op->id, other->id, op->target, other->target);
1037 continue;
1038 }
1039 if (!pcmk__str_eq(op->action, other_action, pcmk__str_none)) {
1040 crm_trace("%.8s not duplicate of %.8s: action %s vs. %s",
1041 op->id, other->id, op->action, other_action);
1042 continue;
1043 }
1044 if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
1045 crm_trace("%.8s not duplicate of %.8s: same client %s",
1046 op->id, other->id, op->client_name);
1047 continue;
1048 }
1049 if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
1050 crm_trace("%.8s not duplicate of %.8s: self-fencing for %s",
1051 op->id, other->id, other->target);
1052 continue;
1053 }
1054
1055 node = pcmk__get_node(0, other->originator, NULL,
1056 pcmk__node_search_cluster_member);
1057
1058 if (!fencing_peer_active(node)) {
1059 crm_notice("Failing action '%s' targeting %s originating from "
1060 "client %s@%s: Originator is dead " QB_XS " id=%.8s",
1061 other->action, other->target, other->client_name,
1062 other->originator, other->id);
1063 crm_trace("%.8s not duplicate of %.8s: originator dead",
1064 op->id, other->id);
1065 other->state = st_failed;
1066 continue;
1067 }
1068 if ((other->total_timeout > 0)
1069 && (now > (other->total_timeout + other->created))) {
1070 crm_trace("%.8s not duplicate of %.8s: old (%lld vs. %lld + %ds)",
1071 op->id, other->id, (long long)now, (long long)other->created,
1072 other->total_timeout);
1073 continue;
1074 }
1075
1076
1077
1078
1079 other->duplicates = g_list_append(other->duplicates, op);
1080 if (other->total_timeout == 0) {
1081 other->total_timeout = op->total_timeout =
1082 TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
1083 crm_trace("Best guess as to timeout used for %.8s: %ds",
1084 other->id, other->total_timeout);
1085 }
1086 crm_notice("Merging fencing action '%s' targeting %s originating from "
1087 "client %s with identical request from %s@%s "
1088 QB_XS " original=%.8s duplicate=%.8s total_timeout=%ds",
1089 op->action, op->target, op->client_name,
1090 other->client_name, other->originator,
1091 op->id, other->id, other->total_timeout);
1092 report_timeout_period(op, other->total_timeout);
1093 op->state = st_duplicate;
1094 }
1095 }
1096
1097 static uint32_t fencing_active_peers(void)
1098 {
1099 uint32_t count = 0;
1100 pcmk__node_status_t *entry = NULL;
1101 GHashTableIter gIter;
1102
1103 g_hash_table_iter_init(&gIter, pcmk__peer_cache);
1104 while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
1105 if(fencing_peer_active(entry)) {
1106 count++;
1107 }
1108 }
1109 return count;
1110 }
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121 int
1122 fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg)
1123 {
1124 remote_fencing_op_t *op = NULL;
1125 xmlNode *dev = pcmk__xpath_find_one(msg->doc,
1126 "//*[@" PCMK__XA_ST_TARGET "]",
1127 LOG_ERR);
1128
1129 CRM_CHECK(dev != NULL, return EPROTO);
1130
1131 crm_notice("Received manual confirmation that %s has been fenced",
1132 pcmk__s(crm_element_value(dev, PCMK__XA_ST_TARGET),
1133 "unknown target"));
1134 op = initiate_remote_stonith_op(client, msg, TRUE);
1135 if (op == NULL) {
1136 return EPROTO;
1137 }
1138 op->state = st_done;
1139 op->delegate = pcmk__str_copy("a human");
1140
1141
1142 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1143 finalize_op(op, msg, false);
1144
1145
1146
1147
1148 return EINPROGRESS;
1149 }
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162 void *
1163 create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
1164 {
1165 remote_fencing_op_t *op = NULL;
1166 xmlNode *dev = pcmk__xpath_find_one(request->doc,
1167 "//*[@" PCMK__XA_ST_TARGET "]",
1168 LOG_NEVER);
1169 int rc = pcmk_rc_ok;
1170 const char *operation = NULL;
1171
1172 init_stonith_remote_op_hash_table(&stonith_remote_op_list);
1173
1174
1175
1176 if (peer && dev) {
1177 const char *op_id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
1178
1179 CRM_CHECK(op_id != NULL, return NULL);
1180
1181 op = g_hash_table_lookup(stonith_remote_op_list, op_id);
1182 if (op) {
1183 crm_debug("Reusing existing remote fencing op %.8s for %s",
1184 op_id, ((client == NULL)? "unknown client" : client));
1185 return op;
1186 }
1187 }
1188
1189 op = pcmk__assert_alloc(1, sizeof(remote_fencing_op_t));
1190
1191 crm_element_value_int(request, PCMK__XA_ST_TIMEOUT, &(op->base_timeout));
1192
1193 crm_element_value_int(request, PCMK__XA_ST_DELAY, &(op->client_delay));
1194
1195 if (peer && dev) {
1196 op->id = crm_element_value_copy(dev, PCMK__XA_ST_REMOTE_OP);
1197 } else {
1198 op->id = crm_generate_uuid();
1199 }
1200
1201 g_hash_table_replace(stonith_remote_op_list, op->id, op);
1202
1203 op->state = st_query;
1204 op->replies_expected = fencing_active_peers();
1205 op->action = crm_element_value_copy(dev, PCMK__XA_ST_DEVICE_ACTION);
1206
1207
1208
1209
1210
1211
1212
1213 op->originator = crm_element_value_copy(dev, PCMK__XA_ST_ORIGIN);
1214 if (op->originator == NULL) {
1215
1216 op->originator = pcmk__str_copy(fenced_get_local_node());
1217 }
1218
1219
1220 op->delegate = crm_element_value_copy(dev, PCMK__XA_ST_DELEGATE);
1221 op->created = time(NULL);
1222
1223 CRM_LOG_ASSERT(client != NULL);
1224 op->client_id = pcmk__str_copy(client);
1225
1226
1227 operation = crm_element_value(request, PCMK__XA_ST_OP);
1228
1229 if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1230 op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
1231 (unsigned long) getpid());
1232 } else {
1233 op->client_name = crm_element_value_copy(request,
1234 PCMK__XA_ST_CLIENTNAME);
1235 }
1236
1237 op->target = crm_element_value_copy(dev, PCMK__XA_ST_TARGET);
1238
1239
1240 op->request = pcmk__xml_copy(NULL, request);
1241
1242 rc = pcmk__xe_get_flags(request, PCMK__XA_ST_CALLOPT, &(op->call_options),
1243 0U);
1244 if (rc != pcmk_rc_ok) {
1245 crm_warn("Couldn't parse options from request %s: %s",
1246 op->id, pcmk_rc_str(rc));
1247 }
1248
1249 crm_element_value_int(request, PCMK__XA_ST_CALLID, &(op->client_callid));
1250
1251 crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
1252 "base timeout %ds, %u %s expected)",
1253 (peer && dev)? "Recorded" : "Generated", op->id, op->action,
1254 op->target, op->client_name, op->base_timeout,
1255 op->replies_expected,
1256 pcmk__plural_alt(op->replies_expected, "reply", "replies"));
1257
1258 if (op->call_options & st_opt_cs_nodeid) {
1259 int nodeid;
1260 pcmk__node_status_t *node = NULL;
1261
1262 pcmk__scan_min_int(op->target, &nodeid, 0);
1263 node = pcmk__search_node_caches(nodeid, NULL, NULL,
1264 pcmk__node_search_any
1265 |pcmk__node_search_cluster_cib);
1266
1267
1268 stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);
1269
1270 if ((node != NULL) && (node->name != NULL)) {
1271 pcmk__str_update(&(op->target), node->name);
1272
1273 } else {
1274 crm_warn("Could not expand nodeid '%s' into a host name", op->target);
1275 }
1276 }
1277
1278
1279 merge_duplicates(op);
1280
1281 if (op->state != st_duplicate) {
1282
1283 fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL);
1284 }
1285
1286
1287 stonith_fence_history_trim();
1288
1289 return op;
1290 }
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302 remote_fencing_op_t *
1303 initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request,
1304 gboolean manual_ack)
1305 {
1306 int query_timeout = 0;
1307 xmlNode *query = NULL;
1308 const char *client_id = NULL;
1309 remote_fencing_op_t *op = NULL;
1310 const char *relay_op_id = NULL;
1311 const char *operation = NULL;
1312
1313 if (client) {
1314 client_id = client->id;
1315 } else {
1316 client_id = crm_element_value(request, PCMK__XA_ST_CLIENTID);
1317 }
1318
1319 CRM_LOG_ASSERT(client_id != NULL);
1320 op = create_remote_stonith_op(client_id, request, FALSE);
1321 op->owner = TRUE;
1322 if (manual_ack) {
1323 return op;
1324 }
1325
1326 CRM_CHECK(op->action, return NULL);
1327
1328 if (advance_topology_level(op, true) != pcmk_rc_ok) {
1329 op->state = st_failed;
1330 }
1331
1332 switch (op->state) {
1333 case st_failed:
1334
1335 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR,
1336 "All topology levels failed");
1337 crm_warn("Could not request peer fencing (%s) targeting %s "
1338 QB_XS " id=%.8s", op->action, op->target, op->id);
1339 finalize_op(op, NULL, false);
1340 return op;
1341
1342 case st_duplicate:
1343 crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
1344 QB_XS " id=%.8s", op->action, op->target, op->id);
1345 return op;
1346
1347 default:
1348 crm_notice("Requesting peer fencing (%s) targeting %s "
1349 QB_XS " id=%.8s state=%s base_timeout=%ds",
1350 op->action, op->target, op->id,
1351 stonith__op_state_text(op->state), op->base_timeout);
1352 }
1353
1354 query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
1355 NULL, op->call_options);
1356
1357 crm_xml_add(query, PCMK__XA_ST_REMOTE_OP, op->id);
1358 crm_xml_add(query, PCMK__XA_ST_TARGET, op->target);
1359 crm_xml_add(query, PCMK__XA_ST_DEVICE_ACTION, op_requested_action(op));
1360 crm_xml_add(query, PCMK__XA_ST_ORIGIN, op->originator);
1361 crm_xml_add(query, PCMK__XA_ST_CLIENTID, op->client_id);
1362 crm_xml_add(query, PCMK__XA_ST_CLIENTNAME, op->client_name);
1363 crm_xml_add_int(query, PCMK__XA_ST_TIMEOUT, op->base_timeout);
1364
1365
1366 operation = crm_element_value(request, PCMK__XA_ST_OP);
1367 if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1368 relay_op_id = crm_element_value(request, PCMK__XA_ST_REMOTE_OP);
1369 if (relay_op_id) {
1370 crm_xml_add(query, PCMK__XA_ST_REMOTE_OP_RELAY, relay_op_id);
1371 }
1372 }
1373
1374 pcmk__cluster_send_message(NULL, pcmk_ipc_fenced, query);
1375 pcmk__xml_free(query);
1376
1377 query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
1378 op->query_timer = pcmk__create_timer((1000 * query_timeout), remote_op_query_timeout, op);
1379
1380 return op;
1381 }
1382
/* Bit flags controlling which query results find_best_peer() will consider */
enum find_best_peer_options {
    /* Skip any peer whose host name matches the fencing target */
    FIND_PEER_SKIP_TARGET   = (1 << 0),

    /* Skip any peer whose host name does not match the fencing target */
    FIND_PEER_TARGET_ONLY   = (1 << 1),

    /* Ask the device lookup helpers to consider verified devices only */
    FIND_PEER_VERIFIED_ONLY = (1 << 2),
};
1391
1392 static bool
1393 is_watchdog_fencing(const remote_fencing_op_t *op, const char *device)
1394 {
1395 return (stonith_watchdog_timeout_ms > 0
1396
1397 && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_null_matches)
1398 && pcmk__is_fencing_action(op->action)
1399 && node_does_watchdog_fencing(op->target));
1400 }
1401
1402 static peer_device_info_t *
1403 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
1404 {
1405 GList *iter = NULL;
1406 gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1407
1408 if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
1409 return NULL;
1410 }
1411
1412 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1413 peer_device_info_t *peer = iter->data;
1414
1415 crm_trace("Testing result from %s targeting %s with %d device%s: %d %x",
1416 peer->host, op->target, peer->ndevices,
1417 pcmk__plural_s(peer->ndevices), peer->tried, options);
1418 if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1419 continue;
1420 }
1421 if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1422 continue;
1423 }
1424
1425 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1426
1427 if (grab_peer_device(op, peer, device, verified_devices_only)) {
1428 return peer;
1429 }
1430
1431 } else if (!peer->tried
1432 && count_peer_devices(op, peer, verified_devices_only,
1433 fenced_support_flag(op->action))) {
1434
1435 crm_trace("Simple fencing");
1436 return peer;
1437 }
1438 }
1439
1440 return NULL;
1441 }
1442
1443 static peer_device_info_t *
1444 stonith_choose_peer(remote_fencing_op_t * op)
1445 {
1446 const char *device = NULL;
1447 peer_device_info_t *peer = NULL;
1448 uint32_t active = fencing_active_peers();
1449
1450 do {
1451 if (op->devices) {
1452 device = op->devices->data;
1453 crm_trace("Checking for someone to fence (%s) %s using %s",
1454 op->action, op->target, device);
1455 } else {
1456 crm_trace("Checking for someone to fence (%s) %s",
1457 op->action, op->target);
1458 }
1459
1460
1461 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
1462 if (peer) {
1463 crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
1464 return peer;
1465 }
1466
1467 if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
1468 crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
1469 return NULL;
1470 }
1471
1472
1473 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
1474 if (peer) {
1475 crm_trace("Found best unverified peer %s", peer->host);
1476 return peer;
1477 }
1478
1479
1480
1481
1482 if (op->phase != st_phase_on) {
1483 peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
1484 if (peer) {
1485 crm_trace("%s will fence itself", peer->host);
1486 return peer;
1487 }
1488 }
1489
1490
1491
1492
1493 } while ((op->phase != st_phase_on)
1494 && pcmk_is_set(op->call_options, st_opt_topology)
1495 && (advance_topology_level(op, false) == pcmk_rc_ok));
1496
1497
1498
1499
1500 if (is_watchdog_fencing(op, device)) {
1501 crm_info("Couldn't contact watchdog-fencing target-node (%s)",
1502 op->target);
1503
1504 } else {
1505 crm_notice("Couldn't find anyone to fence (%s) %s using %s",
1506 op->action, op->target, (device? device : "any device"));
1507 }
1508 return NULL;
1509 }
1510
1511 static int
1512 valid_fencing_timeout(int specified_timeout, bool action_specific,
1513 const remote_fencing_op_t *op, const char *device)
1514 {
1515 int timeout = specified_timeout;
1516
1517 if (!is_watchdog_fencing(op, device)) {
1518 return timeout;
1519 }
1520
1521 timeout = (int) QB_MIN(QB_MAX(specified_timeout,
1522 pcmk__timeout_ms2s(stonith_watchdog_timeout_ms)),
1523 INT_MAX);
1524
1525 if (timeout > specified_timeout) {
1526 if (action_specific) {
1527 crm_warn("pcmk_%s_timeout %ds for %s is too short (must be >= "
1528 PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds "
1529 "instead",
1530 op->action, specified_timeout, device? device : "watchdog",
1531 timeout, timeout);
1532
1533 } else {
1534 crm_warn("Fencing timeout %ds is too short (must be >= "
1535 PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds "
1536 "instead",
1537 specified_timeout, timeout, timeout);
1538 }
1539 }
1540
1541 return timeout;
1542 }
1543
1544 static int
1545 get_device_timeout(const remote_fencing_op_t *op,
1546 const peer_device_info_t *peer, const char *device,
1547 bool with_delay)
1548 {
1549 int timeout = valid_fencing_timeout(op->base_timeout, false, op, device);
1550 device_properties_t *props;
1551
1552 if (!peer || !device) {
1553 return timeout;
1554 }
1555
1556 props = g_hash_table_lookup(peer->devices, device);
1557 if (!props) {
1558 return timeout;
1559 }
1560
1561 if (props->custom_action_timeout[op->phase]) {
1562 timeout = valid_fencing_timeout(props->custom_action_timeout[op->phase],
1563 true, op, device);
1564 }
1565
1566
1567 if (with_delay && (op->client_delay >= 0)) {
1568
1569 timeout += (props->delay_max[op->phase] > 0 ?
1570 props->delay_max[op->phase] : props->delay_base[op->phase]);
1571 }
1572
1573 return timeout;
1574 }
1575
/* User data handed to add_device_timeout() while iterating a peer's devices */
struct timeout_data {
    const remote_fencing_op_t *op;      /* operation the timeouts are for */
    const peer_device_info_t *peer;     /* peer whose devices are iterated */
    int total_timeout;                  /* running sum of device timeouts */
};
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590 static void
1591 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
1592 {
1593 const char *device_id = key;
1594 device_properties_t *props = value;
1595 struct timeout_data *timeout = user_data;
1596
1597 if (!props->executed[timeout->op->phase]
1598 && !props->disallowed[timeout->op->phase]) {
1599 timeout->total_timeout += get_device_timeout(timeout->op, timeout->peer,
1600 device_id, true);
1601 }
1602 }
1603
1604 static int
1605 get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer)
1606 {
1607 struct timeout_data timeout;
1608
1609 timeout.op = op;
1610 timeout.peer = peer;
1611 timeout.total_timeout = 0;
1612
1613 g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1614
1615 return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1616 }
1617
1618 static int
1619 get_op_total_timeout(const remote_fencing_op_t *op,
1620 const peer_device_info_t *chosen_peer)
1621 {
1622 long long total_timeout = 0;
1623 stonith_topology_t *tp = find_topology_for_host(op->target);
1624
1625 if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
1626 int i;
1627 GList *device_list = NULL;
1628 GList *iter = NULL;
1629 GList *auto_list = NULL;
1630
1631 if (pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none)
1632 && (op->automatic_list != NULL)) {
1633 auto_list = g_list_copy(op->automatic_list);
1634 }
1635
1636
1637
1638
1639
1640
1641
1642
1643 for (i = 0; i < ST__LEVEL_COUNT; i++) {
1644 if (!tp->levels[i]) {
1645 continue;
1646 }
1647 for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1648 bool found = false;
1649
1650 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1651 const peer_device_info_t *peer = iter->data;
1652
1653 if (auto_list) {
1654 GList *match = g_list_find_custom(auto_list, device_list->data,
1655 sort_strings);
1656 if (match) {
1657 auto_list = g_list_remove(auto_list, match->data);
1658 }
1659 }
1660
1661 if (find_peer_device(op, peer, device_list->data,
1662 fenced_support_flag(op->action))) {
1663 total_timeout += get_device_timeout(op, peer,
1664 device_list->data,
1665 true);
1666 found = true;
1667 break;
1668 }
1669 }
1670
1671
1672
1673
1674 if (!found && is_watchdog_fencing(op, device_list->data)) {
1675 total_timeout += pcmk__timeout_ms2s(stonith_watchdog_timeout_ms);
1676 }
1677 }
1678 }
1679
1680
1681 if (auto_list) {
1682 for (iter = auto_list; iter != NULL; iter = iter->next) {
1683 GList *iter2 = NULL;
1684
1685 for (iter2 = op->query_results; iter2 != NULL; iter2 = iter2->next) {
1686 peer_device_info_t *peer = iter2->data;
1687 if (find_peer_device(op, peer, iter->data,
1688 fenced_df_supports_on)) {
1689 total_timeout += get_device_timeout(op, peer,
1690 iter->data, true);
1691 break;
1692 }
1693 }
1694 }
1695 }
1696
1697 g_list_free(auto_list);
1698
1699 } else if (chosen_peer) {
1700 total_timeout = get_peer_timeout(op, chosen_peer);
1701
1702 } else {
1703 total_timeout = valid_fencing_timeout(op->base_timeout, false, op,
1704 NULL);
1705 }
1706
1707 if (total_timeout <= 0) {
1708 total_timeout = op->base_timeout;
1709 }
1710
1711
1712
1713
1714 if (op->client_delay > 0) {
1715 total_timeout += op->client_delay;
1716 }
1717 return (int) QB_MIN(total_timeout, INT_MAX);
1718 }
1719
1720 static void
1721 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
1722 {
1723 GList *iter = NULL;
1724 xmlNode *update = NULL;
1725 const char *client_node = NULL;
1726 const char *client_id = NULL;
1727 const char *call_id = NULL;
1728
1729 if (op->call_options & st_opt_sync_call) {
1730
1731
1732
1733
1734 return;
1735 } else if (!op->request) {
1736 return;
1737 }
1738
1739 crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
1740 client_node = crm_element_value(op->request, PCMK__XA_ST_CLIENTNODE);
1741 call_id = crm_element_value(op->request, PCMK__XA_ST_CALLID);
1742 client_id = crm_element_value(op->request, PCMK__XA_ST_CLIENTID);
1743 if (!client_node || !call_id || !client_id) {
1744 return;
1745 }
1746
1747 if (pcmk__str_eq(client_node, fenced_get_local_node(), pcmk__str_casei)) {
1748
1749 do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1750 return;
1751 }
1752
1753
1754 update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1755 crm_xml_add(update, PCMK__XA_ST_REMOTE_OP, op->id);
1756 crm_xml_add(update, PCMK__XA_ST_CLIENTID, client_id);
1757 crm_xml_add(update, PCMK__XA_ST_CALLID, call_id);
1758 crm_xml_add_int(update, PCMK__XA_ST_TIMEOUT, op_timeout);
1759
1760 pcmk__cluster_send_message(pcmk__get_node(0, client_node, NULL,
1761 pcmk__node_search_cluster_member),
1762 pcmk_ipc_fenced, update);
1763
1764 pcmk__xml_free(update);
1765
1766 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1767 remote_fencing_op_t *dup = iter->data;
1768
1769 crm_trace("Reporting timeout for duplicate %.8s to client %s",
1770 dup->id, dup->client_name);
1771 report_timeout_period(iter->data, op_timeout);
1772 }
1773 }
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783 static void
1784 advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
1785 xmlNode *msg)
1786 {
1787
1788 if (op->devices) {
1789 op->devices = op->devices->next;
1790 }
1791
1792
1793 if ((op->phase == st_phase_requested)
1794 && pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none)) {
1795
1796 remove_required_device(op, device);
1797
1798
1799
1800
1801 if (op->devices == NULL) {
1802 op->devices = op->automatic_list;
1803 }
1804 }
1805
1806 if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1807
1808
1809
1810
1811 op_phase_on(op);
1812 }
1813
1814
1815 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1816
1817 if (op->devices) {
1818
1819 crm_trace("Next targeting %s on behalf of %s@%s",
1820 op->target, op->client_name, op->originator);
1821
1822
1823 if (op->client_delay > 0) {
1824 op->client_delay = 0;
1825 }
1826
1827 request_peer_fencing(op, NULL);
1828 } else {
1829
1830 crm_trace("Marking complex fencing op targeting %s as complete",
1831 op->target);
1832 op->state = st_done;
1833 finalize_op(op, msg, false);
1834 }
1835 }
1836
1837 static gboolean
1838 check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
1839 {
1840 if (node_does_watchdog_fencing(op->target)) {
1841 guint timeout_ms = QB_MIN(stonith_watchdog_timeout_ms, UINT_MAX);
1842
1843 crm_notice("Waiting %s for %s to self-fence (%s) for "
1844 "client %s " QB_XS " id=%.8s",
1845 pcmk__readable_interval(timeout_ms), op->target, op->action,
1846 op->client_name, op->id);
1847
1848 if (op->op_timer_one) {
1849 g_source_remove(op->op_timer_one);
1850 }
1851 op->op_timer_one = pcmk__create_timer(timeout_ms, remote_op_watchdog_done,
1852 op);
1853 return TRUE;
1854 } else {
1855 crm_debug("Skipping fallback to watchdog-fencing as %s is "
1856 "not in host-list", op->target);
1857 }
1858 return FALSE;
1859 }
1860
1861
1862
1863
1864
1865
1866
1867
1868
1869 static void
1870 request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
1871 {
1872 const char *device = NULL;
1873 int timeout;
1874
1875 CRM_CHECK(op != NULL, return);
1876
1877 crm_trace("Action %.8s targeting %s for %s is %s",
1878 op->id, op->target, op->client_name,
1879 stonith__op_state_text(op->state));
1880
1881 if ((op->phase == st_phase_on) && (op->devices != NULL)) {
1882
1883
1884
1885
1886
1887
1888
1889
1890
1891
1892
1893 device = op->devices->data;
1894 if (pcmk__str_eq(fenced_device_reboot_action(device), PCMK_ACTION_OFF,
1895 pcmk__str_none)) {
1896 crm_info("Not turning %s back on using %s because the device is "
1897 "configured to stay off (pcmk_reboot_action='off')",
1898 op->target, device);
1899 advance_topology_device_in_level(op, device, NULL);
1900 return;
1901 }
1902 if (!fenced_device_supports_on(device)) {
1903 crm_info("Not turning %s back on using %s because the agent "
1904 "doesn't support 'on'", op->target, device);
1905 advance_topology_device_in_level(op, device, NULL);
1906 return;
1907 }
1908 }
1909
1910 timeout = op->base_timeout;
1911 if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
1912 peer = stonith_choose_peer(op);
1913 }
1914
1915 if (!op->op_timer_total) {
1916 op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, peer);
1917 op->op_timer_total = pcmk__create_timer(1000 * op->total_timeout, remote_op_timeout, op);
1918 report_timeout_period(op, op->total_timeout);
1919 crm_info("Total timeout set to %ds for peer's fencing targeting %s for %s "
1920 QB_XS " id=%.8s",
1921 op->total_timeout, op->target, op->client_name, op->id);
1922 }
1923
1924 if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
1925
1926
1927
1928
1929
1930
1931
1932
1933 peer = stonith_choose_peer(op);
1934
1935 device = op->devices->data;
1936
1937
1938
1939
1940 timeout = get_device_timeout(op, peer, device, false);
1941 }
1942
1943 if (peer) {
1944 int timeout_one = 0;
1945 xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
1946 const pcmk__node_status_t *peer_node =
1947 pcmk__get_node(0, peer->host, NULL,
1948 pcmk__node_search_cluster_member);
1949
1950 if (op->client_delay > 0) {
1951
1952
1953
1954 timeout_one = TIMEOUT_MULTIPLY_FACTOR * op->client_delay;
1955 }
1956
1957 crm_xml_add(remote_op, PCMK__XA_ST_REMOTE_OP, op->id);
1958 crm_xml_add(remote_op, PCMK__XA_ST_TARGET, op->target);
1959 crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ACTION, op->action);
1960 crm_xml_add(remote_op, PCMK__XA_ST_ORIGIN, op->originator);
1961 crm_xml_add(remote_op, PCMK__XA_ST_CLIENTID, op->client_id);
1962 crm_xml_add(remote_op, PCMK__XA_ST_CLIENTNAME, op->client_name);
1963 crm_xml_add_int(remote_op, PCMK__XA_ST_TIMEOUT, timeout);
1964 crm_xml_add_int(remote_op, PCMK__XA_ST_CALLOPT, op->call_options);
1965 crm_xml_add_int(remote_op, PCMK__XA_ST_DELAY, op->client_delay);
1966
1967 if (device) {
1968 timeout_one += TIMEOUT_MULTIPLY_FACTOR *
1969 get_device_timeout(op, peer, device, true);
1970 crm_notice("Requesting that %s perform '%s' action targeting %s "
1971 "using %s " QB_XS " for client %s (%ds)",
1972 peer->host, op->action, op->target, device,
1973 op->client_name, timeout_one);
1974 crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ID, device);
1975
1976 } else {
1977 timeout_one += TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
1978 crm_notice("Requesting that %s perform '%s' action targeting %s "
1979 QB_XS " for client %s (%ds, %s)",
1980 peer->host, op->action, op->target, op->client_name,
1981 timeout_one,
1982 pcmk__readable_interval(stonith_watchdog_timeout_ms));
1983 }
1984
1985 op->state = st_exec;
1986 if (op->op_timer_one) {
1987 g_source_remove(op->op_timer_one);
1988 op->op_timer_one = 0;
1989 }
1990
1991 if (!is_watchdog_fencing(op, device)
1992 || !check_watchdog_fencing_and_wait(op)) {
1993
1994
1995
1996
1997
1998
1999
2000
2001
2002
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014 op->op_timer_one = pcmk__create_timer((1000 * timeout_one), remote_op_timeout_one, op);
2015 }
2016
2017 pcmk__cluster_send_message(peer_node, pcmk_ipc_fenced, remote_op);
2018 peer->tried = TRUE;
2019 pcmk__xml_free(remote_op);
2020 return;
2021
2022 } else if (op->phase == st_phase_on) {
2023
2024
2025
2026 crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
2027 "after successful 'off'", device, op->target);
2028 advance_topology_device_in_level(op, device, NULL);
2029 return;
2030
2031 } else if (op->owner == FALSE) {
2032 crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
2033 op->action, op->target, op->client_name);
2034
2035 } else if (op->query_timer == 0) {
2036
2037 crm_info("No remaining peers capable of fencing (%s) %s for client %s "
2038 QB_XS " state=%s", op->action, op->target, op->client_name,
2039 stonith__op_state_text(op->state));
2040 CRM_CHECK(op->state < st_done, return);
2041 finalize_timed_out_op(op, "All nodes failed, or are unable, to "
2042 "fence target");
2043
2044 } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
2045
2046
2047
2048
2049 if (is_watchdog_fencing(op, device)
2050 && check_watchdog_fencing_and_wait(op)) {
2051
2052
2053
2054
2055
2056 op->state = st_exec;
2057 return;
2058 }
2059
2060 if (op->state == st_query) {
2061 crm_info("No peers (out of %d) have devices capable of fencing "
2062 "(%s) %s for client %s " QB_XS " state=%s",
2063 op->replies, op->action, op->target, op->client_name,
2064 stonith__op_state_text(op->state));
2065
2066 pcmk__reset_result(&op->result);
2067 pcmk__set_result(&op->result, CRM_EX_ERROR,
2068 PCMK_EXEC_NO_FENCE_DEVICE, NULL);
2069 } else {
2070 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2071 pcmk__reset_result(&op->result);
2072 pcmk__set_result(&op->result, CRM_EX_ERROR,
2073 PCMK_EXEC_NO_FENCE_DEVICE, NULL);
2074 }
2075
2076
2077
2078
2079
2080
2081
2082 crm_info("No peers (out of %d) are capable of fencing (%s) %s "
2083 "for client %s " QB_XS " state=%s",
2084 op->replies, op->action, op->target, op->client_name,
2085 stonith__op_state_text(op->state));
2086 }
2087
2088 op->state = st_failed;
2089 finalize_op(op, NULL, false);
2090
2091 } else {
2092 crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
2093 "for client %s " QB_XS " id=%.8s",
2094 op->action, op->target, (device? " using " : ""),
2095 (device? device : ""), op->client_name, op->id);
2096 }
2097 }
2098
2099
2100
2101
2102
2103
2104
2105
2106
2107
2108
2109
2110 static gint
2111 sort_peers(gconstpointer a, gconstpointer b)
2112 {
2113 const peer_device_info_t *peer_a = a;
2114 const peer_device_info_t *peer_b = b;
2115
2116 return (peer_b->ndevices - peer_a->ndevices);
2117 }
2118
2119
2120
2121
2122
2123
2124
2125 static gboolean
2126 all_topology_devices_found(const remote_fencing_op_t *op)
2127 {
2128 GList *device = NULL;
2129 GList *iter = NULL;
2130 device_properties_t *match = NULL;
2131 stonith_topology_t *tp = NULL;
2132 gboolean skip_target = FALSE;
2133 int i;
2134
2135 tp = find_topology_for_host(op->target);
2136 if (!tp) {
2137 return FALSE;
2138 }
2139 if (pcmk__is_fencing_action(op->action)) {
2140
2141
2142 skip_target = TRUE;
2143 }
2144
2145 for (i = 0; i < ST__LEVEL_COUNT; i++) {
2146 for (device = tp->levels[i]; device; device = device->next) {
2147 match = NULL;
2148 for (iter = op->query_results; iter && !match; iter = iter->next) {
2149 peer_device_info_t *peer = iter->data;
2150
2151 if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
2152 continue;
2153 }
2154 match = find_peer_device(op, peer, device->data,
2155 fenced_df_none);
2156 }
2157 if (!match) {
2158 return FALSE;
2159 }
2160 }
2161 }
2162
2163 return TRUE;
2164 }
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
2176
2177
2178 static void
2179 parse_action_specific(const xmlNode *xml, const char *peer, const char *device,
2180 const char *action, remote_fencing_op_t *op,
2181 enum st_remap_phase phase, device_properties_t *props)
2182 {
2183 props->custom_action_timeout[phase] = 0;
2184 crm_element_value_int(xml, PCMK__XA_ST_ACTION_TIMEOUT,
2185 &props->custom_action_timeout[phase]);
2186 if (props->custom_action_timeout[phase]) {
2187 crm_trace("Peer %s with device %s returned %s action timeout %ds",
2188 peer, device, action, props->custom_action_timeout[phase]);
2189 }
2190
2191 props->delay_max[phase] = 0;
2192 crm_element_value_int(xml, PCMK__XA_ST_DELAY_MAX, &props->delay_max[phase]);
2193 if (props->delay_max[phase]) {
2194 crm_trace("Peer %s with device %s returned maximum of random delay %ds for %s",
2195 peer, device, props->delay_max[phase], action);
2196 }
2197
2198 props->delay_base[phase] = 0;
2199 crm_element_value_int(xml, PCMK__XA_ST_DELAY_BASE,
2200 &props->delay_base[phase]);
2201 if (props->delay_base[phase]) {
2202 crm_trace("Peer %s with device %s returned base delay %ds for %s",
2203 peer, device, props->delay_base[phase], action);
2204 }
2205
2206
2207 if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none)) {
2208 int required = 0;
2209
2210 crm_element_value_int(xml, PCMK__XA_ST_REQUIRED, &required);
2211 if (required) {
2212 crm_trace("Peer %s requires device %s to execute for action %s",
2213 peer, device, action);
2214 add_required_device(op, device);
2215 }
2216 }
2217
2218
2219
2220
2221 if (pcmk__xe_attr_is_true(xml, PCMK__XA_ST_ACTION_DISALLOWED)) {
2222 props->disallowed[phase] = TRUE;
2223 crm_trace("Peer %s is disallowed from executing %s for device %s",
2224 peer, action, device);
2225 }
2226 }
2227
2228
2229
2230
2231
2232
2233
2234
2235
2236
2237 static void
2238 add_device_properties(const xmlNode *xml, remote_fencing_op_t *op,
2239 peer_device_info_t *peer, const char *device)
2240 {
2241 xmlNode *child;
2242 int verified = 0;
2243 device_properties_t *props =
2244 pcmk__assert_alloc(1, sizeof(device_properties_t));
2245 int rc = pcmk_rc_ok;
2246
2247
2248 g_hash_table_insert(peer->devices, pcmk__str_copy(device), props);
2249
2250
2251 crm_element_value_int(xml, PCMK__XA_ST_MONITOR_VERIFIED, &verified);
2252 if (verified) {
2253 crm_trace("Peer %s has confirmed a verified device %s",
2254 peer->host, device);
2255 props->verified = TRUE;
2256 }
2257
2258
2259 rc = pcmk__xe_get_flags(xml, PCMK__XA_ST_DEVICE_SUPPORT_FLAGS,
2260 &(props->device_support_flags),
2261 fenced_df_supports_on);
2262 if (rc != pcmk_rc_ok) {
2263 crm_warn("Couldn't determine device support for %s "
2264 "(assuming unfencing): %s", device, pcmk_rc_str(rc));
2265 }
2266
2267
2268 parse_action_specific(xml, peer->host, device, op_requested_action(op),
2269 op, st_phase_requested, props);
2270 for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL;
2271 child = pcmk__xe_next(child, NULL)) {
2272
2273
2274
2275
2276 if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_OFF, pcmk__str_none)) {
2277 parse_action_specific(child, peer->host, device, PCMK_ACTION_OFF,
2278 op, st_phase_off, props);
2279
2280 } else if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_ON,
2281 pcmk__str_none)) {
2282 parse_action_specific(child, peer->host, device, PCMK_ACTION_ON,
2283 op, st_phase_on, props);
2284 }
2285 }
2286 }
2287
2288
2289
2290
2291
2292
2293
2294
2295
2296
2297
2298
2299 static peer_device_info_t *
2300 add_result(remote_fencing_op_t *op, const char *host, int ndevices,
2301 const xmlNode *xml)
2302 {
2303 peer_device_info_t *peer = pcmk__assert_alloc(1,
2304 sizeof(peer_device_info_t));
2305 xmlNode *child;
2306
2307 peer->host = pcmk__str_copy(host);
2308 peer->devices = pcmk__strkey_table(free, free);
2309
2310
2311 for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL;
2312 child = pcmk__xe_next(child, NULL)) {
2313 const char *device = pcmk__xe_id(child);
2314
2315 if (device) {
2316 add_device_properties(child, op, peer, device);
2317 }
2318 }
2319
2320 peer->ndevices = g_hash_table_size(peer->devices);
2321 CRM_CHECK(ndevices == peer->ndevices,
2322 crm_err("Query claimed to have %d device%s but %d found",
2323 ndevices, pcmk__plural_s(ndevices), peer->ndevices));
2324
2325 op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers);
2326 return peer;
2327 }
2328
2329
2330
2331
2332
2333
2334
2335
2336
2337
2338
2339
2340
2341
2342
2343 int
2344 process_remote_stonith_query(xmlNode *msg)
2345 {
2346 int ndevices = 0;
2347 gboolean host_is_target = FALSE;
2348 gboolean have_all_replies = FALSE;
2349 const char *id = NULL;
2350 const char *host = NULL;
2351 remote_fencing_op_t *op = NULL;
2352 peer_device_info_t *peer = NULL;
2353 uint32_t replies_expected;
2354 xmlNode *dev = pcmk__xpath_find_one(msg->doc,
2355 "//*[@" PCMK__XA_ST_REMOTE_OP "]",
2356 LOG_ERR);
2357
2358 CRM_CHECK(dev != NULL, return -EPROTO);
2359
2360 id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
2361 CRM_CHECK(id != NULL, return -EPROTO);
2362
2363 dev = pcmk__xpath_find_one(msg->doc,
2364 "//*[@" PCMK__XA_ST_AVAILABLE_DEVICES "]",
2365 LOG_ERR);
2366 CRM_CHECK(dev != NULL, return -EPROTO);
2367 crm_element_value_int(dev, PCMK__XA_ST_AVAILABLE_DEVICES, &ndevices);
2368
2369 op = g_hash_table_lookup(stonith_remote_op_list, id);
2370 if (op == NULL) {
2371 crm_debug("Received query reply for unknown or expired operation %s",
2372 id);
2373 return -EOPNOTSUPP;
2374 }
2375
2376 replies_expected = fencing_active_peers();
2377 if (op->replies_expected < replies_expected) {
2378 replies_expected = op->replies_expected;
2379 }
2380 if ((++op->replies >= replies_expected) && (op->state == st_query)) {
2381 have_all_replies = TRUE;
2382 }
2383 host = crm_element_value(msg, PCMK__XA_SRC);
2384 host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);
2385
2386 crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
2387 op->replies, replies_expected, host,
2388 op->target, op->action, ndevices, pcmk__plural_s(ndevices), id);
2389 if (ndevices > 0) {
2390 peer = add_result(op, host, ndevices, dev);
2391 }
2392
2393 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2394
2395 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2396
2397
2398
2399 if (op->state == st_query && all_topology_devices_found(op)) {
2400
2401 crm_trace("All topology devices found");
2402 request_peer_fencing(op, peer);
2403
2404 } else if (have_all_replies) {
2405 crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
2406 replies_expected, op->replies);
2407 request_peer_fencing(op, NULL);
2408 }
2409
2410 } else if (op->state == st_query) {
2411 int nverified = count_peer_devices(op, peer, TRUE,
2412 fenced_support_flag(op->action));
2413
2414
2415
2416 if ((peer != NULL) && !host_is_target && nverified) {
2417
2418 crm_trace("Found %d verified device%s",
2419 nverified, pcmk__plural_s(nverified));
2420 request_peer_fencing(op, peer);
2421
2422 } else if (have_all_replies) {
2423 crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
2424 replies_expected, op->replies);
2425 request_peer_fencing(op, NULL);
2426
2427 } else {
2428 crm_trace("Waiting for more peer results before launching fencing operation");
2429 }
2430
2431 } else if ((peer != NULL) && (op->state == st_done)) {
2432 crm_info("Discarding query result from %s (%d device%s): "
2433 "Operation is %s", peer->host,
2434 peer->ndevices, pcmk__plural_s(peer->ndevices),
2435 stonith__op_state_text(op->state));
2436 }
2437
2438 return pcmk_ok;
2439 }
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450 void
2451 fenced_process_fencing_reply(xmlNode *msg)
2452 {
2453 const char *id = NULL;
2454 const char *device = NULL;
2455 remote_fencing_op_t *op = NULL;
2456 xmlNode *dev = pcmk__xpath_find_one(msg->doc,
2457 "//*[@" PCMK__XA_ST_REMOTE_OP "]",
2458 LOG_ERR);
2459 pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
2460
2461 CRM_CHECK(dev != NULL, return);
2462
2463 id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
2464 CRM_CHECK(id != NULL, return);
2465
2466 dev = stonith__find_xe_with_result(msg);
2467 CRM_CHECK(dev != NULL, return);
2468
2469 stonith__xe_get_result(dev, &result);
2470
2471 device = crm_element_value(dev, PCMK__XA_ST_DEVICE_ID);
2472
2473 if (stonith_remote_op_list) {
2474 op = g_hash_table_lookup(stonith_remote_op_list, id);
2475 }
2476
2477 if ((op == NULL) && pcmk__result_ok(&result)) {
2478
2479 const char *client_id = crm_element_value(dev, PCMK__XA_ST_CLIENTID);
2480
2481 op = create_remote_stonith_op(client_id, dev, TRUE);
2482 }
2483
2484 if (op == NULL) {
2485
2486
2487 crm_info("Received peer result of unknown or expired operation %s", id);
2488 pcmk__reset_result(&result);
2489 return;
2490 }
2491
2492 pcmk__reset_result(&op->result);
2493 op->result = result;
2494
2495 if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
2496 crm_err("Received outdated reply for device %s (instead of %s) to "
2497 "fence (%s) %s. Operation already timed out at peer level.",
2498 device, (const char *) op->devices->data, op->action, op->target);
2499 return;
2500 }
2501
2502 if (pcmk__str_eq(crm_element_value(msg, PCMK__XA_SUBT),
2503 PCMK__VALUE_BROADCAST, pcmk__str_none)) {
2504
2505 if (pcmk__result_ok(&op->result)) {
2506 op->state = st_done;
2507 } else {
2508 op->state = st_failed;
2509 }
2510 finalize_op(op, msg, false);
2511 return;
2512
2513 } else if (!pcmk__str_eq(op->originator, fenced_get_local_node(),
2514 pcmk__str_casei)) {
2515
2516
2517 crm_err("Received non-broadcast fencing result for operation %.8s "
2518 "we do not own (device %s targeting %s)",
2519 op->id, device, op->target);
2520 return;
2521 }
2522
2523 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2524 const char *device = NULL;
2525 const char *reason = op->result.exit_reason;
2526
2527
2528
2529 if (op->state == st_done) {
2530 finalize_op(op, msg, false);
2531 return;
2532 }
2533
2534 device = crm_element_value(msg, PCMK__XA_ST_DEVICE_ID);
2535
2536 if ((op->phase == 2) && !pcmk__result_ok(&op->result)) {
2537
2538
2539
2540 crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s "
2541 "after successful 'off'",
2542 device, pcmk_exec_status_str(op->result.execution_status),
2543 (reason == NULL)? "" : ": ",
2544 (reason == NULL)? "" : reason,
2545 op->target);
2546 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2547 } else {
2548 crm_notice("Action '%s' targeting %s%s%s on behalf of %s@%s: "
2549 "%s%s%s%s",
2550 op->action, op->target,
2551 ((device == NULL)? "" : " using "),
2552 ((device == NULL)? "" : device),
2553 op->client_name,
2554 op->originator,
2555 pcmk_exec_status_str(op->result.execution_status),
2556 (reason == NULL)? "" : " (",
2557 (reason == NULL)? "" : reason,
2558 (reason == NULL)? "" : ")");
2559 }
2560
2561 if (pcmk__result_ok(&op->result)) {
2562
2563
2564 advance_topology_device_in_level(op, device, msg);
2565 return;
2566 } else {
2567
2568
2569 if (advance_topology_level(op, false) != pcmk_rc_ok) {
2570 op->state = st_failed;
2571 finalize_op(op, msg, false);
2572 return;
2573 }
2574 }
2575
2576 } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) {
2577 op->state = st_done;
2578 finalize_op(op, msg, false);
2579 return;
2580
2581 } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT)
2582 && (op->devices == NULL)) {
2583
2584 op->state = st_failed;
2585 finalize_op(op, msg, false);
2586 return;
2587
2588 } else {
2589
2590 }
2591
2592
2593 crm_trace("Next for %s on behalf of %s@%s (result was: %s)",
2594 op->target, op->originator, op->client_name,
2595 pcmk_exec_status_str(op->result.execution_status));
2596 request_peer_fencing(op, NULL);
2597 }
2598
2599 gboolean
2600 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
2601 {
2602 GHashTableIter iter;
2603 time_t now = time(NULL);
2604 remote_fencing_op_t *rop = NULL;
2605
2606 if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
2607 action == NULL) {
2608 return FALSE;
2609 }
2610
2611 g_hash_table_iter_init(&iter, stonith_remote_op_list);
2612 while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2613 if (strcmp(rop->target, target) != 0) {
2614 continue;
2615 } else if (rop->state != st_done) {
2616 continue;
2617
2618
2619
2620 } else if (strcmp(rop->action, action) != 0) {
2621 continue;
2622 } else if ((rop->completed + tolerance) < now) {
2623 continue;
2624 }
2625
2626 crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2627 target, action, tolerance, rop->delegate, rop->originator);
2628 return TRUE;
2629 }
2630 return FALSE;
2631 }