This source file includes the following definitions.
- sort_strings
- free_remote_query
- free_stonith_remote_op_list
- count_peer_device
- count_peer_devices
- find_peer_device
- grab_peer_device
- clear_remote_op_timers
- free_remote_op
- init_stonith_remote_op_hash_table
- op_requested_action
- op_phase_off
- op_phase_on
- undo_op_remap
- fencing_result2xml
- fenced_broadcast_op_result
- handle_local_reply_and_notify
- finalize_op_duplicates
- delegate_from_xml
- finalize_op
- remote_op_watchdog_done
- remote_op_timeout_one
- finalize_timed_out_op
- remote_op_timeout
- remote_op_query_timeout
- topology_is_empty
- add_required_device
- remove_required_device
- set_op_device_list
- topology_matches
- find_topology_for_host
- advance_topology_level
- merge_duplicates
- fencing_active_peers
- fenced_handle_manual_confirmation
- create_remote_stonith_op
- initiate_remote_stonith_op
- find_best_peer
- stonith_choose_peer
- get_device_timeout
- add_device_timeout
- get_peer_timeout
- get_op_total_timeout
- report_timeout_period
- advance_topology_device_in_level
- check_watchdog_fencing_and_wait
- request_peer_fencing
- sort_peers
- all_topology_devices_found
- parse_action_specific
- add_device_properties
- add_result
- process_remote_stonith_query
- fenced_process_fencing_reply
- stonith_check_fence_tolerance
1
2
3
4
5
6
7
8
9
10 #include <crm_internal.h>
11
12 #include <sys/param.h>
13 #include <stdio.h>
14 #include <sys/types.h>
15 #include <sys/wait.h>
16 #include <sys/stat.h>
17 #include <unistd.h>
18 #include <sys/utsname.h>
19
20 #include <stdlib.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <ctype.h>
24 #include <regex.h>
25
26 #include <crm/crm.h>
27 #include <crm/msg_xml.h>
28 #include <crm/common/ipc.h>
29 #include <crm/common/ipc_internal.h>
30 #include <crm/cluster/internal.h>
31
32 #include <crm/stonith-ng.h>
33 #include <crm/fencing/internal.h>
34 #include <crm/common/xml.h>
35 #include <crm/common/xml_internal.h>
36
37 #include <crm/common/util.h>
38 #include <pacemaker-fenced.h>
39
/* Safety margin applied in this file when turning a base or total timeout
 * into a timer interval (see merge_duplicates() and
 * initiate_remote_stonith_op()).
 */
#define TIMEOUT_MULTIPLY_FACTOR 1.2
41
42
43
44
45
46
47
48 typedef struct device_properties_s {
49
50 gboolean verified;
51
52
53
54
55 gboolean executed[st_phase_max];
56
57 gboolean disallowed[st_phase_max];
58
59 int custom_action_timeout[st_phase_max];
60
61 int delay_max[st_phase_max];
62
63 int delay_base[st_phase_max];
64 } device_properties_t;
65
66 typedef struct {
67
68 char *host;
69
70 gboolean tried;
71
72 int ndevices;
73
74 GHashTable *devices;
75 } peer_device_info_t;
76
77 GHashTable *stonith_remote_op_list = NULL;
78
79 extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
80 int call_options);
81
82 static void request_peer_fencing(remote_fencing_op_t *op,
83 peer_device_info_t *peer);
84 static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup);
85 static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
86 static int get_op_total_timeout(const remote_fencing_op_t *op,
87 const peer_device_info_t *chosen_peer);
88
89 static gint
90 sort_strings(gconstpointer a, gconstpointer b)
91 {
92 return strcmp(a, b);
93 }
94
95 static void
96 free_remote_query(gpointer data)
97 {
98 if (data != NULL) {
99 peer_device_info_t *peer = data;
100
101 g_hash_table_destroy(peer->devices);
102 free(peer->host);
103 free(peer);
104 }
105 }
106
107 void
108 free_stonith_remote_op_list()
109 {
110 if (stonith_remote_op_list != NULL) {
111 g_hash_table_destroy(stonith_remote_op_list);
112 stonith_remote_op_list = NULL;
113 }
114 }
115
116 struct peer_count_data {
117 const remote_fencing_op_t *op;
118 gboolean verified_only;
119 int count;
120 };
121
122
123
124
125
126
127
128
129
130 static void
131 count_peer_device(gpointer key, gpointer value, gpointer user_data)
132 {
133 device_properties_t *props = (device_properties_t*)value;
134 struct peer_count_data *data = user_data;
135
136 if (!props->executed[data->op->phase]
137 && (!data->verified_only || props->verified)) {
138 ++(data->count);
139 }
140 }
141
142
143
144
145
146
147
148
149
150
151
152 static int
153 count_peer_devices(const remote_fencing_op_t *op,
154 const peer_device_info_t *peer, gboolean verified_only)
155 {
156 struct peer_count_data data;
157
158 data.op = op;
159 data.verified_only = verified_only;
160 data.count = 0;
161 if (peer) {
162 g_hash_table_foreach(peer->devices, count_peer_device, &data);
163 }
164 return data.count;
165 }
166
167
168
169
170
171
172
173
174
175
176
177 static device_properties_t *
178 find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer,
179 const char *device)
180 {
181 device_properties_t *props = g_hash_table_lookup(peer->devices, device);
182
183 return (props && !props->executed[op->phase]
184 && !props->disallowed[op->phase])? props : NULL;
185 }
186
187
188
189
190
191
192
193
194
195
196
197
198 static gboolean
199 grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer,
200 const char *device, gboolean verified_devices_only)
201 {
202 device_properties_t *props = find_peer_device(op, peer, device);
203
204 if ((props == NULL) || (verified_devices_only && !props->verified)) {
205 return FALSE;
206 }
207
208 crm_trace("Removing %s from %s (%d remaining)",
209 device, peer->host, count_peer_devices(op, peer, FALSE));
210 props->executed[op->phase] = TRUE;
211 return TRUE;
212 }
213
214 static void
215 clear_remote_op_timers(remote_fencing_op_t * op)
216 {
217 if (op->query_timer) {
218 g_source_remove(op->query_timer);
219 op->query_timer = 0;
220 }
221 if (op->op_timer_total) {
222 g_source_remove(op->op_timer_total);
223 op->op_timer_total = 0;
224 }
225 if (op->op_timer_one) {
226 g_source_remove(op->op_timer_one);
227 op->op_timer_one = 0;
228 }
229 }
230
231 static void
232 free_remote_op(gpointer data)
233 {
234 remote_fencing_op_t *op = data;
235
236 crm_log_xml_debug(op->request, "Destroying");
237
238 clear_remote_op_timers(op);
239
240 free(op->id);
241 free(op->action);
242 free(op->delegate);
243 free(op->target);
244 free(op->client_id);
245 free(op->client_name);
246 free(op->originator);
247
248 if (op->query_results) {
249 g_list_free_full(op->query_results, free_remote_query);
250 }
251 if (op->request) {
252 free_xml(op->request);
253 op->request = NULL;
254 }
255 if (op->devices_list) {
256 g_list_free_full(op->devices_list, free);
257 op->devices_list = NULL;
258 }
259 g_list_free_full(op->automatic_list, free);
260 g_list_free(op->duplicates);
261
262 pcmk__reset_result(&op->result);
263 free(op);
264 }
265
266 void
267 init_stonith_remote_op_hash_table(GHashTable **table)
268 {
269 if (*table == NULL) {
270 *table = pcmk__strkey_table(NULL, free_remote_op);
271 }
272 }
273
274
275
276
277
278
279
280
281
282 static const char *
283 op_requested_action(const remote_fencing_op_t *op)
284 {
285 return ((op->phase > st_phase_requested)? "reboot" : op->action);
286 }
287
288
289
290
291
292
293
294 static void
295 op_phase_off(remote_fencing_op_t *op)
296 {
297 crm_info("Remapping multiple-device reboot targeting %s to 'off' "
298 CRM_XS " id=%.8s", op->target, op->id);
299 op->phase = st_phase_off;
300
301
302
303
304 strcpy(op->action, "off");
305 }
306
307
308
309
310
311
312
313 static void
314 op_phase_on(remote_fencing_op_t *op)
315 {
316 GList *iter = NULL;
317
318 crm_info("Remapped 'off' targeting %s complete, "
319 "remapping to 'on' for %s " CRM_XS " id=%.8s",
320 op->target, op->client_name, op->id);
321 op->phase = st_phase_on;
322 strcpy(op->action, "on");
323
324
325
326
327 for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
328 GList *match = g_list_find_custom(op->devices_list, iter->data,
329 sort_strings);
330
331 if (match) {
332 op->devices_list = g_list_remove(op->devices_list, match->data);
333 }
334 }
335 g_list_free_full(op->automatic_list, free);
336 op->automatic_list = NULL;
337
338
339 op->devices = op->devices_list;
340 }
341
342
343
344
345
346
347
348 static void
349 undo_op_remap(remote_fencing_op_t *op)
350 {
351 if (op->phase > 0) {
352 crm_info("Undoing remap of reboot targeting %s for %s "
353 CRM_XS " id=%.8s", op->target, op->client_name, op->id);
354 op->phase = st_phase_requested;
355 strcpy(op->action, "reboot");
356 }
357 }
358
359
360
361
362
363
364
365
366
367
368 static xmlNode *
369 fencing_result2xml(remote_fencing_op_t *op)
370 {
371 xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
372
373 crm_xml_add_int(notify_data, "state", op->state);
374 crm_xml_add(notify_data, F_STONITH_TARGET, op->target);
375 crm_xml_add(notify_data, F_STONITH_ACTION, op->action);
376 crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate);
377 crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id);
378 crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator);
379 crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id);
380 crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name);
381
382 stonith__xe_set_result(notify_data, &op->result);
383 return notify_data;
384 }
385
386
387
388
389
390
391
392
393 void
394 fenced_broadcast_op_result(remote_fencing_op_t *op, bool op_merged)
395 {
396 static int count = 0;
397 xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY);
398 xmlNode *notify_data = fencing_result2xml(op);
399
400 count++;
401 crm_trace("Broadcasting result to peers");
402 crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY);
403 crm_xml_add(bcast, F_SUBTYPE, "broadcast");
404 crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY);
405 crm_xml_add_int(bcast, "count", count);
406
407 if (op_merged) {
408 pcmk__xe_set_bool_attr(bcast, F_STONITH_MERGED, true);
409 }
410
411 add_message_xml(bcast, F_STONITH_CALLDATA, notify_data);
412 send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE);
413 free_xml(notify_data);
414 free_xml(bcast);
415
416 return;
417 }
418
419
420
421
422
423
424
425
426 static void
427 handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data)
428 {
429 xmlNode *notify_data = NULL;
430 xmlNode *reply = NULL;
431 pcmk__client_t *client = NULL;
432
433 if (op->notify_sent == TRUE) {
434
435 return;
436 }
437
438
439 crm_xml_add_int(data, "state", op->state);
440 crm_xml_add(data, F_STONITH_TARGET, op->target);
441 crm_xml_add(data, F_STONITH_OPERATION, op->action);
442
443 reply = fenced_construct_reply(op->request, data, &op->result);
444 crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate);
445
446
447 client = pcmk__find_client_by_id(op->client_id);
448 if (client == NULL) {
449 crm_trace("Skipping reply to %s: no longer a client", op->client_id);
450 } else {
451 do_local_reply(reply, client, op->call_options);
452 }
453
454
455 notify_data = fencing_result2xml(op);
456 fenced_send_notification(T_STONITH_NOTIFY_FENCE, &op->result, notify_data);
457 free_xml(notify_data);
458 fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
459
460
461 op->notify_sent = TRUE;
462 free_xml(reply);
463 }
464
465
466
467
468
469
470
471
472 static void
473 finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data)
474 {
475 for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) {
476 remote_fencing_op_t *other = iter->data;
477
478 if (other->state == st_duplicate) {
479 other->state = op->state;
480 crm_debug("Performing duplicate notification for %s@%s: %s "
481 CRM_XS " id=%.8s",
482 other->client_name, other->originator,
483 pcmk_exec_status_str(op->result.execution_status),
484 other->id);
485 pcmk__copy_result(&op->result, &other->result);
486 finalize_op(other, data, true);
487
488 } else {
489
490 crm_err("Skipping duplicate notification for %s@%s "
491 CRM_XS " state=%s id=%.8s",
492 other->client_name, other->originator,
493 stonith_op_state_str(other->state), other->id);
494 }
495 }
496 }
497
498 static char *
499 delegate_from_xml(xmlNode *xml)
500 {
501 xmlNode *match = get_xpath_object("//@" F_STONITH_DELEGATE, xml, LOG_NEVER);
502
503 if (match == NULL) {
504 return crm_element_value_copy(xml, F_ORIG);
505 } else {
506 return crm_element_value_copy(match, F_STONITH_DELEGATE);
507 }
508 }
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526 static void
527 finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup)
528 {
529 int level = LOG_ERR;
530 const char *subt = NULL;
531 xmlNode *local_data = NULL;
532 gboolean op_merged = FALSE;
533
534 CRM_CHECK((op != NULL), return);
535
536 if (op->notify_sent) {
537
538 crm_notice("Operation '%s'%s%s by %s for %s@%s%s: "
539 "Result arrived too late " CRM_XS " id=%.8s",
540 op->action, (op->target? " targeting " : ""),
541 (op->target? op->target : ""),
542 (op->delegate? op->delegate : "unknown node"),
543 op->client_name, op->originator,
544 (op_merged? " (merged)" : ""),
545 op->id);
546 return;
547 }
548
549 set_fencing_completed(op);
550 clear_remote_op_timers(op);
551 undo_op_remap(op);
552
553 if (data == NULL) {
554 data = create_xml_node(NULL, "remote-op");
555 local_data = data;
556
557 } else if (op->delegate == NULL) {
558 switch (op->result.execution_status) {
559 case PCMK_EXEC_NO_FENCE_DEVICE:
560 break;
561 case PCMK_EXEC_INVALID:
562 if (op->result.exit_status == CRM_EX_EXPIRED) {
563 break;
564 }
565
566 default:
567 op->delegate = delegate_from_xml(data);
568 break;
569 }
570 }
571
572 if (dup || (crm_element_value(data, F_STONITH_MERGED) != NULL)) {
573 op_merged = true;
574 }
575
576
577
578
579 subt = crm_element_value(data, F_SUBTYPE);
580 if (!dup && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) {
581
582 fenced_broadcast_op_result(op, op_merged);
583 free_xml(local_data);
584 return;
585 }
586
587 if (pcmk__result_ok(&op->result) || dup
588 || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
589 level = LOG_NOTICE;
590 }
591 do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) "
592 CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""),
593 (op->target? op->target : ""),
594 (op->delegate? op->delegate : "unknown node"),
595 op->client_name, op->originator,
596 (op_merged? " (merged)" : ""),
597 crm_exit_str(op->result.exit_status),
598 pcmk_exec_status_str(op->result.execution_status),
599 ((op->result.exit_reason == NULL)? "" : ": "),
600 ((op->result.exit_reason == NULL)? "" : op->result.exit_reason),
601 op->id);
602
603 handle_local_reply_and_notify(op, data);
604
605 if (!dup) {
606 finalize_op_duplicates(op, data);
607 }
608
609
610
611
612 if (op->query_results) {
613 g_list_free_full(op->query_results, free_remote_query);
614 op->query_results = NULL;
615 }
616 if (op->request) {
617 free_xml(op->request);
618 op->request = NULL;
619 }
620
621 free_xml(local_data);
622 }
623
624
625
626
627
628
629
630
631
632 static gboolean
633 remote_op_watchdog_done(gpointer userdata)
634 {
635 remote_fencing_op_t *op = userdata;
636
637 op->op_timer_one = 0;
638
639 crm_notice("Self-fencing (%s) by %s for %s assumed complete "
640 CRM_XS " id=%.8s",
641 op->action, op->target, op->client_name, op->id);
642 op->state = st_done;
643 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
644 finalize_op(op, NULL, false);
645 return G_SOURCE_REMOVE;
646 }
647
648 static gboolean
649 remote_op_timeout_one(gpointer userdata)
650 {
651 remote_fencing_op_t *op = userdata;
652
653 op->op_timer_one = 0;
654
655 crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS
656 " id=%.8s", op->action, op->target, op->client_name, op->id);
657 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
658 "Peer did not return fence result within timeout");
659
660
661 request_peer_fencing(op, NULL);
662 return FALSE;
663 }
664
665
666
667
668
669
670
671
672 static void
673 finalize_timed_out_op(remote_fencing_op_t *op, const char *reason)
674 {
675 op->op_timer_total = 0;
676
677 crm_debug("Action '%s' targeting %s for client %s timed out "
678 CRM_XS " id=%.8s",
679 op->action, op->target, op->client_name, op->id);
680
681 if (op->phase == st_phase_on) {
682
683
684
685
686 op->state = st_done;
687 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
688 } else {
689 op->state = st_failed;
690 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason);
691 }
692 finalize_op(op, NULL, false);
693 }
694
695
696
697
698
699
700
701
702
703 static gboolean
704 remote_op_timeout(gpointer userdata)
705 {
706 remote_fencing_op_t *op = userdata;
707
708 if (op->state == st_done) {
709 crm_debug("Action '%s' targeting %s for client %s already completed "
710 CRM_XS " id=%.8s",
711 op->action, op->target, op->client_name, op->id);
712 } else {
713 finalize_timed_out_op(userdata, "Fencing did not complete within a "
714 "total timeout based on the "
715 "configured timeout and retries for "
716 "any devices attempted");
717 }
718 return G_SOURCE_REMOVE;
719 }
720
721 static gboolean
722 remote_op_query_timeout(gpointer data)
723 {
724 remote_fencing_op_t *op = data;
725
726 op->query_timer = 0;
727 if (op->state == st_done) {
728 crm_debug("Operation %.8s targeting %s already completed",
729 op->id, op->target);
730 } else if (op->state == st_exec) {
731 crm_debug("Operation %.8s targeting %s already in progress",
732 op->id, op->target);
733 } else if (op->query_results) {
734
735 crm_debug("Query %.8s targeting %s complete (state=%s)",
736 op->id, op->target, stonith_op_state_str(op->state));
737 request_peer_fencing(op, NULL);
738 } else {
739 crm_debug("Query %.8s targeting %s timed out (state=%s)",
740 op->id, op->target, stonith_op_state_str(op->state));
741 if (op->op_timer_total) {
742 g_source_remove(op->op_timer_total);
743 op->op_timer_total = 0;
744 }
745 finalize_timed_out_op(op, "No capable peers replied to device query "
746 "within timeout");
747 }
748
749 return FALSE;
750 }
751
752 static gboolean
753 topology_is_empty(stonith_topology_t *tp)
754 {
755 int i;
756
757 if (tp == NULL) {
758 return TRUE;
759 }
760
761 for (i = 0; i < ST_LEVEL_MAX; i++) {
762 if (tp->levels[i] != NULL) {
763 return FALSE;
764 }
765 }
766 return TRUE;
767 }
768
769
770
771
772
773
774
775
776 static void
777 add_required_device(remote_fencing_op_t *op, const char *device)
778 {
779 GList *match = g_list_find_custom(op->automatic_list, device,
780 sort_strings);
781
782 if (!match) {
783 op->automatic_list = g_list_prepend(op->automatic_list, strdup(device));
784 }
785 }
786
787
788
789
790
791
792
793
794 static void
795 remove_required_device(remote_fencing_op_t *op, const char *device)
796 {
797 GList *match = g_list_find_custom(op->automatic_list, device,
798 sort_strings);
799
800 if (match) {
801 op->automatic_list = g_list_remove(op->automatic_list, match->data);
802 }
803 }
804
805
806 static void
807 set_op_device_list(remote_fencing_op_t * op, GList *devices)
808 {
809 GList *lpc = NULL;
810
811 if (op->devices_list) {
812 g_list_free_full(op->devices_list, free);
813 op->devices_list = NULL;
814 }
815 for (lpc = devices; lpc != NULL; lpc = lpc->next) {
816 op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
817 }
818 op->devices = op->devices_list;
819 }
820
821
822
823
824
825
826
827
828
829
830 static gboolean
831 topology_matches(const stonith_topology_t *tp, const char *node)
832 {
833 regex_t r_patt;
834
835 CRM_CHECK(node && tp && tp->target, return FALSE);
836 switch (tp->kind) {
837 case fenced_target_by_attribute:
838
839
840
841
842
843
844 if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
845 crm_notice("Matched %s with %s by attribute", node, tp->target);
846 return TRUE;
847 }
848 break;
849
850 case fenced_target_by_pattern:
851
852
853
854 if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
855 crm_info("Bad regex '%s' for fencing level", tp->target);
856 } else {
857 int status = regexec(&r_patt, node, 0, NULL, 0);
858
859 regfree(&r_patt);
860 if (status == 0) {
861 crm_notice("Matched %s with %s by name", node, tp->target);
862 return TRUE;
863 }
864 }
865 break;
866
867 case fenced_target_by_name:
868 crm_trace("Testing %s against %s", node, tp->target);
869 return pcmk__str_eq(tp->target, node, pcmk__str_casei);
870
871 default:
872 break;
873 }
874 crm_trace("No match for %s with %s", node, tp->target);
875 return FALSE;
876 }
877
878 stonith_topology_t *
879 find_topology_for_host(const char *host)
880 {
881 GHashTableIter tIter;
882 stonith_topology_t *tp = g_hash_table_lookup(topology, host);
883
884 if(tp != NULL) {
885 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
886 return tp;
887 }
888
889 g_hash_table_iter_init(&tIter, topology);
890 while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
891 if (topology_matches(tp, host)) {
892 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
893 return tp;
894 }
895 }
896
897 crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
898 return NULL;
899 }
900
901
902
903
904
905
906
907
908
909
910
911
912 static int
913 advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
914 {
915 stonith_topology_t *tp = NULL;
916
917 if (op->target) {
918 tp = find_topology_for_host(op->target);
919 }
920 if (topology_is_empty(tp)) {
921 return empty_ok? pcmk_rc_ok : ENODEV;
922 }
923
924 CRM_ASSERT(tp->levels != NULL);
925
926 stonith__set_call_options(op->call_options, op->id, st_opt_topology);
927
928
929 undo_op_remap(op);
930
931 do {
932 op->level++;
933
934 } while (op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL);
935
936 if (op->level < ST_LEVEL_MAX) {
937 crm_trace("Attempting fencing level %d targeting %s (%d devices) "
938 "for client %s@%s (id=%.8s)",
939 op->level, op->target, g_list_length(tp->levels[op->level]),
940 op->client_name, op->originator, op->id);
941 set_op_device_list(op, tp->levels[op->level]);
942
943
944 if (op->level > 1 && op->delay > 0) {
945 op->delay = 0;
946 }
947
948 if (g_list_next(op->devices_list) && pcmk__str_eq(op->action, "reboot", pcmk__str_casei)) {
949
950
951
952
953
954 op_phase_off(op);
955 }
956 return pcmk_rc_ok;
957 }
958
959 crm_notice("All fencing options targeting %s for client %s@%s failed "
960 CRM_XS " id=%.8s",
961 op->target, op->client_name, op->originator, op->id);
962 return ENODEV;
963 }
964
965
966
967
968
969
970 static void
971 merge_duplicates(remote_fencing_op_t * op)
972 {
973 GHashTableIter iter;
974 remote_fencing_op_t *other = NULL;
975
976 time_t now = time(NULL);
977
978 g_hash_table_iter_init(&iter, stonith_remote_op_list);
979 while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
980 const char *other_action = op_requested_action(other);
981
982 if (!strcmp(op->id, other->id)) {
983 continue;
984 }
985 if (other->state > st_exec) {
986 crm_trace("%.8s not duplicate of %.8s: not in progress",
987 op->id, other->id);
988 continue;
989 }
990 if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
991 crm_trace("%.8s not duplicate of %.8s: node %s vs. %s",
992 op->id, other->id, op->target, other->target);
993 continue;
994 }
995 if (!pcmk__str_eq(op->action, other_action, pcmk__str_casei)) {
996 crm_trace("%.8s not duplicate of %.8s: action %s vs. %s",
997 op->id, other->id, op->action, other_action);
998 continue;
999 }
1000 if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
1001 crm_trace("%.8s not duplicate of %.8s: same client %s",
1002 op->id, other->id, op->client_name);
1003 continue;
1004 }
1005 if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
1006 crm_trace("%.8s not duplicate of %.8s: suicide for %s",
1007 op->id, other->id, other->target);
1008 continue;
1009 }
1010 if (!fencing_peer_active(crm_get_peer(0, other->originator))) {
1011 crm_notice("Failing action '%s' targeting %s originating from "
1012 "client %s@%s: Originator is dead " CRM_XS " id=%.8s",
1013 other->action, other->target, other->client_name,
1014 other->originator, other->id);
1015 crm_trace("%.8s not duplicate of %.8s: originator dead",
1016 op->id, other->id);
1017 other->state = st_failed;
1018 continue;
1019 }
1020 if ((other->total_timeout > 0)
1021 && (now > (other->total_timeout + other->created))) {
1022 crm_trace("%.8s not duplicate of %.8s: old (%ld vs. %ld + %d)",
1023 op->id, other->id, now, other->created,
1024 other->total_timeout);
1025 continue;
1026 }
1027
1028
1029
1030
1031 other->duplicates = g_list_append(other->duplicates, op);
1032 if (other->total_timeout == 0) {
1033 other->total_timeout = op->total_timeout =
1034 TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
1035 crm_trace("Best guess as to timeout used for %.8s: %d",
1036 other->id, other->total_timeout);
1037 }
1038 crm_notice("Merging fencing action '%s' targeting %s originating from "
1039 "client %s with identical request from %s@%s "
1040 CRM_XS " original=%.8s duplicate=%.8s total_timeout=%ds",
1041 op->action, op->target, op->client_name,
1042 other->client_name, other->originator,
1043 op->id, other->id, other->total_timeout);
1044 report_timeout_period(op, other->total_timeout);
1045 op->state = st_duplicate;
1046 }
1047 }
1048
1049 static uint32_t fencing_active_peers(void)
1050 {
1051 uint32_t count = 0;
1052 crm_node_t *entry;
1053 GHashTableIter gIter;
1054
1055 g_hash_table_iter_init(&gIter, crm_peer_cache);
1056 while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
1057 if(fencing_peer_active(entry)) {
1058 count++;
1059 }
1060 }
1061 return count;
1062 }
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073 int
1074 fenced_handle_manual_confirmation(pcmk__client_t *client, xmlNode *msg)
1075 {
1076 remote_fencing_op_t *op = NULL;
1077 xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
1078
1079 CRM_CHECK(dev != NULL, return EPROTO);
1080
1081 crm_notice("Received manual confirmation that %s has been fenced",
1082 crm_str(crm_element_value(dev, F_STONITH_TARGET)));
1083 op = initiate_remote_stonith_op(client, msg, TRUE);
1084 if (op == NULL) {
1085 return EPROTO;
1086 }
1087 op->state = st_done;
1088 set_fencing_completed(op);
1089 op->delegate = strdup("a human");
1090
1091
1092 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1093 finalize_op(op, msg, false);
1094
1095
1096
1097
1098 return EINPROGRESS;
1099 }
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112 void *
1113 create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer)
1114 {
1115 remote_fencing_op_t *op = NULL;
1116 xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_NEVER);
1117 int call_options = 0;
1118 const char *operation = NULL;
1119
1120 init_stonith_remote_op_hash_table(&stonith_remote_op_list);
1121
1122
1123
1124 if (peer && dev) {
1125 const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
1126
1127 CRM_CHECK(op_id != NULL, return NULL);
1128
1129 op = g_hash_table_lookup(stonith_remote_op_list, op_id);
1130 if (op) {
1131 crm_debug("Reusing existing remote fencing op %.8s for %s",
1132 op_id, ((client == NULL)? "unknown client" : client));
1133 return op;
1134 }
1135 }
1136
1137 op = calloc(1, sizeof(remote_fencing_op_t));
1138 CRM_ASSERT(op != NULL);
1139
1140 crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout));
1141
1142 crm_element_value_int(request, F_STONITH_DELAY, &(op->delay));
1143
1144 if (peer && dev) {
1145 op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID);
1146 } else {
1147 op->id = crm_generate_uuid();
1148 }
1149
1150 g_hash_table_replace(stonith_remote_op_list, op->id, op);
1151
1152 op->state = st_query;
1153 op->replies_expected = fencing_active_peers();
1154 op->action = crm_element_value_copy(dev, F_STONITH_ACTION);
1155 op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN);
1156 op->delegate = crm_element_value_copy(dev, F_STONITH_DELEGATE);
1157 op->created = time(NULL);
1158
1159 if (op->originator == NULL) {
1160
1161 op->originator = strdup(stonith_our_uname);
1162 }
1163
1164 CRM_LOG_ASSERT(client != NULL);
1165 if (client) {
1166 op->client_id = strdup(client);
1167 }
1168
1169
1170
1171 operation = crm_element_value(request, F_STONITH_OPERATION);
1172
1173 if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1174 op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
1175 (unsigned long) getpid());
1176 } else {
1177 op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME);
1178 }
1179
1180 op->target = crm_element_value_copy(dev, F_STONITH_TARGET);
1181 op->request = copy_xml(request);
1182 crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
1183 op->call_options = call_options;
1184
1185 crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid));
1186
1187 crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
1188 "base timeout %d, %u %s expected)",
1189 (peer && dev)? "Recorded" : "Generated", op->id, op->action,
1190 op->target, op->client_name, op->base_timeout,
1191 op->replies_expected,
1192 pcmk__plural_alt(op->replies_expected, "reply", "replies"));
1193
1194 if (op->call_options & st_opt_cs_nodeid) {
1195 int nodeid;
1196 crm_node_t *node;
1197
1198 pcmk__scan_min_int(op->target, &nodeid, 0);
1199 node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY);
1200
1201
1202 stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);
1203
1204 if (node && node->uname) {
1205 free(op->target);
1206 op->target = strdup(node->uname);
1207
1208 } else {
1209 crm_warn("Could not expand nodeid '%s' into a host name", op->target);
1210 }
1211 }
1212
1213
1214 merge_duplicates(op);
1215
1216 if (op->state != st_duplicate) {
1217
1218 fenced_send_notification(T_STONITH_NOTIFY_HISTORY, NULL, NULL);
1219 }
1220
1221
1222 stonith_fence_history_trim();
1223
1224 return op;
1225 }
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237 remote_fencing_op_t *
1238 initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request,
1239 gboolean manual_ack)
1240 {
1241 int query_timeout = 0;
1242 xmlNode *query = NULL;
1243 const char *client_id = NULL;
1244 remote_fencing_op_t *op = NULL;
1245 const char *relay_op_id = NULL;
1246 const char *operation = NULL;
1247
1248 if (client) {
1249 client_id = client->id;
1250 } else {
1251 client_id = crm_element_value(request, F_STONITH_CLIENTID);
1252 }
1253
1254 CRM_LOG_ASSERT(client_id != NULL);
1255 op = create_remote_stonith_op(client_id, request, FALSE);
1256 op->owner = TRUE;
1257 if (manual_ack) {
1258 return op;
1259 }
1260
1261 CRM_CHECK(op->action, return NULL);
1262
1263 if (advance_topology_level(op, true) != pcmk_rc_ok) {
1264 op->state = st_failed;
1265 }
1266
1267 switch (op->state) {
1268 case st_failed:
1269
1270 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR,
1271 "All topology levels failed");
1272 crm_warn("Could not request peer fencing (%s) targeting %s "
1273 CRM_XS " id=%.8s", op->action, op->target, op->id);
1274 finalize_op(op, NULL, false);
1275 return op;
1276
1277 case st_duplicate:
1278 crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
1279 CRM_XS " id=%.8s", op->action, op->target, op->id);
1280 return op;
1281
1282 default:
1283 crm_notice("Requesting peer fencing (%s) targeting %s "
1284 CRM_XS " id=%.8s state=%s base_timeout=%d",
1285 op->action, op->target, op->id,
1286 stonith_op_state_str(op->state), op->base_timeout);
1287 }
1288
1289 query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
1290 NULL, op->call_options);
1291
1292 crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
1293 crm_xml_add(query, F_STONITH_TARGET, op->target);
1294 crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
1295 crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
1296 crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
1297 crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
1298 crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);
1299
1300
1301 operation = crm_element_value(request, F_STONITH_OPERATION);
1302 if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1303 relay_op_id = crm_element_value(request, F_STONITH_REMOTE_OP_ID);
1304 if (relay_op_id) {
1305 crm_xml_add(query, F_STONITH_REMOTE_OP_ID_RELAY, relay_op_id);
1306 }
1307 }
1308
1309 send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
1310 free_xml(query);
1311
1312 query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
1313 op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);
1314
1315 return op;
1316 }
1317
/*!
 * \internal
 * \brief Bit flags narrowing which query replies find_best_peer() may return
 */
enum find_best_peer_options {
    /* Skip any peer whose host name matches the fencing target */
    FIND_PEER_SKIP_TARGET   = (1 << 0),

    /* Consider only the peer whose host name matches the fencing target */
    FIND_PEER_TARGET_ONLY   = (1 << 1),

    /* Forwarded to the device lookups as their "verified only" argument */
    FIND_PEER_VERIFIED_ONLY = (1 << 2),
};
1326
1327 static peer_device_info_t *
1328 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
1329 {
1330 GList *iter = NULL;
1331 gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1332
1333 if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
1334 return NULL;
1335 }
1336
1337 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1338 peer_device_info_t *peer = iter->data;
1339
1340 crm_trace("Testing result from %s targeting %s with %d device%s: %d %x",
1341 peer->host, op->target, peer->ndevices,
1342 pcmk__plural_s(peer->ndevices), peer->tried, options);
1343 if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1344 continue;
1345 }
1346 if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1347 continue;
1348 }
1349
1350 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1351
1352 if (grab_peer_device(op, peer, device, verified_devices_only)) {
1353 return peer;
1354 }
1355
1356 } else if ((peer->tried == FALSE)
1357 && count_peer_devices(op, peer, verified_devices_only)) {
1358
1359
1360 crm_trace("Simple fencing");
1361 return peer;
1362 }
1363 }
1364
1365 return NULL;
1366 }
1367
1368 static peer_device_info_t *
1369 stonith_choose_peer(remote_fencing_op_t * op)
1370 {
1371 const char *device = NULL;
1372 peer_device_info_t *peer = NULL;
1373 uint32_t active = fencing_active_peers();
1374
1375 do {
1376 if (op->devices) {
1377 device = op->devices->data;
1378 crm_trace("Checking for someone to fence (%s) %s using %s",
1379 op->action, op->target, device);
1380 } else {
1381 crm_trace("Checking for someone to fence (%s) %s",
1382 op->action, op->target);
1383 }
1384
1385
1386 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
1387 if (peer) {
1388 crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
1389 return peer;
1390 }
1391
1392 if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
1393 crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
1394 return NULL;
1395 }
1396
1397
1398 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
1399 if (peer) {
1400 crm_trace("Found best unverified peer %s", peer->host);
1401 return peer;
1402 }
1403
1404
1405
1406
1407 if (op->phase != st_phase_on) {
1408 peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
1409 if (peer) {
1410 crm_trace("%s will fence itself", peer->host);
1411 return peer;
1412 }
1413 }
1414
1415
1416
1417
1418 } while ((op->phase != st_phase_on)
1419 && pcmk_is_set(op->call_options, st_opt_topology)
1420 && (advance_topology_level(op, false) == pcmk_rc_ok));
1421
1422 crm_notice("Couldn't find anyone to fence (%s) %s using %s",
1423 op->action, op->target, (device? device : "any device"));
1424 return NULL;
1425 }
1426
1427 static int
1428 get_device_timeout(const remote_fencing_op_t *op,
1429 const peer_device_info_t *peer, const char *device)
1430 {
1431 device_properties_t *props;
1432
1433 if (!peer || !device) {
1434 return op->base_timeout;
1435 }
1436
1437 props = g_hash_table_lookup(peer->devices, device);
1438 if (!props) {
1439 return op->base_timeout;
1440 }
1441
1442 return (props->custom_action_timeout[op->phase]?
1443 props->custom_action_timeout[op->phase] : op->base_timeout)
1444 + props->delay_max[op->phase];
1445 }
1446
/* Accumulator passed as user data to add_device_timeout() */
struct timeout_data {
    const remote_fencing_op_t *op;      // operation whose phase selects per-device values
    const peer_device_info_t *peer;     // peer whose devices are being summed
    int total_timeout;                  // running sum of device timeouts
};
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461 static void
1462 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
1463 {
1464 const char *device_id = key;
1465 device_properties_t *props = value;
1466 struct timeout_data *timeout = user_data;
1467
1468 if (!props->executed[timeout->op->phase]
1469 && !props->disallowed[timeout->op->phase]) {
1470 timeout->total_timeout += get_device_timeout(timeout->op,
1471 timeout->peer, device_id);
1472 }
1473 }
1474
1475 static int
1476 get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer)
1477 {
1478 struct timeout_data timeout;
1479
1480 timeout.op = op;
1481 timeout.peer = peer;
1482 timeout.total_timeout = 0;
1483
1484 g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1485
1486 return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1487 }
1488
1489 static int
1490 get_op_total_timeout(const remote_fencing_op_t *op,
1491 const peer_device_info_t *chosen_peer)
1492 {
1493 int total_timeout = 0;
1494 stonith_topology_t *tp = find_topology_for_host(op->target);
1495
1496 if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
1497 int i;
1498 GList *device_list = NULL;
1499 GList *iter = NULL;
1500
1501
1502
1503
1504
1505
1506
1507
1508 for (i = 0; i < ST_LEVEL_MAX; i++) {
1509 if (!tp->levels[i]) {
1510 continue;
1511 }
1512 for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1513 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1514 const peer_device_info_t *peer = iter->data;
1515
1516 if (find_peer_device(op, peer, device_list->data)) {
1517 total_timeout += get_device_timeout(op, peer,
1518 device_list->data);
1519 break;
1520 }
1521 }
1522 }
1523 }
1524
1525 } else if (chosen_peer) {
1526 total_timeout = get_peer_timeout(op, chosen_peer);
1527 } else {
1528 total_timeout = op->base_timeout;
1529 }
1530
1531 return total_timeout ? total_timeout : op->base_timeout;
1532 }
1533
1534 static void
1535 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
1536 {
1537 GList *iter = NULL;
1538 xmlNode *update = NULL;
1539 const char *client_node = NULL;
1540 const char *client_id = NULL;
1541 const char *call_id = NULL;
1542
1543 if (op->call_options & st_opt_sync_call) {
1544
1545
1546
1547
1548 return;
1549 } else if (!op->request) {
1550 return;
1551 }
1552
1553 crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
1554 client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE);
1555 call_id = crm_element_value(op->request, F_STONITH_CALLID);
1556 client_id = crm_element_value(op->request, F_STONITH_CLIENTID);
1557 if (!client_node || !call_id || !client_id) {
1558 return;
1559 }
1560
1561 if (pcmk__str_eq(client_node, stonith_our_uname, pcmk__str_casei)) {
1562
1563 do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1564 return;
1565 }
1566
1567
1568 update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1569 crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id);
1570 crm_xml_add(update, F_STONITH_CLIENTID, client_id);
1571 crm_xml_add(update, F_STONITH_CALLID, call_id);
1572 crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout);
1573
1574 send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE);
1575
1576 free_xml(update);
1577
1578 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1579 remote_fencing_op_t *dup = iter->data;
1580
1581 crm_trace("Reporting timeout for duplicate %.8s to client %s",
1582 dup->id, dup->client_name);
1583 report_timeout_period(iter->data, op_timeout);
1584 }
1585 }
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595 static void
1596 advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
1597 xmlNode *msg)
1598 {
1599
1600 if (op->devices) {
1601 op->devices = op->devices->next;
1602 }
1603
1604
1605 if ((op->phase == st_phase_requested) && pcmk__str_eq(op->action, "on", pcmk__str_casei)) {
1606
1607 remove_required_device(op, device);
1608
1609
1610
1611
1612 if (op->devices == NULL) {
1613 op->devices = op->automatic_list;
1614 }
1615 }
1616
1617 if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1618
1619
1620
1621
1622 op_phase_on(op);
1623 }
1624
1625
1626 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1627
1628 if (op->devices) {
1629
1630 crm_trace("Next targeting %s on behalf of %s@%s",
1631 op->target, op->client_name, op->originator);
1632
1633
1634 if (op->delay > 0) {
1635 op->delay = 0;
1636 }
1637
1638 request_peer_fencing(op, NULL);
1639 } else {
1640
1641 crm_trace("Marking complex fencing op targeting %s as complete",
1642 op->target);
1643 op->state = st_done;
1644 finalize_op(op, msg, false);
1645 }
1646 }
1647
1648 static gboolean
1649 check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
1650 {
1651 if (node_does_watchdog_fencing(op->target)) {
1652
1653 crm_notice("Waiting %lds for %s to self-fence (%s) for "
1654 "client %s " CRM_XS " id=%.8s",
1655 (stonith_watchdog_timeout_ms / 1000),
1656 op->target, op->action, op->client_name, op->id);
1657 op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms,
1658 remote_op_watchdog_done, op);
1659 return TRUE;
1660 } else {
1661 crm_debug("Skipping fallback to watchdog-fencing as %s is "
1662 "not in host-list", op->target);
1663 }
1664 return FALSE;
1665 }
1666
1667
1668
1669
1670
1671
1672
1673
1674
/*!
 * \internal
 * \brief Ask a peer to execute a fencing operation, or handle the lack of one
 *
 * \param[in,out] op    Fencing operation to execute (nothing is done if NULL)
 * \param[in,out] peer  Peer to ask, or NULL to have one chosen; when topology
 *                      is in use and devices remain, the peer is always
 *                      re-chosen via stonith_choose_peer()
 */
static void
request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
{
    const char *device = NULL;
    int timeout;

    CRM_CHECK(op != NULL, return);

    crm_trace("Action %.8s targeting %s for %s is %s",
              op->id, op->target, op->client_name,
              stonith_op_state_str(op->state));
    timeout = op->base_timeout;
    if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
        peer = stonith_choose_peer(op);
    }

    // The total timer is started only once per operation
    if (!op->op_timer_total) {
        op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, peer);
        op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
        report_timeout_period(op, op->total_timeout);
        crm_info("Total timeout set to %d for peer's fencing targeting %s for %s"
                 CRM_XS "id=%.8s",
                 op->total_timeout, op->target, op->client_name, op->id);
    }

    if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
        /* With topology, any peer passed in is replaced by a fresh choice.
         * The device is read after stonith_choose_peer() returns, and the
         * request timeout becomes that device's timeout.
         */
        peer = stonith_choose_peer(op);

        device = op->devices->data;
        timeout = get_device_timeout(op, peer, device);
    }

    if (peer) {
        int timeout_one = 0;
        xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);

        crm_xml_add(remote_op, F_STONITH_REMOTE_OP_ID, op->id);
        crm_xml_add(remote_op, F_STONITH_TARGET, op->target);
        crm_xml_add(remote_op, F_STONITH_ACTION, op->action);
        crm_xml_add(remote_op, F_STONITH_ORIGIN, op->originator);
        crm_xml_add(remote_op, F_STONITH_CLIENTID, op->client_id);
        crm_xml_add(remote_op, F_STONITH_CLIENTNAME, op->client_name);
        crm_xml_add_int(remote_op, F_STONITH_TIMEOUT, timeout);
        crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
        crm_xml_add_int(remote_op, F_STONITH_DELAY, op->delay);

        if (device) {
            // A specific device was selected: time just that device
            timeout_one = TIMEOUT_MULTIPLY_FACTOR *
                          get_device_timeout(op, peer, device);
            crm_notice("Requesting that %s perform '%s' action targeting %s "
                       "using %s " CRM_XS " for client %s (%ds)",
                       peer->host, op->action, op->target, device,
                       op->client_name, timeout_one);
            crm_xml_add(remote_op, F_STONITH_DEVICE, device);

        } else {
            // No specific device: time all of the peer's usable devices
            timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
            crm_notice("Requesting that %s perform '%s' action targeting %s "
                       CRM_XS " for client %s (%ds, %lds)",
                       peer->host, op->action, op->target, op->client_name,
                       timeout_one, stonith_watchdog_timeout_ms);
        }

        op->state = st_exec;
        if (op->op_timer_one) {
            g_source_remove(op->op_timer_one);
        }

        /* The per-request timer is started here unless all of these hold: a
         * watchdog timeout is configured; the device is the watchdog device,
         * or the chosen peer is the target and the action is not "on"; and
         * check_watchdog_fencing_and_wait() started its own timer.
         */
        if (!(stonith_watchdog_timeout_ms > 0 && (
                (pcmk__str_eq(device, STONITH_WATCHDOG_ID,
                                        pcmk__str_none)) ||
                (pcmk__str_eq(peer->host, op->target, pcmk__str_casei)
                    && !pcmk__str_eq(op->action, "on", pcmk__str_casei))) &&
             check_watchdog_fencing_and_wait(op))) {

            op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
        }

        send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE);
        peer->tried = TRUE;
        free_xml(remote_op);
        return;

    } else if (op->phase == st_phase_on) {
        /* No peer in the "on" phase: log a warning and move on to the next
         * device instead of failing the operation.
         */
        crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
                 "after successful 'off'", device, op->target);
        advance_topology_device_in_level(op, device, NULL);
        return;

    } else if (op->owner == FALSE) {
        crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
                op->action, op->target, op->client_name);

    } else if (op->query_timer == 0) {
        // No query timer is set and no peer was found: finalize as timed out
        crm_info("No remaining peers capable of fencing (%s) %s for client %s "
                 CRM_XS " state=%s", op->action, op->target, op->client_name,
                 stonith_op_state_str(op->state));
        CRM_CHECK(op->state < st_done, return);
        finalize_timed_out_op(op, "All nodes failed, or are unable, to "
                                  "fence target");

    } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
        /* All expected (or all active) peers have replied. With a watchdog
         * timeout configured and the device either unset or the watchdog
         * device, try waiting for the target to self-fence first.
         */
        if(stonith_watchdog_timeout_ms > 0 && pcmk__str_eq(device,
           STONITH_WATCHDOG_ID, pcmk__str_null_matches)) {
            if (check_watchdog_fencing_and_wait(op)) {
                return;
            }
        }

        if (op->state == st_query) {
            crm_info("No peers (out of %d) have devices capable of fencing "
                     "(%s) %s for client %s " CRM_XS " state=%s",
                     op->replies, op->action, op->target, op->client_name,
                     stonith_op_state_str(op->state));

            pcmk__reset_result(&op->result);
            pcmk__set_result(&op->result, CRM_EX_ERROR,
                             PCMK_EXEC_NO_FENCE_DEVICE, NULL);
        } else {
            // Outside the query state, the result is replaced only with topology
            if (pcmk_is_set(op->call_options, st_opt_topology)) {
                pcmk__reset_result(&op->result);
                pcmk__set_result(&op->result, CRM_EX_ERROR,
                                 PCMK_EXEC_NO_FENCE_DEVICE, NULL);
            }

            crm_info("No peers (out of %d) are capable of fencing (%s) %s "
                     "for client %s " CRM_XS " state=%s",
                     op->replies, op->action, op->target, op->client_name,
                     stonith_op_state_str(op->state));
        }

        op->state = st_failed;
        finalize_op(op, NULL, false);

    } else {
        // Replies are still outstanding: nothing to do yet
        crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
                 "for client %s " CRM_XS " id=%.8s",
                 op->action, op->target, (device? " using " : ""),
                 (device? device : ""), op->client_name, op->id);
    }
}
1855
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867 static gint
1868 sort_peers(gconstpointer a, gconstpointer b)
1869 {
1870 const peer_device_info_t *peer_a = a;
1871 const peer_device_info_t *peer_b = b;
1872
1873 return (peer_b->ndevices - peer_a->ndevices);
1874 }
1875
1876
1877
1878
1879
1880 static gboolean
1881 all_topology_devices_found(remote_fencing_op_t * op)
1882 {
1883 GList *device = NULL;
1884 GList *iter = NULL;
1885 device_properties_t *match = NULL;
1886 stonith_topology_t *tp = NULL;
1887 gboolean skip_target = FALSE;
1888 int i;
1889
1890 tp = find_topology_for_host(op->target);
1891 if (!tp) {
1892 return FALSE;
1893 }
1894 if (pcmk__is_fencing_action(op->action)) {
1895
1896
1897 skip_target = TRUE;
1898 }
1899
1900 for (i = 0; i < ST_LEVEL_MAX; i++) {
1901 for (device = tp->levels[i]; device; device = device->next) {
1902 match = NULL;
1903 for (iter = op->query_results; iter && !match; iter = iter->next) {
1904 peer_device_info_t *peer = iter->data;
1905
1906 if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1907 continue;
1908 }
1909 match = find_peer_device(op, peer, device->data);
1910 }
1911 if (!match) {
1912 return FALSE;
1913 }
1914 }
1915 }
1916
1917 return TRUE;
1918 }
1919
1920
1921
1922
1923
1924
1925
1926
1927
1928
1929
1930
1931 static void
1932 parse_action_specific(xmlNode *xml, const char *peer, const char *device,
1933 const char *action, remote_fencing_op_t *op,
1934 enum st_remap_phase phase, device_properties_t *props)
1935 {
1936 props->custom_action_timeout[phase] = 0;
1937 crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
1938 &props->custom_action_timeout[phase]);
1939 if (props->custom_action_timeout[phase]) {
1940 crm_trace("Peer %s with device %s returned %s action timeout %d",
1941 peer, device, action, props->custom_action_timeout[phase]);
1942 }
1943
1944 props->delay_max[phase] = 0;
1945 crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
1946 if (props->delay_max[phase]) {
1947 crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
1948 peer, device, props->delay_max[phase], action);
1949 }
1950
1951 props->delay_base[phase] = 0;
1952 crm_element_value_int(xml, F_STONITH_DELAY_BASE, &props->delay_base[phase]);
1953 if (props->delay_base[phase]) {
1954 crm_trace("Peer %s with device %s returned base delay %d for %s",
1955 peer, device, props->delay_base[phase], action);
1956 }
1957
1958
1959 if (pcmk__str_eq(action, "on", pcmk__str_casei)) {
1960 int required = 0;
1961
1962 crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
1963 if (required) {
1964 crm_trace("Peer %s requires device %s to execute for action %s",
1965 peer, device, action);
1966 add_required_device(op, device);
1967 }
1968 }
1969
1970
1971
1972
1973 if (pcmk__xe_attr_is_true(xml, F_STONITH_ACTION_DISALLOWED)) {
1974 props->disallowed[phase] = TRUE;
1975 crm_trace("Peer %s is disallowed from executing %s for device %s",
1976 peer, action, device);
1977 }
1978 }
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989 static void
1990 add_device_properties(xmlNode *xml, remote_fencing_op_t *op,
1991 peer_device_info_t *peer, const char *device)
1992 {
1993 xmlNode *child;
1994 int verified = 0;
1995 device_properties_t *props = calloc(1, sizeof(device_properties_t));
1996
1997
1998 CRM_ASSERT(props != NULL);
1999 g_hash_table_insert(peer->devices, strdup(device), props);
2000
2001
2002 crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
2003 if (verified) {
2004 crm_trace("Peer %s has confirmed a verified device %s",
2005 peer->host, device);
2006 props->verified = TRUE;
2007 }
2008
2009
2010 parse_action_specific(xml, peer->host, device, op_requested_action(op),
2011 op, st_phase_requested, props);
2012 for (child = pcmk__xml_first_child(xml); child != NULL;
2013 child = pcmk__xml_next(child)) {
2014
2015
2016
2017
2018 if (pcmk__str_eq(ID(child), "off", pcmk__str_casei)) {
2019 parse_action_specific(child, peer->host, device, "off",
2020 op, st_phase_off, props);
2021 } else if (pcmk__str_eq(ID(child), "on", pcmk__str_casei)) {
2022 parse_action_specific(child, peer->host, device, "on",
2023 op, st_phase_on, props);
2024 }
2025 }
2026 }
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039 static peer_device_info_t *
2040 add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml)
2041 {
2042 peer_device_info_t *peer = calloc(1, sizeof(peer_device_info_t));
2043 xmlNode *child;
2044
2045
2046
2047 CRM_CHECK(peer != NULL, return NULL);
2048 peer->host = strdup(host);
2049 peer->devices = pcmk__strkey_table(free, free);
2050
2051
2052 for (child = pcmk__xml_first_child(xml); child != NULL;
2053 child = pcmk__xml_next(child)) {
2054 const char *device = ID(child);
2055
2056 if (device) {
2057 add_device_properties(child, op, peer, device);
2058 }
2059 }
2060
2061 peer->ndevices = g_hash_table_size(peer->devices);
2062 CRM_CHECK(ndevices == peer->ndevices,
2063 crm_err("Query claimed to have %d device%s but %d found",
2064 ndevices, pcmk__plural_s(ndevices), peer->ndevices));
2065
2066 op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers);
2067 return peer;
2068 }
2069
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
/*!
 * \brief Process a peer's reply to a fencing query
 *
 * Records the reply against the matching fencing operation and, depending on
 * the operation's state and on how many replies have arrived, may start
 * requesting fencing from a peer.
 *
 * \param[in] msg  Query reply message
 *
 * \return pcmk_ok once the reply has been handled, -EPROTO if the message
 *         lacks the operation ID or the available-devices attribute, or
 *         -EOPNOTSUPP if no matching operation is known
 */
int
process_remote_stonith_query(xmlNode * msg)
{
    int ndevices = 0;
    gboolean host_is_target = FALSE;
    gboolean have_all_replies = FALSE;
    const char *id = NULL;
    const char *host = NULL;
    remote_fencing_op_t *op = NULL;
    peer_device_info_t *peer = NULL;
    uint32_t replies_expected;
    xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);

    CRM_CHECK(dev != NULL, return -EPROTO);

    id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
    CRM_CHECK(id != NULL, return -EPROTO);

    dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
    CRM_CHECK(dev != NULL, return -EPROTO);
    crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);

    op = g_hash_table_lookup(stonith_remote_op_list, id);
    if (op == NULL) {
        crm_debug("Received query reply for unknown or expired operation %s",
                  id);
        return -EOPNOTSUPP;
    }

    // Expect the smaller of the active peer count and the op's own expectation
    replies_expected = fencing_active_peers();
    if (op->replies_expected < replies_expected) {
        replies_expected = op->replies_expected;
    }
    // The reply is counted even if the operation is no longer in the query state
    if ((++op->replies >= replies_expected) && (op->state == st_query)) {
        have_all_replies = TRUE;
    }
    host = crm_element_value(msg, F_ORIG);
    host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);

    crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
             op->replies, replies_expected, host,
             op->target, op->action, ndevices, pcmk__plural_s(ndevices), id);
    // Replies advertising no devices are counted but not stored
    if (ndevices > 0) {
        peer = add_result(op, host, ndevices, dev);
    }

    pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);

    if (pcmk_is_set(op->call_options, st_opt_topology)) {
        /* With topology, start as soon as every topology device has been
         * located, or otherwise once all replies are in.
         */
        if (op->state == st_query && all_topology_devices_found(op)) {
            crm_trace("All topology devices found");
            request_peer_fencing(op, peer);

        } else if (have_all_replies) {
            crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
                     replies_expected, op->replies);
            request_peer_fencing(op, NULL);
        }

    } else if (op->state == st_query) {
        int nverified = count_peer_devices(op, peer, TRUE);

        /* Without topology, start right away if a peer other than the target
         * reported a verified device, or otherwise once all replies are in.
         */
        if ((peer != NULL) && !host_is_target && nverified) {
            crm_trace("Found %d verified device%s",
                      nverified, pcmk__plural_s(nverified));
            request_peer_fencing(op, peer);

        } else if (have_all_replies) {
            crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
                     replies_expected, op->replies);
            request_peer_fencing(op, NULL);

        } else {
            crm_trace("Waiting for more peer results before launching fencing operation");
        }

    } else if ((peer != NULL) && (op->state == st_done)) {
        // Only logged; the stored result is left in place
        crm_info("Discarding query result from %s (%d device%s): "
                 "Operation is %s", peer->host,
                 peer->ndevices, pcmk__plural_s(peer->ndevices),
                 stonith_op_state_str(op->state));
    }

    return pcmk_ok;
}
2176
2177
2178
2179
2180
2181
2182
2183
2184
2185
2186 void
2187 fenced_process_fencing_reply(xmlNode *msg)
2188 {
2189 const char *id = NULL;
2190 const char *device = NULL;
2191 remote_fencing_op_t *op = NULL;
2192 xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
2193 pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
2194
2195 CRM_CHECK(dev != NULL, return);
2196
2197 id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
2198 CRM_CHECK(id != NULL, return);
2199
2200 dev = stonith__find_xe_with_result(msg);
2201 CRM_CHECK(dev != NULL, return);
2202
2203 stonith__xe_get_result(dev, &result);
2204
2205 device = crm_element_value(dev, F_STONITH_DEVICE);
2206
2207 if (stonith_remote_op_list) {
2208 op = g_hash_table_lookup(stonith_remote_op_list, id);
2209 }
2210
2211 if ((op == NULL) && pcmk__result_ok(&result)) {
2212
2213 const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);
2214
2215 op = create_remote_stonith_op(client_id, dev, TRUE);
2216 }
2217
2218 if (op == NULL) {
2219
2220
2221 crm_info("Received peer result of unknown or expired operation %s", id);
2222 pcmk__reset_result(&result);
2223 return;
2224 }
2225
2226 pcmk__reset_result(&op->result);
2227 op->result = result;
2228
2229 if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
2230 crm_err("Received outdated reply for device %s (instead of %s) to "
2231 "fence (%s) %s. Operation already timed out at peer level.",
2232 device, (const char *) op->devices->data, op->action, op->target);
2233 return;
2234 }
2235
2236 if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) {
2237 if (pcmk__result_ok(&op->result)) {
2238 op->state = st_done;
2239 } else {
2240 op->state = st_failed;
2241 }
2242 finalize_op(op, msg, false);
2243 return;
2244
2245 } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
2246
2247
2248 crm_err("Received non-broadcast fencing result for operation %.8s "
2249 "we do not own (device %s targeting %s)",
2250 op->id, device, op->target);
2251 return;
2252 }
2253
2254 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2255 const char *device = NULL;
2256 const char *reason = op->result.exit_reason;
2257
2258
2259
2260 if (op->state == st_done) {
2261 finalize_op(op, msg, false);
2262 return;
2263 }
2264
2265 device = crm_element_value(msg, F_STONITH_DEVICE);
2266
2267 if ((op->phase == 2) && !pcmk__result_ok(&op->result)) {
2268
2269
2270
2271 crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s "
2272 "after successful 'off'",
2273 device, pcmk_exec_status_str(op->result.execution_status),
2274 (reason == NULL)? "" : ": ",
2275 (reason == NULL)? "" : reason,
2276 op->target);
2277 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2278 } else {
2279 crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: "
2280 "%s%s%s%s",
2281 op->action, op->target, device, op->client_name,
2282 op->originator,
2283 pcmk_exec_status_str(op->result.execution_status),
2284 (reason == NULL)? "" : " (",
2285 (reason == NULL)? "" : reason,
2286 (reason == NULL)? "" : ")");
2287 }
2288
2289 if (pcmk__result_ok(&op->result)) {
2290
2291
2292 advance_topology_device_in_level(op, device, msg);
2293 return;
2294 } else {
2295
2296
2297 if (advance_topology_level(op, false) != pcmk_rc_ok) {
2298 op->state = st_failed;
2299 finalize_op(op, msg, false);
2300 return;
2301 }
2302 }
2303
2304 } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) {
2305 op->state = st_done;
2306 finalize_op(op, msg, false);
2307 return;
2308
2309 } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT)
2310 && (op->devices == NULL)) {
2311
2312 op->state = st_failed;
2313 finalize_op(op, msg, false);
2314 return;
2315
2316 } else {
2317
2318 }
2319
2320
2321 crm_trace("Next for %s on behalf of %s@%s (result was: %s)",
2322 op->target, op->originator, op->client_name,
2323 pcmk_exec_status_str(op->result.execution_status));
2324 request_peer_fencing(op, NULL);
2325 }
2326
2327 gboolean
2328 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
2329 {
2330 GHashTableIter iter;
2331 time_t now = time(NULL);
2332 remote_fencing_op_t *rop = NULL;
2333
2334 if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
2335 action == NULL) {
2336 return FALSE;
2337 }
2338
2339 g_hash_table_iter_init(&iter, stonith_remote_op_list);
2340 while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2341 if (strcmp(rop->target, target) != 0) {
2342 continue;
2343 } else if (rop->state != st_done) {
2344 continue;
2345
2346
2347
2348 } else if (strcmp(rop->action, action) != 0) {
2349 continue;
2350 } else if ((rop->completed + tolerance) < now) {
2351 continue;
2352 }
2353
2354 crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2355 target, action, tolerance, rop->delegate, rop->originator);
2356 return TRUE;
2357 }
2358 return FALSE;
2359 }