This source file includes the following definitions.
- sort_strings
- free_remote_query
- free_stonith_remote_op_list
- count_peer_device
- count_peer_devices
- find_peer_device
- grab_peer_device
- clear_remote_op_timers
- free_remote_op
- init_stonith_remote_op_hash_table
- op_requested_action
- op_phase_off
- op_phase_on
- undo_op_remap
- fencing_result2xml
- fenced_broadcast_op_result
- handle_local_reply_and_notify
- finalize_op_duplicates
- delegate_from_xml
- finalize_op
- remote_op_watchdog_done
- remote_op_timeout_one
- finalize_timed_out_op
- remote_op_timeout
- remote_op_query_timeout
- topology_is_empty
- add_required_device
- remove_required_device
- set_op_device_list
- topology_matches
- find_topology_for_host
- advance_topology_level
- merge_duplicates
- fencing_active_peers
- fenced_handle_manual_confirmation
- create_remote_stonith_op
- initiate_remote_stonith_op
- is_watchdog_fencing
- find_best_peer
- stonith_choose_peer
- valid_fencing_timeout
- get_device_timeout
- add_device_timeout
- get_peer_timeout
- get_op_total_timeout
- report_timeout_period
- advance_topology_device_in_level
- check_watchdog_fencing_and_wait
- request_peer_fencing
- sort_peers
- all_topology_devices_found
- parse_action_specific
- add_device_properties
- add_result
- process_remote_stonith_query
- fenced_process_fencing_reply
- stonith_check_fence_tolerance
1
2
3
4
5
6
7
8
9
10 #include <crm_internal.h>
11
12 #include <sys/param.h>
13 #include <stdio.h>
14 #include <sys/types.h>
15 #include <sys/wait.h>
16 #include <sys/stat.h>
17 #include <unistd.h>
18 #include <sys/utsname.h>
19
20 #include <stdlib.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <ctype.h>
24 #include <regex.h>
25
26 #include <crm/crm.h>
27 #include <crm/common/ipc.h>
28 #include <crm/common/ipc_internal.h>
29 #include <crm/cluster/internal.h>
30
31 #include <crm/stonith-ng.h>
32 #include <crm/fencing/internal.h>
33 #include <crm/common/xml.h>
34 #include <crm/common/xml_internal.h>
35
36 #include <crm/common/util.h>
37 #include <pacemaker-fenced.h>
38
/* Multiplier applied to computed timeouts before they are used for timers */
#define TIMEOUT_MULTIPLY_FACTOR (1.2)
40
41
42
43
44
45
46
47 typedef struct device_properties_s {
48
49 gboolean verified;
50
51
52
53
54 gboolean executed[st_phase_max];
55
56 gboolean disallowed[st_phase_max];
57
58 int custom_action_timeout[st_phase_max];
59
60 int delay_max[st_phase_max];
61
62 int delay_base[st_phase_max];
63
64 uint32_t device_support_flags;
65 } device_properties_t;
66
67 typedef struct {
68
69 char *host;
70
71 gboolean tried;
72
73 int ndevices;
74
75 GHashTable *devices;
76 } peer_device_info_t;
77
78 GHashTable *stonith_remote_op_list = NULL;
79
80 extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
81 int call_options);
82
83 static void request_peer_fencing(remote_fencing_op_t *op,
84 peer_device_info_t *peer);
85 static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup);
86 static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
87 static int get_op_total_timeout(const remote_fencing_op_t *op,
88 const peer_device_info_t *chosen_peer);
89
90 static gint
91 sort_strings(gconstpointer a, gconstpointer b)
92 {
93 return strcmp(a, b);
94 }
95
96 static void
97 free_remote_query(gpointer data)
98 {
99 if (data != NULL) {
100 peer_device_info_t *peer = data;
101
102 g_hash_table_destroy(peer->devices);
103 free(peer->host);
104 free(peer);
105 }
106 }
107
108 void
109 free_stonith_remote_op_list(void)
110 {
111 if (stonith_remote_op_list != NULL) {
112 g_hash_table_destroy(stonith_remote_op_list);
113 stonith_remote_op_list = NULL;
114 }
115 }
116
117 struct peer_count_data {
118 const remote_fencing_op_t *op;
119 gboolean verified_only;
120 uint32_t support_action_only;
121 int count;
122 };
123
124
125
126
127
128
129
130
131
132 static void
133 count_peer_device(gpointer key, gpointer value, gpointer user_data)
134 {
135 device_properties_t *props = (device_properties_t*)value;
136 struct peer_count_data *data = user_data;
137
138 if (!props->executed[data->op->phase]
139 && (!data->verified_only || props->verified)
140 && ((data->support_action_only == st_device_supports_none) || pcmk_is_set(props->device_support_flags, data->support_action_only))) {
141 ++(data->count);
142 }
143 }
144
145
146
147
148
149
150
151
152
153
154
155
156 static int
157 count_peer_devices(const remote_fencing_op_t *op,
158 const peer_device_info_t *peer, gboolean verified_only, uint32_t support_on_action_only)
159 {
160 struct peer_count_data data;
161
162 data.op = op;
163 data.verified_only = verified_only;
164 data.support_action_only = support_on_action_only;
165 data.count = 0;
166 if (peer) {
167 g_hash_table_foreach(peer->devices, count_peer_device, &data);
168 }
169 return data.count;
170 }
171
172
173
174
175
176
177
178
179
180
181
182 static device_properties_t *
183 find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer,
184 const char *device, uint32_t support_action_only)
185 {
186 device_properties_t *props = g_hash_table_lookup(peer->devices, device);
187
188 if (props && support_action_only != st_device_supports_none && !pcmk_is_set(props->device_support_flags, support_action_only)) {
189 return NULL;
190 }
191 return (props && !props->executed[op->phase]
192 && !props->disallowed[op->phase])? props : NULL;
193 }
194
195
196
197
198
199
200
201
202
203
204
205
206 static gboolean
207 grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer,
208 const char *device, gboolean verified_devices_only)
209 {
210 device_properties_t *props = find_peer_device(op, peer, device,
211 fenced_support_flag(op->action));
212
213 if ((props == NULL) || (verified_devices_only && !props->verified)) {
214 return FALSE;
215 }
216
217 crm_trace("Removing %s from %s (%d remaining)",
218 device, peer->host, count_peer_devices(op, peer, FALSE, st_device_supports_none));
219 props->executed[op->phase] = TRUE;
220 return TRUE;
221 }
222
223 static void
224 clear_remote_op_timers(remote_fencing_op_t * op)
225 {
226 if (op->query_timer) {
227 g_source_remove(op->query_timer);
228 op->query_timer = 0;
229 }
230 if (op->op_timer_total) {
231 g_source_remove(op->op_timer_total);
232 op->op_timer_total = 0;
233 }
234 if (op->op_timer_one) {
235 g_source_remove(op->op_timer_one);
236 op->op_timer_one = 0;
237 }
238 }
239
240 static void
241 free_remote_op(gpointer data)
242 {
243 remote_fencing_op_t *op = data;
244
245 crm_log_xml_debug(op->request, "Destroying");
246
247 clear_remote_op_timers(op);
248
249 free(op->id);
250 free(op->action);
251 free(op->delegate);
252 free(op->target);
253 free(op->client_id);
254 free(op->client_name);
255 free(op->originator);
256
257 if (op->query_results) {
258 g_list_free_full(op->query_results, free_remote_query);
259 }
260 if (op->request) {
261 free_xml(op->request);
262 op->request = NULL;
263 }
264 if (op->devices_list) {
265 g_list_free_full(op->devices_list, free);
266 op->devices_list = NULL;
267 }
268 g_list_free_full(op->automatic_list, free);
269 g_list_free(op->duplicates);
270
271 pcmk__reset_result(&op->result);
272 free(op);
273 }
274
275 void
276 init_stonith_remote_op_hash_table(GHashTable **table)
277 {
278 if (*table == NULL) {
279 *table = pcmk__strkey_table(NULL, free_remote_op);
280 }
281 }
282
283
284
285
286
287
288
289
290
291 static const char *
292 op_requested_action(const remote_fencing_op_t *op)
293 {
294 return ((op->phase > st_phase_requested)? PCMK_ACTION_REBOOT : op->action);
295 }
296
297
298
299
300
301
302
303 static void
304 op_phase_off(remote_fencing_op_t *op)
305 {
306 crm_info("Remapping multiple-device reboot targeting %s to 'off' "
307 CRM_XS " id=%.8s", op->target, op->id);
308 op->phase = st_phase_off;
309
310
311
312
313 strcpy(op->action, PCMK_ACTION_OFF);
314 }
315
316
317
318
319
320
321
322 static void
323 op_phase_on(remote_fencing_op_t *op)
324 {
325 GList *iter = NULL;
326
327 crm_info("Remapped 'off' targeting %s complete, "
328 "remapping to 'on' for %s " CRM_XS " id=%.8s",
329 op->target, op->client_name, op->id);
330 op->phase = st_phase_on;
331 strcpy(op->action, PCMK_ACTION_ON);
332
333
334
335
336 for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
337 GList *match = g_list_find_custom(op->devices_list, iter->data,
338 sort_strings);
339
340 if (match) {
341 op->devices_list = g_list_remove(op->devices_list, match->data);
342 }
343 }
344 g_list_free_full(op->automatic_list, free);
345 op->automatic_list = NULL;
346
347
348 op->devices = op->devices_list;
349 }
350
351
352
353
354
355
356
357 static void
358 undo_op_remap(remote_fencing_op_t *op)
359 {
360 if (op->phase > 0) {
361 crm_info("Undoing remap of reboot targeting %s for %s "
362 CRM_XS " id=%.8s", op->target, op->client_name, op->id);
363 op->phase = st_phase_requested;
364 strcpy(op->action, PCMK_ACTION_REBOOT);
365 }
366 }
367
368
369
370
371
372
373
374
375
376
377
378 static xmlNode *
379 fencing_result2xml(xmlNode *parent, const remote_fencing_op_t *op)
380 {
381 xmlNode *notify_data = pcmk__xe_create(parent, PCMK__XE_ST_NOTIFY_FENCE);
382
383 crm_xml_add_int(notify_data, PCMK_XA_STATE, op->state);
384 crm_xml_add(notify_data, PCMK__XA_ST_TARGET, op->target);
385 crm_xml_add(notify_data, PCMK__XA_ST_DEVICE_ACTION, op->action);
386 crm_xml_add(notify_data, PCMK__XA_ST_DELEGATE, op->delegate);
387 crm_xml_add(notify_data, PCMK__XA_ST_REMOTE_OP, op->id);
388 crm_xml_add(notify_data, PCMK__XA_ST_ORIGIN, op->originator);
389 crm_xml_add(notify_data, PCMK__XA_ST_CLIENTID, op->client_id);
390 crm_xml_add(notify_data, PCMK__XA_ST_CLIENTNAME, op->client_name);
391
392 return notify_data;
393 }
394
395
396
397
398
399
400
401
402 void
403 fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged)
404 {
405 static int count = 0;
406 xmlNode *bcast = pcmk__xe_create(NULL, PCMK__XE_ST_REPLY);
407 xmlNode *wrapper = NULL;
408 xmlNode *notify_data = NULL;
409
410 count++;
411 crm_trace("Broadcasting result to peers");
412 crm_xml_add(bcast, PCMK__XA_T, PCMK__VALUE_ST_NOTIFY);
413 crm_xml_add(bcast, PCMK__XA_SUBT, PCMK__VALUE_BROADCAST);
414 crm_xml_add(bcast, PCMK__XA_ST_OP, STONITH_OP_NOTIFY);
415 crm_xml_add_int(bcast, PCMK_XA_COUNT, count);
416
417 if (op_merged) {
418 pcmk__xe_set_bool_attr(bcast, PCMK__XA_ST_OP_MERGED, true);
419 }
420
421 wrapper = pcmk__xe_create(bcast, PCMK__XE_ST_CALLDATA);
422 notify_data = fencing_result2xml(wrapper, op);
423 stonith__xe_set_result(notify_data, &op->result);
424
425 pcmk__cluster_send_message(NULL, crm_msg_stonith_ng, bcast);
426 free_xml(bcast);
427
428 return;
429 }
430
431
432
433
434
435
436
437
438 static void
439 handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data)
440 {
441 xmlNode *notify_data = NULL;
442 xmlNode *reply = NULL;
443 pcmk__client_t *client = NULL;
444
445 if (op->notify_sent == TRUE) {
446
447 return;
448 }
449
450
451 crm_xml_add_int(data, PCMK_XA_STATE, op->state);
452 crm_xml_add(data, PCMK__XA_ST_TARGET, op->target);
453 crm_xml_add(data, PCMK__XA_ST_OP, op->action);
454
455 reply = fenced_construct_reply(op->request, data, &op->result);
456 crm_xml_add(reply, PCMK__XA_ST_DELEGATE, op->delegate);
457
458
459 client = pcmk__find_client_by_id(op->client_id);
460 if (client == NULL) {
461 crm_trace("Skipping reply to %s: no longer a client", op->client_id);
462 } else {
463 do_local_reply(reply, client, op->call_options);
464 }
465
466
467 notify_data = fencing_result2xml(NULL, op);
468 fenced_send_notification(PCMK__VALUE_ST_NOTIFY_FENCE, &op->result,
469 notify_data);
470 free_xml(notify_data);
471 fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL);
472
473
474 op->notify_sent = TRUE;
475 free_xml(reply);
476 }
477
478
479
480
481
482
483
484
485 static void
486 finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data)
487 {
488 for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) {
489 remote_fencing_op_t *other = iter->data;
490
491 if (other->state == st_duplicate) {
492 other->state = op->state;
493 crm_debug("Performing duplicate notification for %s@%s: %s "
494 CRM_XS " id=%.8s",
495 other->client_name, other->originator,
496 pcmk_exec_status_str(op->result.execution_status),
497 other->id);
498 pcmk__copy_result(&op->result, &other->result);
499 finalize_op(other, data, true);
500
501 } else {
502
503 crm_err("Skipping duplicate notification for %s@%s "
504 CRM_XS " state=%s id=%.8s",
505 other->client_name, other->originator,
506 stonith_op_state_str(other->state), other->id);
507 }
508 }
509 }
510
511 static char *
512 delegate_from_xml(xmlNode *xml)
513 {
514 xmlNode *match = get_xpath_object("//@" PCMK__XA_ST_DELEGATE, xml,
515 LOG_NEVER);
516
517 if (match == NULL) {
518 return crm_element_value_copy(xml, PCMK__XA_SRC);
519 } else {
520 return crm_element_value_copy(match, PCMK__XA_ST_DELEGATE);
521 }
522 }
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540 static void
541 finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup)
542 {
543 int level = LOG_ERR;
544 const char *subt = NULL;
545 xmlNode *local_data = NULL;
546 gboolean op_merged = FALSE;
547
548 CRM_CHECK((op != NULL), return);
549
550
551 clear_remote_op_timers(op);
552
553 if (op->notify_sent) {
554
555 crm_notice("Operation '%s'%s%s by %s for %s@%s%s: "
556 "Result arrived too late " CRM_XS " id=%.8s",
557 op->action, (op->target? " targeting " : ""),
558 (op->target? op->target : ""),
559 (op->delegate? op->delegate : "unknown node"),
560 op->client_name, op->originator,
561 (op_merged? " (merged)" : ""),
562 op->id);
563 return;
564 }
565
566 set_fencing_completed(op);
567 undo_op_remap(op);
568
569 if (data == NULL) {
570 data = pcmk__xe_create(NULL, "remote-op");
571 local_data = data;
572
573 } else if (op->delegate == NULL) {
574 switch (op->result.execution_status) {
575 case PCMK_EXEC_NO_FENCE_DEVICE:
576 break;
577
578 case PCMK_EXEC_INVALID:
579 if (op->result.exit_status != CRM_EX_EXPIRED) {
580 op->delegate = delegate_from_xml(data);
581 }
582 break;
583
584 default:
585 op->delegate = delegate_from_xml(data);
586 break;
587 }
588 }
589
590 if (dup || (crm_element_value(data, PCMK__XA_ST_OP_MERGED) != NULL)) {
591 op_merged = true;
592 }
593
594
595
596
597 subt = crm_element_value(data, PCMK__XA_SUBT);
598 if (!dup && !pcmk__str_eq(subt, PCMK__VALUE_BROADCAST, pcmk__str_none)) {
599
600 fenced_broadcast_op_result(op, op_merged);
601 free_xml(local_data);
602 return;
603 }
604
605 if (pcmk__result_ok(&op->result) || dup
606 || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
607 level = LOG_NOTICE;
608 }
609 do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) "
610 CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""),
611 (op->target? op->target : ""),
612 (op->delegate? op->delegate : "unknown node"),
613 op->client_name, op->originator,
614 (op_merged? " (merged)" : ""),
615 crm_exit_str(op->result.exit_status),
616 pcmk_exec_status_str(op->result.execution_status),
617 ((op->result.exit_reason == NULL)? "" : ": "),
618 ((op->result.exit_reason == NULL)? "" : op->result.exit_reason),
619 op->id);
620
621 handle_local_reply_and_notify(op, data);
622
623 if (!dup) {
624 finalize_op_duplicates(op, data);
625 }
626
627
628
629
630 if (op->query_results) {
631 g_list_free_full(op->query_results, free_remote_query);
632 op->query_results = NULL;
633 }
634 if (op->request) {
635 free_xml(op->request);
636 op->request = NULL;
637 }
638
639 free_xml(local_data);
640 }
641
642
643
644
645
646
647
648
649
650 static gboolean
651 remote_op_watchdog_done(gpointer userdata)
652 {
653 remote_fencing_op_t *op = userdata;
654
655 op->op_timer_one = 0;
656
657 crm_notice("Self-fencing (%s) by %s for %s assumed complete "
658 CRM_XS " id=%.8s",
659 op->action, op->target, op->client_name, op->id);
660 op->state = st_done;
661 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
662 finalize_op(op, NULL, false);
663 return G_SOURCE_REMOVE;
664 }
665
666 static gboolean
667 remote_op_timeout_one(gpointer userdata)
668 {
669 remote_fencing_op_t *op = userdata;
670
671 op->op_timer_one = 0;
672
673 crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS
674 " id=%.8s", op->action, op->target, op->client_name, op->id);
675 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
676 "Peer did not return fence result within timeout");
677
678
679 if (op->client_delay > 0) {
680 op->client_delay = 0;
681 crm_trace("Try another device for '%s' action targeting %s "
682 "for client %s without delay " CRM_XS " id=%.8s",
683 op->action, op->target, op->client_name, op->id);
684 }
685
686
687 request_peer_fencing(op, NULL);
688 return G_SOURCE_REMOVE;
689 }
690
691
692
693
694
695
696
697
698 static void
699 finalize_timed_out_op(remote_fencing_op_t *op, const char *reason)
700 {
701 crm_debug("Action '%s' targeting %s for client %s timed out "
702 CRM_XS " id=%.8s",
703 op->action, op->target, op->client_name, op->id);
704
705 if (op->phase == st_phase_on) {
706
707
708
709
710 op->state = st_done;
711 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
712 } else {
713 op->state = st_failed;
714 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason);
715 }
716 finalize_op(op, NULL, false);
717 }
718
719
720
721
722
723
724
725
726
727 static gboolean
728 remote_op_timeout(gpointer userdata)
729 {
730 remote_fencing_op_t *op = userdata;
731
732 op->op_timer_total = 0;
733
734 if (op->state == st_done) {
735 crm_debug("Action '%s' targeting %s for client %s already completed "
736 CRM_XS " id=%.8s",
737 op->action, op->target, op->client_name, op->id);
738 } else {
739 finalize_timed_out_op(userdata, "Fencing did not complete within a "
740 "total timeout based on the "
741 "configured timeout and retries for "
742 "any devices attempted");
743 }
744 return G_SOURCE_REMOVE;
745 }
746
747 static gboolean
748 remote_op_query_timeout(gpointer data)
749 {
750 remote_fencing_op_t *op = data;
751
752 op->query_timer = 0;
753
754 if (op->state == st_done) {
755 crm_debug("Operation %.8s targeting %s already completed",
756 op->id, op->target);
757 } else if (op->state == st_exec) {
758 crm_debug("Operation %.8s targeting %s already in progress",
759 op->id, op->target);
760 } else if (op->query_results) {
761
762 crm_debug("Query %.8s targeting %s complete (state=%s)",
763 op->id, op->target, stonith_op_state_str(op->state));
764 request_peer_fencing(op, NULL);
765 } else {
766 crm_debug("Query %.8s targeting %s timed out (state=%s)",
767 op->id, op->target, stonith_op_state_str(op->state));
768 finalize_timed_out_op(op, "No capable peers replied to device query "
769 "within timeout");
770 }
771
772 return G_SOURCE_REMOVE;
773 }
774
775 static gboolean
776 topology_is_empty(stonith_topology_t *tp)
777 {
778 int i;
779
780 if (tp == NULL) {
781 return TRUE;
782 }
783
784 for (i = 0; i < ST__LEVEL_COUNT; i++) {
785 if (tp->levels[i] != NULL) {
786 return FALSE;
787 }
788 }
789 return TRUE;
790 }
791
792
793
794
795
796
797
798
799 static void
800 add_required_device(remote_fencing_op_t *op, const char *device)
801 {
802 GList *match = g_list_find_custom(op->automatic_list, device,
803 sort_strings);
804
805 if (!match) {
806 op->automatic_list = g_list_prepend(op->automatic_list,
807 pcmk__str_copy(device));
808 }
809 }
810
811
812
813
814
815
816
817
818 static void
819 remove_required_device(remote_fencing_op_t *op, const char *device)
820 {
821 GList *match = g_list_find_custom(op->automatic_list, device,
822 sort_strings);
823
824 if (match) {
825 op->automatic_list = g_list_remove(op->automatic_list, match->data);
826 }
827 }
828
829
830 static void
831 set_op_device_list(remote_fencing_op_t * op, GList *devices)
832 {
833 GList *lpc = NULL;
834
835 if (op->devices_list) {
836 g_list_free_full(op->devices_list, free);
837 op->devices_list = NULL;
838 }
839 for (lpc = devices; lpc != NULL; lpc = lpc->next) {
840 const char *device = lpc->data;
841
842 op->devices_list = g_list_append(op->devices_list,
843 pcmk__str_copy(device));
844 }
845 op->devices = op->devices_list;
846 }
847
848
849
850
851
852
853
854
855
856
857 static gboolean
858 topology_matches(const stonith_topology_t *tp, const char *node)
859 {
860 regex_t r_patt;
861
862 CRM_CHECK(node && tp && tp->target, return FALSE);
863 switch (tp->kind) {
864 case fenced_target_by_attribute:
865
866
867
868
869
870
871 if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
872 crm_notice("Matched %s with %s by attribute", node, tp->target);
873 return TRUE;
874 }
875 break;
876
877 case fenced_target_by_pattern:
878
879
880
881 if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
882 crm_info("Bad regex '%s' for fencing level", tp->target);
883 } else {
884 int status = regexec(&r_patt, node, 0, NULL, 0);
885
886 regfree(&r_patt);
887 if (status == 0) {
888 crm_notice("Matched %s with %s by name", node, tp->target);
889 return TRUE;
890 }
891 }
892 break;
893
894 case fenced_target_by_name:
895 crm_trace("Testing %s against %s", node, tp->target);
896 return pcmk__str_eq(tp->target, node, pcmk__str_casei);
897
898 default:
899 break;
900 }
901 crm_trace("No match for %s with %s", node, tp->target);
902 return FALSE;
903 }
904
905 stonith_topology_t *
906 find_topology_for_host(const char *host)
907 {
908 GHashTableIter tIter;
909 stonith_topology_t *tp = g_hash_table_lookup(topology, host);
910
911 if(tp != NULL) {
912 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
913 return tp;
914 }
915
916 g_hash_table_iter_init(&tIter, topology);
917 while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
918 if (topology_matches(tp, host)) {
919 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
920 return tp;
921 }
922 }
923
924 crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
925 return NULL;
926 }
927
928
929
930
931
932
933
934
935
936
937
938
939 static int
940 advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
941 {
942 stonith_topology_t *tp = NULL;
943
944 if (op->target) {
945 tp = find_topology_for_host(op->target);
946 }
947 if (topology_is_empty(tp)) {
948 return empty_ok? pcmk_rc_ok : ENODEV;
949 }
950
951 pcmk__assert(tp->levels != NULL);
952
953 stonith__set_call_options(op->call_options, op->id, st_opt_topology);
954
955
956 undo_op_remap(op);
957
958 do {
959 op->level++;
960
961 } while (op->level < ST__LEVEL_COUNT && tp->levels[op->level] == NULL);
962
963 if (op->level < ST__LEVEL_COUNT) {
964 crm_trace("Attempting fencing level %d targeting %s (%d devices) "
965 "for client %s@%s (id=%.8s)",
966 op->level, op->target, g_list_length(tp->levels[op->level]),
967 op->client_name, op->originator, op->id);
968 set_op_device_list(op, tp->levels[op->level]);
969
970
971 if ((op->level > 1) && (op->client_delay > 0)) {
972 op->client_delay = 0;
973 }
974
975 if ((g_list_next(op->devices_list) != NULL)
976 && pcmk__str_eq(op->action, PCMK_ACTION_REBOOT, pcmk__str_none)) {
977
978
979
980
981
982 op_phase_off(op);
983 }
984 return pcmk_rc_ok;
985 }
986
987 crm_info("All %sfencing options targeting %s for client %s@%s failed "
988 CRM_XS " id=%.8s",
989 (stonith_watchdog_timeout_ms > 0)?"non-watchdog ":"",
990 op->target, op->client_name, op->originator, op->id);
991 return ENODEV;
992 }
993
994
995
996
997
998
999
1000 static void
1001 merge_duplicates(remote_fencing_op_t *op)
1002 {
1003 GHashTableIter iter;
1004 remote_fencing_op_t *other = NULL;
1005
1006 time_t now = time(NULL);
1007
1008 g_hash_table_iter_init(&iter, stonith_remote_op_list);
1009 while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
1010 const char *other_action = op_requested_action(other);
1011 crm_node_t *node = NULL;
1012
1013 if (!strcmp(op->id, other->id)) {
1014 continue;
1015 }
1016 if (other->state > st_exec) {
1017 crm_trace("%.8s not duplicate of %.8s: not in progress",
1018 op->id, other->id);
1019 continue;
1020 }
1021 if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
1022 crm_trace("%.8s not duplicate of %.8s: node %s vs. %s",
1023 op->id, other->id, op->target, other->target);
1024 continue;
1025 }
1026 if (!pcmk__str_eq(op->action, other_action, pcmk__str_none)) {
1027 crm_trace("%.8s not duplicate of %.8s: action %s vs. %s",
1028 op->id, other->id, op->action, other_action);
1029 continue;
1030 }
1031 if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
1032 crm_trace("%.8s not duplicate of %.8s: same client %s",
1033 op->id, other->id, op->client_name);
1034 continue;
1035 }
1036 if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
1037 crm_trace("%.8s not duplicate of %.8s: self-fencing for %s",
1038 op->id, other->id, other->target);
1039 continue;
1040 }
1041
1042 node = pcmk__get_node(0, other->originator, NULL,
1043 pcmk__node_search_cluster_member);
1044
1045 if (!fencing_peer_active(node)) {
1046 crm_notice("Failing action '%s' targeting %s originating from "
1047 "client %s@%s: Originator is dead " CRM_XS " id=%.8s",
1048 other->action, other->target, other->client_name,
1049 other->originator, other->id);
1050 crm_trace("%.8s not duplicate of %.8s: originator dead",
1051 op->id, other->id);
1052 other->state = st_failed;
1053 continue;
1054 }
1055 if ((other->total_timeout > 0)
1056 && (now > (other->total_timeout + other->created))) {
1057 crm_trace("%.8s not duplicate of %.8s: old (%lld vs. %lld + %ds)",
1058 op->id, other->id, (long long)now, (long long)other->created,
1059 other->total_timeout);
1060 continue;
1061 }
1062
1063
1064
1065
1066 other->duplicates = g_list_append(other->duplicates, op);
1067 if (other->total_timeout == 0) {
1068 other->total_timeout = op->total_timeout =
1069 TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
1070 crm_trace("Best guess as to timeout used for %.8s: %ds",
1071 other->id, other->total_timeout);
1072 }
1073 crm_notice("Merging fencing action '%s' targeting %s originating from "
1074 "client %s with identical request from %s@%s "
1075 CRM_XS " original=%.8s duplicate=%.8s total_timeout=%ds",
1076 op->action, op->target, op->client_name,
1077 other->client_name, other->originator,
1078 op->id, other->id, other->total_timeout);
1079 report_timeout_period(op, other->total_timeout);
1080 op->state = st_duplicate;
1081 }
1082 }
1083
1084 static uint32_t fencing_active_peers(void)
1085 {
1086 uint32_t count = 0;
1087 crm_node_t *entry;
1088 GHashTableIter gIter;
1089
1090 g_hash_table_iter_init(&gIter, crm_peer_cache);
1091 while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
1092 if(fencing_peer_active(entry)) {
1093 count++;
1094 }
1095 }
1096 return count;
1097 }
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108 int
1109 fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg)
1110 {
1111 remote_fencing_op_t *op = NULL;
1112 xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, msg, LOG_ERR);
1113
1114 CRM_CHECK(dev != NULL, return EPROTO);
1115
1116 crm_notice("Received manual confirmation that %s has been fenced",
1117 pcmk__s(crm_element_value(dev, PCMK__XA_ST_TARGET),
1118 "unknown target"));
1119 op = initiate_remote_stonith_op(client, msg, TRUE);
1120 if (op == NULL) {
1121 return EPROTO;
1122 }
1123 op->state = st_done;
1124 set_fencing_completed(op);
1125 op->delegate = pcmk__str_copy("a human");
1126
1127
1128 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1129 finalize_op(op, msg, false);
1130
1131
1132
1133
1134 return EINPROGRESS;
1135 }
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148 void *
1149 create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
1150 {
1151 remote_fencing_op_t *op = NULL;
1152 xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, request,
1153 LOG_NEVER);
1154 int rc = pcmk_rc_ok;
1155 const char *operation = NULL;
1156
1157 init_stonith_remote_op_hash_table(&stonith_remote_op_list);
1158
1159
1160
1161 if (peer && dev) {
1162 const char *op_id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
1163
1164 CRM_CHECK(op_id != NULL, return NULL);
1165
1166 op = g_hash_table_lookup(stonith_remote_op_list, op_id);
1167 if (op) {
1168 crm_debug("Reusing existing remote fencing op %.8s for %s",
1169 op_id, ((client == NULL)? "unknown client" : client));
1170 return op;
1171 }
1172 }
1173
1174 op = pcmk__assert_alloc(1, sizeof(remote_fencing_op_t));
1175
1176 crm_element_value_int(request, PCMK__XA_ST_TIMEOUT, &(op->base_timeout));
1177
1178 crm_element_value_int(request, PCMK__XA_ST_DELAY, &(op->client_delay));
1179
1180 if (peer && dev) {
1181 op->id = crm_element_value_copy(dev, PCMK__XA_ST_REMOTE_OP);
1182 } else {
1183 op->id = crm_generate_uuid();
1184 }
1185
1186 g_hash_table_replace(stonith_remote_op_list, op->id, op);
1187
1188 op->state = st_query;
1189 op->replies_expected = fencing_active_peers();
1190 op->action = crm_element_value_copy(dev, PCMK__XA_ST_DEVICE_ACTION);
1191
1192
1193
1194
1195
1196
1197
1198 op->originator = crm_element_value_copy(dev, PCMK__XA_ST_ORIGIN);
1199 if (op->originator == NULL) {
1200
1201 op->originator = pcmk__str_copy(stonith_our_uname);
1202 }
1203
1204
1205 op->delegate = crm_element_value_copy(dev, PCMK__XA_ST_DELEGATE);
1206 op->created = time(NULL);
1207
1208 CRM_LOG_ASSERT(client != NULL);
1209 op->client_id = pcmk__str_copy(client);
1210
1211
1212 operation = crm_element_value(request, PCMK__XA_ST_OP);
1213
1214 if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1215 op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
1216 (unsigned long) getpid());
1217 } else {
1218 op->client_name = crm_element_value_copy(request,
1219 PCMK__XA_ST_CLIENTNAME);
1220 }
1221
1222 op->target = crm_element_value_copy(dev, PCMK__XA_ST_TARGET);
1223
1224
1225 op->request = pcmk__xml_copy(NULL, request);
1226
1227 rc = pcmk__xe_get_flags(request, PCMK__XA_ST_CALLOPT, &(op->call_options),
1228 0U);
1229 if (rc != pcmk_rc_ok) {
1230 crm_warn("Couldn't parse options from request %s: %s",
1231 op->id, pcmk_rc_str(rc));
1232 }
1233
1234 crm_element_value_int(request, PCMK__XA_ST_CALLID, &(op->client_callid));
1235
1236 crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
1237 "base timeout %ds, %u %s expected)",
1238 (peer && dev)? "Recorded" : "Generated", op->id, op->action,
1239 op->target, op->client_name, op->base_timeout,
1240 op->replies_expected,
1241 pcmk__plural_alt(op->replies_expected, "reply", "replies"));
1242
1243 if (op->call_options & st_opt_cs_nodeid) {
1244 int nodeid;
1245 crm_node_t *node;
1246
1247 pcmk__scan_min_int(op->target, &nodeid, 0);
1248 node = pcmk__search_node_caches(nodeid, NULL,
1249 pcmk__node_search_any
1250 |pcmk__node_search_cluster_cib);
1251
1252
1253 stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);
1254
1255 if (node && node->uname) {
1256 pcmk__str_update(&(op->target), node->uname);
1257
1258 } else {
1259 crm_warn("Could not expand nodeid '%s' into a host name", op->target);
1260 }
1261 }
1262
1263
1264 merge_duplicates(op);
1265
1266 if (op->state != st_duplicate) {
1267
1268 fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL);
1269 }
1270
1271
1272 stonith_fence_history_trim();
1273
1274 return op;
1275 }
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287 remote_fencing_op_t *
1288 initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request,
1289 gboolean manual_ack)
1290 {
1291 int query_timeout = 0;
1292 xmlNode *query = NULL;
1293 const char *client_id = NULL;
1294 remote_fencing_op_t *op = NULL;
1295 const char *relay_op_id = NULL;
1296 const char *operation = NULL;
1297
1298 if (client) {
1299 client_id = client->id;
1300 } else {
1301 client_id = crm_element_value(request, PCMK__XA_ST_CLIENTID);
1302 }
1303
1304 CRM_LOG_ASSERT(client_id != NULL);
1305 op = create_remote_stonith_op(client_id, request, FALSE);
1306 op->owner = TRUE;
1307 if (manual_ack) {
1308 return op;
1309 }
1310
1311 CRM_CHECK(op->action, return NULL);
1312
1313 if (advance_topology_level(op, true) != pcmk_rc_ok) {
1314 op->state = st_failed;
1315 }
1316
1317 switch (op->state) {
1318 case st_failed:
1319
1320 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR,
1321 "All topology levels failed");
1322 crm_warn("Could not request peer fencing (%s) targeting %s "
1323 CRM_XS " id=%.8s", op->action, op->target, op->id);
1324 finalize_op(op, NULL, false);
1325 return op;
1326
1327 case st_duplicate:
1328 crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
1329 CRM_XS " id=%.8s", op->action, op->target, op->id);
1330 return op;
1331
1332 default:
1333 crm_notice("Requesting peer fencing (%s) targeting %s "
1334 CRM_XS " id=%.8s state=%s base_timeout=%ds",
1335 op->action, op->target, op->id,
1336 stonith_op_state_str(op->state), op->base_timeout);
1337 }
1338
1339 query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
1340 NULL, op->call_options);
1341
1342 crm_xml_add(query, PCMK__XA_ST_REMOTE_OP, op->id);
1343 crm_xml_add(query, PCMK__XA_ST_TARGET, op->target);
1344 crm_xml_add(query, PCMK__XA_ST_DEVICE_ACTION, op_requested_action(op));
1345 crm_xml_add(query, PCMK__XA_ST_ORIGIN, op->originator);
1346 crm_xml_add(query, PCMK__XA_ST_CLIENTID, op->client_id);
1347 crm_xml_add(query, PCMK__XA_ST_CLIENTNAME, op->client_name);
1348 crm_xml_add_int(query, PCMK__XA_ST_TIMEOUT, op->base_timeout);
1349
1350
1351 operation = crm_element_value(request, PCMK__XA_ST_OP);
1352 if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1353 relay_op_id = crm_element_value(request, PCMK__XA_ST_REMOTE_OP);
1354 if (relay_op_id) {
1355 crm_xml_add(query, PCMK__XA_ST_REMOTE_OP_RELAY, relay_op_id);
1356 }
1357 }
1358
1359 pcmk__cluster_send_message(NULL, crm_msg_stonith_ng, query);
1360 free_xml(query);
1361
1362 query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
1363 op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);
1364
1365 return op;
1366 }
1367
/* Bit flags accepted by find_best_peer() to restrict which peers qualify */
enum find_best_peer_options {
    /* Never return the peer whose host name matches the fencing target */
    FIND_PEER_SKIP_TARGET   = (1 << 0),

    /* Return only the peer whose host name matches the fencing target */
    FIND_PEER_TARGET_ONLY   = (1 << 1),

    /* Ask the device lookups to consider verified devices only */
    FIND_PEER_VERIFIED_ONLY = (1 << 2),
};
1376
1377 static bool
1378 is_watchdog_fencing(const remote_fencing_op_t *op, const char *device)
1379 {
1380 return (stonith_watchdog_timeout_ms > 0
1381
1382 && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_null_matches)
1383 && pcmk__is_fencing_action(op->action)
1384 && node_does_watchdog_fencing(op->target));
1385 }
1386
1387 static peer_device_info_t *
1388 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
1389 {
1390 GList *iter = NULL;
1391 gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1392
1393 if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
1394 return NULL;
1395 }
1396
1397 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1398 peer_device_info_t *peer = iter->data;
1399
1400 crm_trace("Testing result from %s targeting %s with %d device%s: %d %x",
1401 peer->host, op->target, peer->ndevices,
1402 pcmk__plural_s(peer->ndevices), peer->tried, options);
1403 if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1404 continue;
1405 }
1406 if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1407 continue;
1408 }
1409
1410 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1411
1412 if (grab_peer_device(op, peer, device, verified_devices_only)) {
1413 return peer;
1414 }
1415
1416 } else if (!peer->tried
1417 && count_peer_devices(op, peer, verified_devices_only,
1418 fenced_support_flag(op->action))) {
1419
1420 crm_trace("Simple fencing");
1421 return peer;
1422 }
1423 }
1424
1425 return NULL;
1426 }
1427
1428 static peer_device_info_t *
1429 stonith_choose_peer(remote_fencing_op_t * op)
1430 {
1431 const char *device = NULL;
1432 peer_device_info_t *peer = NULL;
1433 uint32_t active = fencing_active_peers();
1434
1435 do {
1436 if (op->devices) {
1437 device = op->devices->data;
1438 crm_trace("Checking for someone to fence (%s) %s using %s",
1439 op->action, op->target, device);
1440 } else {
1441 crm_trace("Checking for someone to fence (%s) %s",
1442 op->action, op->target);
1443 }
1444
1445
1446 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
1447 if (peer) {
1448 crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
1449 return peer;
1450 }
1451
1452 if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
1453 crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
1454 return NULL;
1455 }
1456
1457
1458 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
1459 if (peer) {
1460 crm_trace("Found best unverified peer %s", peer->host);
1461 return peer;
1462 }
1463
1464
1465
1466
1467 if (op->phase != st_phase_on) {
1468 peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
1469 if (peer) {
1470 crm_trace("%s will fence itself", peer->host);
1471 return peer;
1472 }
1473 }
1474
1475
1476
1477
1478 } while ((op->phase != st_phase_on)
1479 && pcmk_is_set(op->call_options, st_opt_topology)
1480 && (advance_topology_level(op, false) == pcmk_rc_ok));
1481
1482
1483
1484
1485 if (is_watchdog_fencing(op, device)) {
1486 crm_info("Couldn't contact watchdog-fencing target-node (%s)",
1487 op->target);
1488
1489 } else {
1490 crm_notice("Couldn't find anyone to fence (%s) %s using %s",
1491 op->action, op->target, (device? device : "any device"));
1492 }
1493 return NULL;
1494 }
1495
1496 static int
1497 valid_fencing_timeout(int specified_timeout, bool action_specific,
1498 const remote_fencing_op_t *op, const char *device)
1499 {
1500 int timeout = specified_timeout;
1501
1502 if (!is_watchdog_fencing(op, device)) {
1503 return timeout;
1504 }
1505
1506 timeout = (int) QB_MIN(QB_MAX(specified_timeout,
1507 stonith_watchdog_timeout_ms / 1000), INT_MAX);
1508
1509 if (timeout > specified_timeout) {
1510 if (action_specific) {
1511 crm_warn("pcmk_%s_timeout %ds for %s is too short (must be >= "
1512 PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds "
1513 "instead",
1514 op->action, specified_timeout, device? device : "watchdog",
1515 timeout, timeout);
1516
1517 } else {
1518 crm_warn("Fencing timeout %ds is too short (must be >= "
1519 PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds "
1520 "instead",
1521 specified_timeout, timeout, timeout);
1522 }
1523 }
1524
1525 return timeout;
1526 }
1527
1528 static int
1529 get_device_timeout(const remote_fencing_op_t *op,
1530 const peer_device_info_t *peer, const char *device,
1531 bool with_delay)
1532 {
1533 int timeout = op->base_timeout;
1534 device_properties_t *props;
1535
1536 timeout = valid_fencing_timeout(op->base_timeout, false, op, device);
1537
1538 if (!peer || !device) {
1539 return timeout;
1540 }
1541
1542 props = g_hash_table_lookup(peer->devices, device);
1543 if (!props) {
1544 return timeout;
1545 }
1546
1547 if (props->custom_action_timeout[op->phase]) {
1548 timeout = valid_fencing_timeout(props->custom_action_timeout[op->phase],
1549 true, op, device);
1550 }
1551
1552
1553 if (with_delay && (op->client_delay >= 0)) {
1554
1555 timeout += (props->delay_max[op->phase] > 0 ?
1556 props->delay_max[op->phase] : props->delay_base[op->phase]);
1557 }
1558
1559 return timeout;
1560 }
1561
/* User data passed to add_device_timeout() when iterating a peer's devices */
struct timeout_data {
    const remote_fencing_op_t *op;      /* Operation the timeout is for */
    const peer_device_info_t *peer;     /* Peer whose devices are iterated */
    int total_timeout;                  /* Running sum of device timeouts */
};
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576 static void
1577 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
1578 {
1579 const char *device_id = key;
1580 device_properties_t *props = value;
1581 struct timeout_data *timeout = user_data;
1582
1583 if (!props->executed[timeout->op->phase]
1584 && !props->disallowed[timeout->op->phase]) {
1585 timeout->total_timeout += get_device_timeout(timeout->op, timeout->peer,
1586 device_id, true);
1587 }
1588 }
1589
1590 static int
1591 get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer)
1592 {
1593 struct timeout_data timeout;
1594
1595 timeout.op = op;
1596 timeout.peer = peer;
1597 timeout.total_timeout = 0;
1598
1599 g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1600
1601 return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1602 }
1603
1604 static int
1605 get_op_total_timeout(const remote_fencing_op_t *op,
1606 const peer_device_info_t *chosen_peer)
1607 {
1608 long long total_timeout = 0;
1609 stonith_topology_t *tp = find_topology_for_host(op->target);
1610
1611 if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
1612 int i;
1613 GList *device_list = NULL;
1614 GList *iter = NULL;
1615 GList *auto_list = NULL;
1616
1617 if (pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none)
1618 && (op->automatic_list != NULL)) {
1619 auto_list = g_list_copy(op->automatic_list);
1620 }
1621
1622
1623
1624
1625
1626
1627
1628
1629 for (i = 0; i < ST__LEVEL_COUNT; i++) {
1630 if (!tp->levels[i]) {
1631 continue;
1632 }
1633 for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1634 bool found = false;
1635
1636 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1637 const peer_device_info_t *peer = iter->data;
1638
1639 if (auto_list) {
1640 GList *match = g_list_find_custom(auto_list, device_list->data,
1641 sort_strings);
1642 if (match) {
1643 auto_list = g_list_remove(auto_list, match->data);
1644 }
1645 }
1646
1647 if (find_peer_device(op, peer, device_list->data,
1648 fenced_support_flag(op->action))) {
1649 total_timeout += get_device_timeout(op, peer,
1650 device_list->data,
1651 true);
1652 found = true;
1653 break;
1654 }
1655 }
1656
1657
1658
1659
1660 if (!found && is_watchdog_fencing(op, device_list->data)) {
1661 total_timeout += stonith_watchdog_timeout_ms / 1000;
1662 }
1663 }
1664 }
1665
1666
1667 if (auto_list) {
1668 for (iter = auto_list; iter != NULL; iter = iter->next) {
1669 GList *iter2 = NULL;
1670
1671 for (iter2 = op->query_results; iter2 != NULL; iter = iter2->next) {
1672 peer_device_info_t *peer = iter2->data;
1673 if (find_peer_device(op, peer, iter->data, st_device_supports_on)) {
1674 total_timeout += get_device_timeout(op, peer,
1675 iter->data, true);
1676 break;
1677 }
1678 }
1679 }
1680 }
1681
1682 g_list_free(auto_list);
1683
1684 } else if (chosen_peer) {
1685 total_timeout = get_peer_timeout(op, chosen_peer);
1686
1687 } else {
1688 total_timeout = valid_fencing_timeout(op->base_timeout, false, op,
1689 NULL);
1690 }
1691
1692 if (total_timeout <= 0) {
1693 total_timeout = op->base_timeout;
1694 }
1695
1696
1697
1698
1699 if (op->client_delay > 0) {
1700 total_timeout += op->client_delay;
1701 }
1702 return (int) QB_MIN(total_timeout, INT_MAX);
1703 }
1704
1705 static void
1706 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
1707 {
1708 GList *iter = NULL;
1709 xmlNode *update = NULL;
1710 const char *client_node = NULL;
1711 const char *client_id = NULL;
1712 const char *call_id = NULL;
1713
1714 if (op->call_options & st_opt_sync_call) {
1715
1716
1717
1718
1719 return;
1720 } else if (!op->request) {
1721 return;
1722 }
1723
1724 crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
1725 client_node = crm_element_value(op->request, PCMK__XA_ST_CLIENTNODE);
1726 call_id = crm_element_value(op->request, PCMK__XA_ST_CALLID);
1727 client_id = crm_element_value(op->request, PCMK__XA_ST_CLIENTID);
1728 if (!client_node || !call_id || !client_id) {
1729 return;
1730 }
1731
1732 if (pcmk__str_eq(client_node, stonith_our_uname, pcmk__str_casei)) {
1733
1734 do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1735 return;
1736 }
1737
1738
1739 update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1740 crm_xml_add(update, PCMK__XA_ST_REMOTE_OP, op->id);
1741 crm_xml_add(update, PCMK__XA_ST_CLIENTID, client_id);
1742 crm_xml_add(update, PCMK__XA_ST_CALLID, call_id);
1743 crm_xml_add_int(update, PCMK__XA_ST_TIMEOUT, op_timeout);
1744
1745 pcmk__cluster_send_message(pcmk__get_node(0, client_node, NULL,
1746 pcmk__node_search_cluster_member),
1747 crm_msg_stonith_ng, update);
1748
1749 free_xml(update);
1750
1751 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1752 remote_fencing_op_t *dup = iter->data;
1753
1754 crm_trace("Reporting timeout for duplicate %.8s to client %s",
1755 dup->id, dup->client_name);
1756 report_timeout_period(iter->data, op_timeout);
1757 }
1758 }
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768 static void
1769 advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
1770 xmlNode *msg)
1771 {
1772
1773 if (op->devices) {
1774 op->devices = op->devices->next;
1775 }
1776
1777
1778 if ((op->phase == st_phase_requested)
1779 && pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none)) {
1780
1781 remove_required_device(op, device);
1782
1783
1784
1785
1786 if (op->devices == NULL) {
1787 op->devices = op->automatic_list;
1788 }
1789 }
1790
1791 if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1792
1793
1794
1795
1796 op_phase_on(op);
1797 }
1798
1799
1800 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1801
1802 if (op->devices) {
1803
1804 crm_trace("Next targeting %s on behalf of %s@%s",
1805 op->target, op->client_name, op->originator);
1806
1807
1808 if (op->client_delay > 0) {
1809 op->client_delay = 0;
1810 }
1811
1812 request_peer_fencing(op, NULL);
1813 } else {
1814
1815 crm_trace("Marking complex fencing op targeting %s as complete",
1816 op->target);
1817 op->state = st_done;
1818 finalize_op(op, msg, false);
1819 }
1820 }
1821
1822 static gboolean
1823 check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
1824 {
1825 if (node_does_watchdog_fencing(op->target)) {
1826 guint timeout_ms = QB_MIN(stonith_watchdog_timeout_ms, UINT_MAX);
1827
1828 crm_notice("Waiting %s for %s to self-fence (%s) for "
1829 "client %s " CRM_XS " id=%.8s",
1830 pcmk__readable_interval(timeout_ms), op->target, op->action,
1831 op->client_name, op->id);
1832
1833 if (op->op_timer_one) {
1834 g_source_remove(op->op_timer_one);
1835 }
1836 op->op_timer_one = g_timeout_add(timeout_ms, remote_op_watchdog_done,
1837 op);
1838 return TRUE;
1839 } else {
1840 crm_debug("Skipping fallback to watchdog-fencing as %s is "
1841 "not in host-list", op->target);
1842 }
1843 return FALSE;
1844 }
1845
1846
1847
1848
1849
1850
1851
1852
1853
/*!
 * \internal
 * \brief Ask a peer to execute the next step of a fencing operation
 *
 * Choose a peer (and, with topology, a device), start the operation's total
 * timer if it is not already running, send the fencing request to the peer,
 * and arm a per-request timer. If no peer is available, the operation is
 * advanced, finalized, or left waiting for more query replies, depending on
 * its phase, ownership, query timer, and reply count.
 *
 * \param[in,out] op    Fencing operation to be executed
 * \param[in,out] peer  Preferred peer, or NULL to choose one (with topology
 *                      and remaining devices, a peer is always re-chosen)
 */
static void
request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
{
    const char *device = NULL;
    int timeout;

    CRM_CHECK(op != NULL, return);

    crm_trace("Action %.8s targeting %s for %s is %s",
              op->id, op->target, op->client_name,
              stonith_op_state_str(op->state));

    if ((op->phase == st_phase_on) && (op->devices != NULL)) {
        /* In the "on" phase, skip any device whose reboot action is "off" or
         * that doesn't support "on", and move straight on to the next device.
         */
        device = op->devices->data;
        if (pcmk__str_eq(fenced_device_reboot_action(device), PCMK_ACTION_OFF,
                         pcmk__str_none)) {
            crm_info("Not turning %s back on using %s because the device is "
                     "configured to stay off (pcmk_reboot_action='off')",
                     op->target, device);
            advance_topology_device_in_level(op, device, NULL);
            return;
        }
        if (!fenced_device_supports_on(device)) {
            crm_info("Not turning %s back on using %s because the agent "
                     "doesn't support 'on'", op->target, device);
            advance_topology_device_in_level(op, device, NULL);
            return;
        }
    }

    timeout = op->base_timeout;

    // Without topology, choose a peer only if the caller didn't supply one
    if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
        peer = stonith_choose_peer(op);
    }

    // Start the overall timer once, and tell the client what to expect
    if (!op->op_timer_total) {
        op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, peer);
        op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
        report_timeout_period(op, op->total_timeout);
        crm_info("Total timeout set to %ds for peer's fencing targeting %s for %s "
                 CRM_XS " id=%.8s",
                 op->total_timeout, op->target, op->client_name, op->id);
    }

    if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
        /* With topology, the caller's peer preference is ignored: a peer is
         * (re-)chosen here, after the total timeout above has been calculated.
         */
        peer = stonith_choose_peer(op);

        device = op->devices->data;

        // The timeout sent to the peer does not include any delay
        timeout = get_device_timeout(op, peer, device, false);
    }

    if (peer) {
        int timeout_one = 0;
        xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
        const crm_node_t *peer_node =
            pcmk__get_node(0, peer->host, NULL,
                           pcmk__node_search_cluster_member);

        if (op->client_delay > 0) {
            // Budget for the client-requested delay in the per-request timer
            timeout_one = TIMEOUT_MULTIPLY_FACTOR * op->client_delay;
        }

        crm_xml_add(remote_op, PCMK__XA_ST_REMOTE_OP, op->id);
        crm_xml_add(remote_op, PCMK__XA_ST_TARGET, op->target);
        crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ACTION, op->action);
        crm_xml_add(remote_op, PCMK__XA_ST_ORIGIN, op->originator);
        crm_xml_add(remote_op, PCMK__XA_ST_CLIENTID, op->client_id);
        crm_xml_add(remote_op, PCMK__XA_ST_CLIENTNAME, op->client_name);
        crm_xml_add_int(remote_op, PCMK__XA_ST_TIMEOUT, timeout);
        crm_xml_add_int(remote_op, PCMK__XA_ST_CALLOPT, op->call_options);
        crm_xml_add_int(remote_op, PCMK__XA_ST_DELAY, op->client_delay);

        if (device) {
            // A specific device was selected: budget for that device only
            timeout_one += TIMEOUT_MULTIPLY_FACTOR *
                           get_device_timeout(op, peer, device, true);
            crm_notice("Requesting that %s perform '%s' action targeting %s "
                       "using %s " CRM_XS " for client %s (%ds)",
                       peer->host, op->action, op->target, device,
                       op->client_name, timeout_one);
            crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ID, device);

        } else {
            // No specific device: budget for all of the peer's devices
            timeout_one += TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
            crm_notice("Requesting that %s perform '%s' action targeting %s "
                       CRM_XS " for client %s (%ds, %s)",
                       peer->host, op->action, op->target, op->client_name,
                       timeout_one,
                       pcmk__readable_interval(stonith_watchdog_timeout_ms));
        }

        op->state = st_exec;
        if (op->op_timer_one) {
            g_source_remove(op->op_timer_one);
            op->op_timer_one = 0;
        }

        /* Arm the regular per-request timer unless this is watchdog fencing
         * and a wait for the target to self-fence was started instead (which
         * installs its own timer in op->op_timer_one).
         */
        if (!is_watchdog_fencing(op, device)
            || !check_watchdog_fencing_and_wait(op)) {

            op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
        }

        pcmk__cluster_send_message(peer_node, crm_msg_stonith_ng, remote_op);
        peer->tried = TRUE;
        free_xml(remote_op);
        return;

    } else if (op->phase == st_phase_on) {
        /* No peer can run "on", but "off" already succeeded, so ignore the
         * failure and continue with the next device.
         */
        crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
                 "after successful 'off'", device, op->target);
        advance_topology_device_in_level(op, device, NULL);
        return;

    } else if (op->owner == FALSE) {
        crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
                op->action, op->target, op->client_name);

    } else if (op->query_timer == 0) {
        // The query timer is no longer set and no peer is left to try
        crm_info("No remaining peers capable of fencing (%s) %s for client %s "
                 CRM_XS " state=%s", op->action, op->target, op->client_name,
                 stonith_op_state_str(op->state));
        CRM_CHECK(op->state < st_done, return);
        finalize_timed_out_op(op, "All nodes failed, or are unable, to "
                                  "fence target");

    } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
        // All expected replies have arrived, yet no peer is usable

        if (is_watchdog_fencing(op, device)
            && check_watchdog_fencing_and_wait(op)) {
            /* Waiting for the target to self-fence: mark the operation as
             * executing so it is no longer treated as being in its query
             * state.
             */
            op->state = st_exec;
            return;
        }

        if (op->state == st_query) {
            crm_info("No peers (out of %d) have devices capable of fencing "
                     "(%s) %s for client %s " CRM_XS " state=%s",
                     op->replies, op->action, op->target, op->client_name,
                     stonith_op_state_str(op->state));

            pcmk__reset_result(&op->result);
            pcmk__set_result(&op->result, CRM_EX_ERROR,
                             PCMK_EXEC_NO_FENCE_DEVICE, NULL);
        } else {
            if (pcmk_is_set(op->call_options, st_opt_topology)) {
                pcmk__reset_result(&op->result);
                pcmk__set_result(&op->result, CRM_EX_ERROR,
                                 PCMK_EXEC_NO_FENCE_DEVICE, NULL);
            }
            // Without topology, the existing result is left untouched here

            crm_info("No peers (out of %d) are capable of fencing (%s) %s "
                     "for client %s " CRM_XS " state=%s",
                     op->replies, op->action, op->target, op->client_name,
                     stonith_op_state_str(op->state));
        }

        op->state = st_failed;
        finalize_op(op, NULL, false);

    } else {
        // More query replies are still expected, so just keep waiting
        crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
                 "for client %s " CRM_XS " id=%.8s",
                 op->action, op->target, (device? " using " : ""),
                 (device? device : ""), op->client_name, op->id);
    }
}
2083
2084
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095 static gint
2096 sort_peers(gconstpointer a, gconstpointer b)
2097 {
2098 const peer_device_info_t *peer_a = a;
2099 const peer_device_info_t *peer_b = b;
2100
2101 return (peer_b->ndevices - peer_a->ndevices);
2102 }
2103
2104
2105
2106
2107
2108
2109
2110 static gboolean
2111 all_topology_devices_found(const remote_fencing_op_t *op)
2112 {
2113 GList *device = NULL;
2114 GList *iter = NULL;
2115 device_properties_t *match = NULL;
2116 stonith_topology_t *tp = NULL;
2117 gboolean skip_target = FALSE;
2118 int i;
2119
2120 tp = find_topology_for_host(op->target);
2121 if (!tp) {
2122 return FALSE;
2123 }
2124 if (pcmk__is_fencing_action(op->action)) {
2125
2126
2127 skip_target = TRUE;
2128 }
2129
2130 for (i = 0; i < ST__LEVEL_COUNT; i++) {
2131 for (device = tp->levels[i]; device; device = device->next) {
2132 match = NULL;
2133 for (iter = op->query_results; iter && !match; iter = iter->next) {
2134 peer_device_info_t *peer = iter->data;
2135
2136 if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
2137 continue;
2138 }
2139 match = find_peer_device(op, peer, device->data, st_device_supports_none);
2140 }
2141 if (!match) {
2142 return FALSE;
2143 }
2144 }
2145 }
2146
2147 return TRUE;
2148 }
2149
2150
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162 static void
2163 parse_action_specific(const xmlNode *xml, const char *peer, const char *device,
2164 const char *action, remote_fencing_op_t *op,
2165 enum st_remap_phase phase, device_properties_t *props)
2166 {
2167 props->custom_action_timeout[phase] = 0;
2168 crm_element_value_int(xml, PCMK__XA_ST_ACTION_TIMEOUT,
2169 &props->custom_action_timeout[phase]);
2170 if (props->custom_action_timeout[phase]) {
2171 crm_trace("Peer %s with device %s returned %s action timeout %ds",
2172 peer, device, action, props->custom_action_timeout[phase]);
2173 }
2174
2175 props->delay_max[phase] = 0;
2176 crm_element_value_int(xml, PCMK__XA_ST_DELAY_MAX, &props->delay_max[phase]);
2177 if (props->delay_max[phase]) {
2178 crm_trace("Peer %s with device %s returned maximum of random delay %ds for %s",
2179 peer, device, props->delay_max[phase], action);
2180 }
2181
2182 props->delay_base[phase] = 0;
2183 crm_element_value_int(xml, PCMK__XA_ST_DELAY_BASE,
2184 &props->delay_base[phase]);
2185 if (props->delay_base[phase]) {
2186 crm_trace("Peer %s with device %s returned base delay %ds for %s",
2187 peer, device, props->delay_base[phase], action);
2188 }
2189
2190
2191 if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none)) {
2192 int required = 0;
2193
2194 crm_element_value_int(xml, PCMK__XA_ST_REQUIRED, &required);
2195 if (required) {
2196 crm_trace("Peer %s requires device %s to execute for action %s",
2197 peer, device, action);
2198 add_required_device(op, device);
2199 }
2200 }
2201
2202
2203
2204
2205 if (pcmk__xe_attr_is_true(xml, PCMK__XA_ST_ACTION_DISALLOWED)) {
2206 props->disallowed[phase] = TRUE;
2207 crm_trace("Peer %s is disallowed from executing %s for device %s",
2208 peer, action, device);
2209 }
2210 }
2211
2212
2213
2214
2215
2216
2217
2218
2219
2220
2221 static void
2222 add_device_properties(const xmlNode *xml, remote_fencing_op_t *op,
2223 peer_device_info_t *peer, const char *device)
2224 {
2225 xmlNode *child;
2226 int verified = 0;
2227 device_properties_t *props =
2228 pcmk__assert_alloc(1, sizeof(device_properties_t));
2229 int rc = pcmk_rc_ok;
2230
2231
2232 g_hash_table_insert(peer->devices, pcmk__str_copy(device), props);
2233
2234
2235 crm_element_value_int(xml, PCMK__XA_ST_MONITOR_VERIFIED, &verified);
2236 if (verified) {
2237 crm_trace("Peer %s has confirmed a verified device %s",
2238 peer->host, device);
2239 props->verified = TRUE;
2240 }
2241
2242
2243 rc = pcmk__xe_get_flags(xml, PCMK__XA_ST_DEVICE_SUPPORT_FLAGS,
2244 &(props->device_support_flags),
2245 st_device_supports_on);
2246 if (rc != pcmk_rc_ok) {
2247 crm_warn("Couldn't determine device support for %s "
2248 "(assuming unfencing): %s", device, pcmk_rc_str(rc));
2249 }
2250
2251
2252 parse_action_specific(xml, peer->host, device, op_requested_action(op),
2253 op, st_phase_requested, props);
2254 for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL;
2255 child = pcmk__xe_next(child)) {
2256
2257
2258
2259
2260 if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_OFF, pcmk__str_none)) {
2261 parse_action_specific(child, peer->host, device, PCMK_ACTION_OFF,
2262 op, st_phase_off, props);
2263
2264 } else if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_ON,
2265 pcmk__str_none)) {
2266 parse_action_specific(child, peer->host, device, PCMK_ACTION_ON,
2267 op, st_phase_on, props);
2268 }
2269 }
2270 }
2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283 static peer_device_info_t *
2284 add_result(remote_fencing_op_t *op, const char *host, int ndevices,
2285 const xmlNode *xml)
2286 {
2287 peer_device_info_t *peer = pcmk__assert_alloc(1,
2288 sizeof(peer_device_info_t));
2289 xmlNode *child;
2290
2291 peer->host = pcmk__str_copy(host);
2292 peer->devices = pcmk__strkey_table(free, free);
2293
2294
2295 for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL;
2296 child = pcmk__xe_next(child)) {
2297 const char *device = pcmk__xe_id(child);
2298
2299 if (device) {
2300 add_device_properties(child, op, peer, device);
2301 }
2302 }
2303
2304 peer->ndevices = g_hash_table_size(peer->devices);
2305 CRM_CHECK(ndevices == peer->ndevices,
2306 crm_err("Query claimed to have %d device%s but %d found",
2307 ndevices, pcmk__plural_s(ndevices), peer->ndevices));
2308
2309 op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers);
2310 return peer;
2311 }
2312
2313
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
/*!
 * \internal
 * \brief Process a peer's reply to a fencing query
 *
 * Record the reply's devices (if any) in the matching operation's query
 * results, then request fencing if enough information has been gathered.
 *
 * \param[in] msg  XML reply received
 *
 * \return pcmk_ok on success, -EPROTO if the reply lacks the operation ID or
 *         the available-devices attribute, or -EOPNOTSUPP if the operation
 *         is not in the table of known operations
 */
int
process_remote_stonith_query(xmlNode *msg)
{
    int ndevices = 0;
    gboolean host_is_target = FALSE;
    gboolean have_all_replies = FALSE;
    const char *id = NULL;
    const char *host = NULL;
    remote_fencing_op_t *op = NULL;
    peer_device_info_t *peer = NULL;
    uint32_t replies_expected;
    xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_REMOTE_OP, msg, LOG_ERR);

    CRM_CHECK(dev != NULL, return -EPROTO);

    id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
    CRM_CHECK(id != NULL, return -EPROTO);

    // From here on, dev is the element carrying the available-devices count
    dev = get_xpath_object("//@" PCMK__XA_ST_AVAILABLE_DEVICES, msg, LOG_ERR);
    CRM_CHECK(dev != NULL, return -EPROTO);
    crm_element_value_int(dev, PCMK__XA_ST_AVAILABLE_DEVICES, &ndevices);

    op = g_hash_table_lookup(stonith_remote_op_list, id);
    if (op == NULL) {
        crm_debug("Received query reply for unknown or expired operation %s",
                  id);
        return -EOPNOTSUPP;
    }

    // Expect the smaller of the active peer count and the op's expectation
    replies_expected = fencing_active_peers();
    if (op->replies_expected < replies_expected) {
        replies_expected = op->replies_expected;
    }

    // Count this reply (the counter is incremented even if not in st_query)
    if ((++op->replies >= replies_expected) && (op->state == st_query)) {
        have_all_replies = TRUE;
    }
    host = crm_element_value(msg, PCMK__XA_SRC);
    host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);

    crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
             op->replies, replies_expected, host,
             op->target, op->action, ndevices, pcmk__plural_s(ndevices), id);

    // Only replies that report at least one device are recorded
    if (ndevices > 0) {
        peer = add_result(op, host, ndevices, dev);
    }

    pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);

    if (pcmk_is_set(op->call_options, st_opt_topology)) {
        /* With topology, start fencing once every topology device has been
         * reported, or once all replies are in.
         */
        if (op->state == st_query && all_topology_devices_found(op)) {

            crm_trace("All topology devices found");
            request_peer_fencing(op, peer);

        } else if (have_all_replies) {
            crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
                     replies_expected, op->replies);
            request_peer_fencing(op, NULL);
        }

    } else if (op->state == st_query) {
        int nverified = count_peer_devices(op, peer, TRUE,
                                           fenced_support_flag(op->action));

        /* Without topology, start fencing early if a peer other than the
         * target has a verified device.
         */
        if ((peer != NULL) && !host_is_target && nverified) {

            crm_trace("Found %d verified device%s",
                      nverified, pcmk__plural_s(nverified));
            request_peer_fencing(op, peer);

        } else if (have_all_replies) {
            crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
                     replies_expected, op->replies);
            request_peer_fencing(op, NULL);

        } else {
            crm_trace("Waiting for more peer results before launching fencing operation");
        }

    } else if ((peer != NULL) && (op->state == st_done)) {
        // The operation already completed, so this result is only logged
        crm_info("Discarding query result from %s (%d device%s): "
                 "Operation is %s", peer->host,
                 peer->ndevices, pcmk__plural_s(peer->ndevices),
                 stonith_op_state_str(op->state));
    }

    return pcmk_ok;
}
2420
2421
2422
2423
2424
2425
2426
2427
2428
2429
2430 void
2431 fenced_process_fencing_reply(xmlNode *msg)
2432 {
2433 const char *id = NULL;
2434 const char *device = NULL;
2435 remote_fencing_op_t *op = NULL;
2436 xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_REMOTE_OP, msg, LOG_ERR);
2437 pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
2438
2439 CRM_CHECK(dev != NULL, return);
2440
2441 id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
2442 CRM_CHECK(id != NULL, return);
2443
2444 dev = stonith__find_xe_with_result(msg);
2445 CRM_CHECK(dev != NULL, return);
2446
2447 stonith__xe_get_result(dev, &result);
2448
2449 device = crm_element_value(dev, PCMK__XA_ST_DEVICE_ID);
2450
2451 if (stonith_remote_op_list) {
2452 op = g_hash_table_lookup(stonith_remote_op_list, id);
2453 }
2454
2455 if ((op == NULL) && pcmk__result_ok(&result)) {
2456
2457 const char *client_id = crm_element_value(dev, PCMK__XA_ST_CLIENTID);
2458
2459 op = create_remote_stonith_op(client_id, dev, TRUE);
2460 }
2461
2462 if (op == NULL) {
2463
2464
2465 crm_info("Received peer result of unknown or expired operation %s", id);
2466 pcmk__reset_result(&result);
2467 return;
2468 }
2469
2470 pcmk__reset_result(&op->result);
2471 op->result = result;
2472
2473 if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
2474 crm_err("Received outdated reply for device %s (instead of %s) to "
2475 "fence (%s) %s. Operation already timed out at peer level.",
2476 device, (const char *) op->devices->data, op->action, op->target);
2477 return;
2478 }
2479
2480 if (pcmk__str_eq(crm_element_value(msg, PCMK__XA_SUBT),
2481 PCMK__VALUE_BROADCAST, pcmk__str_none)) {
2482
2483 if (pcmk__result_ok(&op->result)) {
2484 op->state = st_done;
2485 } else {
2486 op->state = st_failed;
2487 }
2488 finalize_op(op, msg, false);
2489 return;
2490
2491 } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
2492
2493
2494 crm_err("Received non-broadcast fencing result for operation %.8s "
2495 "we do not own (device %s targeting %s)",
2496 op->id, device, op->target);
2497 return;
2498 }
2499
2500 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2501 const char *device = NULL;
2502 const char *reason = op->result.exit_reason;
2503
2504
2505
2506 if (op->state == st_done) {
2507 finalize_op(op, msg, false);
2508 return;
2509 }
2510
2511 device = crm_element_value(msg, PCMK__XA_ST_DEVICE_ID);
2512
2513 if ((op->phase == 2) && !pcmk__result_ok(&op->result)) {
2514
2515
2516
2517 crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s "
2518 "after successful 'off'",
2519 device, pcmk_exec_status_str(op->result.execution_status),
2520 (reason == NULL)? "" : ": ",
2521 (reason == NULL)? "" : reason,
2522 op->target);
2523 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2524 } else {
2525 crm_notice("Action '%s' targeting %s%s%s on behalf of %s@%s: "
2526 "%s%s%s%s",
2527 op->action, op->target,
2528 ((device == NULL)? "" : " using "),
2529 ((device == NULL)? "" : device),
2530 op->client_name,
2531 op->originator,
2532 pcmk_exec_status_str(op->result.execution_status),
2533 (reason == NULL)? "" : " (",
2534 (reason == NULL)? "" : reason,
2535 (reason == NULL)? "" : ")");
2536 }
2537
2538 if (pcmk__result_ok(&op->result)) {
2539
2540
2541 advance_topology_device_in_level(op, device, msg);
2542 return;
2543 } else {
2544
2545
2546 if (advance_topology_level(op, false) != pcmk_rc_ok) {
2547 op->state = st_failed;
2548 finalize_op(op, msg, false);
2549 return;
2550 }
2551 }
2552
2553 } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) {
2554 op->state = st_done;
2555 finalize_op(op, msg, false);
2556 return;
2557
2558 } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT)
2559 && (op->devices == NULL)) {
2560
2561 op->state = st_failed;
2562 finalize_op(op, msg, false);
2563 return;
2564
2565 } else {
2566
2567 }
2568
2569
2570 crm_trace("Next for %s on behalf of %s@%s (result was: %s)",
2571 op->target, op->originator, op->client_name,
2572 pcmk_exec_status_str(op->result.execution_status));
2573 request_peer_fencing(op, NULL);
2574 }
2575
2576 gboolean
2577 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
2578 {
2579 GHashTableIter iter;
2580 time_t now = time(NULL);
2581 remote_fencing_op_t *rop = NULL;
2582
2583 if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
2584 action == NULL) {
2585 return FALSE;
2586 }
2587
2588 g_hash_table_iter_init(&iter, stonith_remote_op_list);
2589 while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2590 if (strcmp(rop->target, target) != 0) {
2591 continue;
2592 } else if (rop->state != st_done) {
2593 continue;
2594
2595
2596
2597 } else if (strcmp(rop->action, action) != 0) {
2598 continue;
2599 } else if ((rop->completed + tolerance) < now) {
2600 continue;
2601 }
2602
2603 crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2604 target, action, tolerance, rop->delegate, rop->originator);
2605 return TRUE;
2606 }
2607 return FALSE;
2608 }