This source file includes the following definitions.
- sort_strings
- free_remote_query
- free_stonith_remote_op_list
- count_peer_device
- count_peer_devices
- find_peer_device
- grab_peer_device
- clear_remote_op_timers
- free_remote_op
- init_stonith_remote_op_hash_table
- op_requested_action
- op_phase_off
- op_phase_on
- undo_op_remap
- fencing_result2xml
- fenced_broadcast_op_result
- handle_local_reply_and_notify
- finalize_op_duplicates
- delegate_from_xml
- finalize_op
- remote_op_watchdog_done
- remote_op_timeout_one
- finalize_timed_out_op
- remote_op_timeout
- remote_op_query_timeout
- topology_is_empty
- add_required_device
- remove_required_device
- set_op_device_list
- topology_matches
- find_topology_for_host
- advance_topology_level
- merge_duplicates
- fencing_active_peers
- fenced_handle_manual_confirmation
- create_remote_stonith_op
- initiate_remote_stonith_op
- is_watchdog_fencing
- find_best_peer
- stonith_choose_peer
- valid_fencing_timeout
- get_device_timeout
- add_device_timeout
- get_peer_timeout
- get_op_total_timeout
- report_timeout_period
- advance_topology_device_in_level
- check_watchdog_fencing_and_wait
- request_peer_fencing
- sort_peers
- all_topology_devices_found
- parse_action_specific
- add_device_properties
- add_result
- process_remote_stonith_query
- fenced_process_fencing_reply
- stonith_check_fence_tolerance
1
2
3
4
5
6
7
8
9
10 #include <crm_internal.h>
11
12 #include <sys/param.h>
13 #include <stdio.h>
14 #include <sys/types.h>
15 #include <sys/wait.h>
16 #include <sys/stat.h>
17 #include <unistd.h>
18 #include <sys/utsname.h>
19
20 #include <stdlib.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <ctype.h>
24 #include <regex.h>
25
26 #include <crm/crm.h>
27 #include <crm/common/ipc.h>
28 #include <crm/common/ipc_internal.h>
29 #include <crm/cluster/internal.h>
30
31 #include <crm/stonith-ng.h>
32 #include <crm/fencing/internal.h>
33 #include <crm/common/xml.h>
34 #include <crm/common/xml_internal.h>
35
36 #include <crm/common/util.h>
37 #include <pacemaker-fenced.h>
38
39 #define TIMEOUT_MULTIPLY_FACTOR 1.2
40
41
42
43
44
45
46
47 typedef struct device_properties_s {
48
49 gboolean verified;
50
51
52
53
54 gboolean executed[st_phase_max];
55
56 gboolean disallowed[st_phase_max];
57
58 int custom_action_timeout[st_phase_max];
59
60 int delay_max[st_phase_max];
61
62 int delay_base[st_phase_max];
63
64 uint32_t device_support_flags;
65 } device_properties_t;
66
67 typedef struct {
68
69 char *host;
70
71 gboolean tried;
72
73 int ndevices;
74
75 GHashTable *devices;
76 } peer_device_info_t;
77
78 GHashTable *stonith_remote_op_list = NULL;
79
80 extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
81 int call_options);
82
83 static void request_peer_fencing(remote_fencing_op_t *op,
84 peer_device_info_t *peer);
85 static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup);
86 static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
87 static int get_op_total_timeout(const remote_fencing_op_t *op,
88 const peer_device_info_t *chosen_peer);
89
90 static gint
91 sort_strings(gconstpointer a, gconstpointer b)
92 {
93 return strcmp(a, b);
94 }
95
96 static void
97 free_remote_query(gpointer data)
98 {
99 if (data != NULL) {
100 peer_device_info_t *peer = data;
101
102 g_hash_table_destroy(peer->devices);
103 free(peer->host);
104 free(peer);
105 }
106 }
107
108 void
109 free_stonith_remote_op_list(void)
110 {
111 if (stonith_remote_op_list != NULL) {
112 g_hash_table_destroy(stonith_remote_op_list);
113 stonith_remote_op_list = NULL;
114 }
115 }
116
117 struct peer_count_data {
118 const remote_fencing_op_t *op;
119 gboolean verified_only;
120 uint32_t support_action_only;
121 int count;
122 };
123
124
125
126
127
128
129
130
131
132 static void
133 count_peer_device(gpointer key, gpointer value, gpointer user_data)
134 {
135 device_properties_t *props = (device_properties_t*)value;
136 struct peer_count_data *data = user_data;
137
138 if (!props->executed[data->op->phase]
139 && (!data->verified_only || props->verified)
140 && ((data->support_action_only == st_device_supports_none) || pcmk_is_set(props->device_support_flags, data->support_action_only))) {
141 ++(data->count);
142 }
143 }
144
145
146
147
148
149
150
151
152
153
154
155
156 static int
157 count_peer_devices(const remote_fencing_op_t *op,
158 const peer_device_info_t *peer, gboolean verified_only, uint32_t support_on_action_only)
159 {
160 struct peer_count_data data;
161
162 data.op = op;
163 data.verified_only = verified_only;
164 data.support_action_only = support_on_action_only;
165 data.count = 0;
166 if (peer) {
167 g_hash_table_foreach(peer->devices, count_peer_device, &data);
168 }
169 return data.count;
170 }
171
172
173
174
175
176
177
178
179
180
181
182 static device_properties_t *
183 find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer,
184 const char *device, uint32_t support_action_only)
185 {
186 device_properties_t *props = g_hash_table_lookup(peer->devices, device);
187
188 if (props && support_action_only != st_device_supports_none && !pcmk_is_set(props->device_support_flags, support_action_only)) {
189 return NULL;
190 }
191 return (props && !props->executed[op->phase]
192 && !props->disallowed[op->phase])? props : NULL;
193 }
194
195
196
197
198
199
200
201
202
203
204
205
206 static gboolean
207 grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer,
208 const char *device, gboolean verified_devices_only)
209 {
210 device_properties_t *props = find_peer_device(op, peer, device,
211 fenced_support_flag(op->action));
212
213 if ((props == NULL) || (verified_devices_only && !props->verified)) {
214 return FALSE;
215 }
216
217 crm_trace("Removing %s from %s (%d remaining)",
218 device, peer->host, count_peer_devices(op, peer, FALSE, st_device_supports_none));
219 props->executed[op->phase] = TRUE;
220 return TRUE;
221 }
222
223 static void
224 clear_remote_op_timers(remote_fencing_op_t * op)
225 {
226 if (op->query_timer) {
227 g_source_remove(op->query_timer);
228 op->query_timer = 0;
229 }
230 if (op->op_timer_total) {
231 g_source_remove(op->op_timer_total);
232 op->op_timer_total = 0;
233 }
234 if (op->op_timer_one) {
235 g_source_remove(op->op_timer_one);
236 op->op_timer_one = 0;
237 }
238 }
239
240 static void
241 free_remote_op(gpointer data)
242 {
243 remote_fencing_op_t *op = data;
244
245 crm_log_xml_debug(op->request, "Destroying");
246
247 clear_remote_op_timers(op);
248
249 free(op->id);
250 free(op->action);
251 free(op->delegate);
252 free(op->target);
253 free(op->client_id);
254 free(op->client_name);
255 free(op->originator);
256
257 if (op->query_results) {
258 g_list_free_full(op->query_results, free_remote_query);
259 }
260 if (op->request) {
261 free_xml(op->request);
262 op->request = NULL;
263 }
264 if (op->devices_list) {
265 g_list_free_full(op->devices_list, free);
266 op->devices_list = NULL;
267 }
268 g_list_free_full(op->automatic_list, free);
269 g_list_free(op->duplicates);
270
271 pcmk__reset_result(&op->result);
272 free(op);
273 }
274
275 void
276 init_stonith_remote_op_hash_table(GHashTable **table)
277 {
278 if (*table == NULL) {
279 *table = pcmk__strkey_table(NULL, free_remote_op);
280 }
281 }
282
283
284
285
286
287
288
289
290
291 static const char *
292 op_requested_action(const remote_fencing_op_t *op)
293 {
294 return ((op->phase > st_phase_requested)? PCMK_ACTION_REBOOT : op->action);
295 }
296
297
298
299
300
301
302
303 static void
304 op_phase_off(remote_fencing_op_t *op)
305 {
306 crm_info("Remapping multiple-device reboot targeting %s to 'off' "
307 CRM_XS " id=%.8s", op->target, op->id);
308 op->phase = st_phase_off;
309
310
311
312
313 strcpy(op->action, PCMK_ACTION_OFF);
314 }
315
316
317
318
319
320
321
322 static void
323 op_phase_on(remote_fencing_op_t *op)
324 {
325 GList *iter = NULL;
326
327 crm_info("Remapped 'off' targeting %s complete, "
328 "remapping to 'on' for %s " CRM_XS " id=%.8s",
329 op->target, op->client_name, op->id);
330 op->phase = st_phase_on;
331 strcpy(op->action, PCMK_ACTION_ON);
332
333
334
335
336 for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
337 GList *match = g_list_find_custom(op->devices_list, iter->data,
338 sort_strings);
339
340 if (match) {
341 op->devices_list = g_list_remove(op->devices_list, match->data);
342 }
343 }
344 g_list_free_full(op->automatic_list, free);
345 op->automatic_list = NULL;
346
347
348 op->devices = op->devices_list;
349 }
350
351
352
353
354
355
356
357 static void
358 undo_op_remap(remote_fencing_op_t *op)
359 {
360 if (op->phase > 0) {
361 crm_info("Undoing remap of reboot targeting %s for %s "
362 CRM_XS " id=%.8s", op->target, op->client_name, op->id);
363 op->phase = st_phase_requested;
364 strcpy(op->action, PCMK_ACTION_REBOOT);
365 }
366 }
367
368
369
370
371
372
373
374
375
376
377
378 static xmlNode *
379 fencing_result2xml(xmlNode *parent, const remote_fencing_op_t *op)
380 {
381 xmlNode *notify_data = pcmk__xe_create(parent, PCMK__XE_ST_NOTIFY_FENCE);
382
383 crm_xml_add_int(notify_data, PCMK_XA_STATE, op->state);
384 crm_xml_add(notify_data, PCMK__XA_ST_TARGET, op->target);
385 crm_xml_add(notify_data, PCMK__XA_ST_DEVICE_ACTION, op->action);
386 crm_xml_add(notify_data, PCMK__XA_ST_DELEGATE, op->delegate);
387 crm_xml_add(notify_data, PCMK__XA_ST_REMOTE_OP, op->id);
388 crm_xml_add(notify_data, PCMK__XA_ST_ORIGIN, op->originator);
389 crm_xml_add(notify_data, PCMK__XA_ST_CLIENTID, op->client_id);
390 crm_xml_add(notify_data, PCMK__XA_ST_CLIENTNAME, op->client_name);
391
392 return notify_data;
393 }
394
395
396
397
398
399
400
401
402 void
403 fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged)
404 {
405 static int count = 0;
406 xmlNode *bcast = pcmk__xe_create(NULL, PCMK__XE_ST_REPLY);
407 xmlNode *wrapper = NULL;
408 xmlNode *notify_data = NULL;
409
410 count++;
411 crm_trace("Broadcasting result to peers");
412 crm_xml_add(bcast, PCMK__XA_T, PCMK__VALUE_ST_NOTIFY);
413 crm_xml_add(bcast, PCMK__XA_SUBT, PCMK__VALUE_BROADCAST);
414 crm_xml_add(bcast, PCMK__XA_ST_OP, STONITH_OP_NOTIFY);
415 crm_xml_add_int(bcast, PCMK_XA_COUNT, count);
416
417 if (op_merged) {
418 pcmk__xe_set_bool_attr(bcast, PCMK__XA_ST_OP_MERGED, true);
419 }
420
421 wrapper = pcmk__xe_create(bcast, PCMK__XE_ST_CALLDATA);
422 notify_data = fencing_result2xml(wrapper, op);
423 stonith__xe_set_result(notify_data, &op->result);
424
425 pcmk__cluster_send_message(NULL, crm_msg_stonith_ng, bcast);
426 free_xml(bcast);
427
428 return;
429 }
430
431
432
433
434
435
436
437
438 static void
439 handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data)
440 {
441 xmlNode *notify_data = NULL;
442 xmlNode *reply = NULL;
443 pcmk__client_t *client = NULL;
444
445 if (op->notify_sent == TRUE) {
446
447 return;
448 }
449
450
451 crm_xml_add_int(data, PCMK_XA_STATE, op->state);
452 crm_xml_add(data, PCMK__XA_ST_TARGET, op->target);
453 crm_xml_add(data, PCMK__XA_ST_OP, op->action);
454
455 reply = fenced_construct_reply(op->request, data, &op->result);
456 crm_xml_add(reply, PCMK__XA_ST_DELEGATE, op->delegate);
457
458
459 client = pcmk__find_client_by_id(op->client_id);
460 if (client == NULL) {
461 crm_trace("Skipping reply to %s: no longer a client", op->client_id);
462 } else {
463 do_local_reply(reply, client, op->call_options);
464 }
465
466
467 notify_data = fencing_result2xml(NULL, op);
468 fenced_send_notification(PCMK__VALUE_ST_NOTIFY_FENCE, &op->result,
469 notify_data);
470 free_xml(notify_data);
471 fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL);
472
473
474 op->notify_sent = TRUE;
475 free_xml(reply);
476 }
477
478
479
480
481
482
483
484
485 static void
486 finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data)
487 {
488 for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) {
489 remote_fencing_op_t *other = iter->data;
490
491 if (other->state == st_duplicate) {
492 other->state = op->state;
493 crm_debug("Performing duplicate notification for %s@%s: %s "
494 CRM_XS " id=%.8s",
495 other->client_name, other->originator,
496 pcmk_exec_status_str(op->result.execution_status),
497 other->id);
498 pcmk__copy_result(&op->result, &other->result);
499 finalize_op(other, data, true);
500
501 } else {
502
503 crm_err("Skipping duplicate notification for %s@%s "
504 CRM_XS " state=%s id=%.8s",
505 other->client_name, other->originator,
506 stonith_op_state_str(other->state), other->id);
507 }
508 }
509 }
510
511 static char *
512 delegate_from_xml(xmlNode *xml)
513 {
514 xmlNode *match = get_xpath_object("//@" PCMK__XA_ST_DELEGATE, xml,
515 LOG_NEVER);
516
517 if (match == NULL) {
518 return crm_element_value_copy(xml, PCMK__XA_SRC);
519 } else {
520 return crm_element_value_copy(match, PCMK__XA_ST_DELEGATE);
521 }
522 }
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540 static void
541 finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup)
542 {
543 int level = LOG_ERR;
544 const char *subt = NULL;
545 xmlNode *local_data = NULL;
546 gboolean op_merged = FALSE;
547
548 CRM_CHECK((op != NULL), return);
549
550
551 clear_remote_op_timers(op);
552
553 if (op->notify_sent) {
554
555 crm_notice("Operation '%s'%s%s by %s for %s@%s%s: "
556 "Result arrived too late " CRM_XS " id=%.8s",
557 op->action, (op->target? " targeting " : ""),
558 (op->target? op->target : ""),
559 (op->delegate? op->delegate : "unknown node"),
560 op->client_name, op->originator,
561 (op_merged? " (merged)" : ""),
562 op->id);
563 return;
564 }
565
566 set_fencing_completed(op);
567 undo_op_remap(op);
568
569 if (data == NULL) {
570 data = pcmk__xe_create(NULL, "remote-op");
571 local_data = data;
572
573 } else if (op->delegate == NULL) {
574 switch (op->result.execution_status) {
575 case PCMK_EXEC_NO_FENCE_DEVICE:
576 break;
577
578 case PCMK_EXEC_INVALID:
579 if (op->result.exit_status != CRM_EX_EXPIRED) {
580 op->delegate = delegate_from_xml(data);
581 }
582 break;
583
584 default:
585 op->delegate = delegate_from_xml(data);
586 break;
587 }
588 }
589
590 if (dup || (crm_element_value(data, PCMK__XA_ST_OP_MERGED) != NULL)) {
591 op_merged = true;
592 }
593
594
595
596
597 subt = crm_element_value(data, PCMK__XA_SUBT);
598 if (!dup && !pcmk__str_eq(subt, PCMK__VALUE_BROADCAST, pcmk__str_none)) {
599
600 fenced_broadcast_op_result(op, op_merged);
601 free_xml(local_data);
602 return;
603 }
604
605 if (pcmk__result_ok(&op->result) || dup
606 || !pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
607 level = LOG_NOTICE;
608 }
609 do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) "
610 CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""),
611 (op->target? op->target : ""),
612 (op->delegate? op->delegate : "unknown node"),
613 op->client_name, op->originator,
614 (op_merged? " (merged)" : ""),
615 crm_exit_str(op->result.exit_status),
616 pcmk_exec_status_str(op->result.execution_status),
617 ((op->result.exit_reason == NULL)? "" : ": "),
618 ((op->result.exit_reason == NULL)? "" : op->result.exit_reason),
619 op->id);
620
621 handle_local_reply_and_notify(op, data);
622
623 if (!dup) {
624 finalize_op_duplicates(op, data);
625 }
626
627
628
629
630 if (op->query_results) {
631 g_list_free_full(op->query_results, free_remote_query);
632 op->query_results = NULL;
633 }
634 if (op->request) {
635 free_xml(op->request);
636 op->request = NULL;
637 }
638
639 free_xml(local_data);
640 }
641
642
643
644
645
646
647
648
649
650 static gboolean
651 remote_op_watchdog_done(gpointer userdata)
652 {
653 remote_fencing_op_t *op = userdata;
654
655 op->op_timer_one = 0;
656
657 crm_notice("Self-fencing (%s) by %s for %s assumed complete "
658 CRM_XS " id=%.8s",
659 op->action, op->target, op->client_name, op->id);
660 op->state = st_done;
661 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
662 finalize_op(op, NULL, false);
663 return G_SOURCE_REMOVE;
664 }
665
666 static gboolean
667 remote_op_timeout_one(gpointer userdata)
668 {
669 remote_fencing_op_t *op = userdata;
670
671 op->op_timer_one = 0;
672
673 crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS
674 " id=%.8s", op->action, op->target, op->client_name, op->id);
675 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
676 "Peer did not return fence result within timeout");
677
678
679 if (op->client_delay > 0) {
680 op->client_delay = 0;
681 crm_trace("Try another device for '%s' action targeting %s "
682 "for client %s without delay " CRM_XS " id=%.8s",
683 op->action, op->target, op->client_name, op->id);
684 }
685
686
687 request_peer_fencing(op, NULL);
688 return G_SOURCE_REMOVE;
689 }
690
691
692
693
694
695
696
697
698 static void
699 finalize_timed_out_op(remote_fencing_op_t *op, const char *reason)
700 {
701 crm_debug("Action '%s' targeting %s for client %s timed out "
702 CRM_XS " id=%.8s",
703 op->action, op->target, op->client_name, op->id);
704
705 if (op->phase == st_phase_on) {
706
707
708
709
710 op->state = st_done;
711 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
712 } else {
713 op->state = st_failed;
714 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason);
715 }
716 finalize_op(op, NULL, false);
717 }
718
719
720
721
722
723
724
725
726
727 static gboolean
728 remote_op_timeout(gpointer userdata)
729 {
730 remote_fencing_op_t *op = userdata;
731
732 op->op_timer_total = 0;
733
734 if (op->state == st_done) {
735 crm_debug("Action '%s' targeting %s for client %s already completed "
736 CRM_XS " id=%.8s",
737 op->action, op->target, op->client_name, op->id);
738 } else {
739 finalize_timed_out_op(userdata, "Fencing did not complete within a "
740 "total timeout based on the "
741 "configured timeout and retries for "
742 "any devices attempted");
743 }
744 return G_SOURCE_REMOVE;
745 }
746
747 static gboolean
748 remote_op_query_timeout(gpointer data)
749 {
750 remote_fencing_op_t *op = data;
751
752 op->query_timer = 0;
753
754 if (op->state == st_done) {
755 crm_debug("Operation %.8s targeting %s already completed",
756 op->id, op->target);
757 } else if (op->state == st_exec) {
758 crm_debug("Operation %.8s targeting %s already in progress",
759 op->id, op->target);
760 } else if (op->query_results) {
761
762 crm_debug("Query %.8s targeting %s complete (state=%s)",
763 op->id, op->target, stonith_op_state_str(op->state));
764 request_peer_fencing(op, NULL);
765 } else {
766 crm_debug("Query %.8s targeting %s timed out (state=%s)",
767 op->id, op->target, stonith_op_state_str(op->state));
768 finalize_timed_out_op(op, "No capable peers replied to device query "
769 "within timeout");
770 }
771
772 return G_SOURCE_REMOVE;
773 }
774
775 static gboolean
776 topology_is_empty(stonith_topology_t *tp)
777 {
778 int i;
779
780 if (tp == NULL) {
781 return TRUE;
782 }
783
784 for (i = 0; i < ST__LEVEL_COUNT; i++) {
785 if (tp->levels[i] != NULL) {
786 return FALSE;
787 }
788 }
789 return TRUE;
790 }
791
792
793
794
795
796
797
798
799 static void
800 add_required_device(remote_fencing_op_t *op, const char *device)
801 {
802 GList *match = g_list_find_custom(op->automatic_list, device,
803 sort_strings);
804
805 if (!match) {
806 op->automatic_list = g_list_prepend(op->automatic_list,
807 pcmk__str_copy(device));
808 }
809 }
810
811
812
813
814
815
816
817
818 static void
819 remove_required_device(remote_fencing_op_t *op, const char *device)
820 {
821 GList *match = g_list_find_custom(op->automatic_list, device,
822 sort_strings);
823
824 if (match) {
825 op->automatic_list = g_list_remove(op->automatic_list, match->data);
826 }
827 }
828
829
830 static void
831 set_op_device_list(remote_fencing_op_t * op, GList *devices)
832 {
833 GList *lpc = NULL;
834
835 if (op->devices_list) {
836 g_list_free_full(op->devices_list, free);
837 op->devices_list = NULL;
838 }
839 for (lpc = devices; lpc != NULL; lpc = lpc->next) {
840 const char *device = lpc->data;
841
842 op->devices_list = g_list_append(op->devices_list,
843 pcmk__str_copy(device));
844 }
845 op->devices = op->devices_list;
846 }
847
848
849
850
851
852
853
854
855
856
857 static gboolean
858 topology_matches(const stonith_topology_t *tp, const char *node)
859 {
860 regex_t r_patt;
861
862 CRM_CHECK(node && tp && tp->target, return FALSE);
863 switch (tp->kind) {
864 case fenced_target_by_attribute:
865
866
867
868
869
870
871 if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
872 crm_notice("Matched %s with %s by attribute", node, tp->target);
873 return TRUE;
874 }
875 break;
876
877 case fenced_target_by_pattern:
878
879
880
881 if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
882 crm_info("Bad regex '%s' for fencing level", tp->target);
883 } else {
884 int status = regexec(&r_patt, node, 0, NULL, 0);
885
886 regfree(&r_patt);
887 if (status == 0) {
888 crm_notice("Matched %s with %s by name", node, tp->target);
889 return TRUE;
890 }
891 }
892 break;
893
894 case fenced_target_by_name:
895 crm_trace("Testing %s against %s", node, tp->target);
896 return pcmk__str_eq(tp->target, node, pcmk__str_casei);
897
898 default:
899 break;
900 }
901 crm_trace("No match for %s with %s", node, tp->target);
902 return FALSE;
903 }
904
905 stonith_topology_t *
906 find_topology_for_host(const char *host)
907 {
908 GHashTableIter tIter;
909 stonith_topology_t *tp = g_hash_table_lookup(topology, host);
910
911 if(tp != NULL) {
912 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
913 return tp;
914 }
915
916 g_hash_table_iter_init(&tIter, topology);
917 while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
918 if (topology_matches(tp, host)) {
919 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
920 return tp;
921 }
922 }
923
924 crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
925 return NULL;
926 }
927
928
929
930
931
932
933
934
935
936
937
938
939 static int
940 advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
941 {
942 stonith_topology_t *tp = NULL;
943
944 if (op->target) {
945 tp = find_topology_for_host(op->target);
946 }
947 if (topology_is_empty(tp)) {
948 return empty_ok? pcmk_rc_ok : ENODEV;
949 }
950
951 CRM_ASSERT(tp->levels != NULL);
952
953 stonith__set_call_options(op->call_options, op->id, st_opt_topology);
954
955
956 undo_op_remap(op);
957
958 do {
959 op->level++;
960
961 } while (op->level < ST__LEVEL_COUNT && tp->levels[op->level] == NULL);
962
963 if (op->level < ST__LEVEL_COUNT) {
964 crm_trace("Attempting fencing level %d targeting %s (%d devices) "
965 "for client %s@%s (id=%.8s)",
966 op->level, op->target, g_list_length(tp->levels[op->level]),
967 op->client_name, op->originator, op->id);
968 set_op_device_list(op, tp->levels[op->level]);
969
970
971 if ((op->level > 1) && (op->client_delay > 0)) {
972 op->client_delay = 0;
973 }
974
975 if ((g_list_next(op->devices_list) != NULL)
976 && pcmk__str_eq(op->action, PCMK_ACTION_REBOOT, pcmk__str_none)) {
977
978
979
980
981
982 op_phase_off(op);
983 }
984 return pcmk_rc_ok;
985 }
986
987 crm_info("All %sfencing options targeting %s for client %s@%s failed "
988 CRM_XS " id=%.8s",
989 (stonith_watchdog_timeout_ms > 0)?"non-watchdog ":"",
990 op->target, op->client_name, op->originator, op->id);
991 return ENODEV;
992 }
993
994
995
996
997
998
999
1000 static void
1001 merge_duplicates(remote_fencing_op_t *op)
1002 {
1003 GHashTableIter iter;
1004 remote_fencing_op_t *other = NULL;
1005
1006 time_t now = time(NULL);
1007
1008 g_hash_table_iter_init(&iter, stonith_remote_op_list);
1009 while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
1010 const char *other_action = op_requested_action(other);
1011 crm_node_t *node = NULL;
1012
1013 if (!strcmp(op->id, other->id)) {
1014 continue;
1015 }
1016 if (other->state > st_exec) {
1017 crm_trace("%.8s not duplicate of %.8s: not in progress",
1018 op->id, other->id);
1019 continue;
1020 }
1021 if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
1022 crm_trace("%.8s not duplicate of %.8s: node %s vs. %s",
1023 op->id, other->id, op->target, other->target);
1024 continue;
1025 }
1026 if (!pcmk__str_eq(op->action, other_action, pcmk__str_none)) {
1027 crm_trace("%.8s not duplicate of %.8s: action %s vs. %s",
1028 op->id, other->id, op->action, other_action);
1029 continue;
1030 }
1031 if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
1032 crm_trace("%.8s not duplicate of %.8s: same client %s",
1033 op->id, other->id, op->client_name);
1034 continue;
1035 }
1036 if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
1037 crm_trace("%.8s not duplicate of %.8s: suicide for %s",
1038 op->id, other->id, other->target);
1039 continue;
1040 }
1041
1042 node = pcmk__get_node(0, other->originator, NULL,
1043 pcmk__node_search_cluster_member);
1044
1045 if (!fencing_peer_active(node)) {
1046 crm_notice("Failing action '%s' targeting %s originating from "
1047 "client %s@%s: Originator is dead " CRM_XS " id=%.8s",
1048 other->action, other->target, other->client_name,
1049 other->originator, other->id);
1050 crm_trace("%.8s not duplicate of %.8s: originator dead",
1051 op->id, other->id);
1052 other->state = st_failed;
1053 continue;
1054 }
1055 if ((other->total_timeout > 0)
1056 && (now > (other->total_timeout + other->created))) {
1057 crm_trace("%.8s not duplicate of %.8s: old (%lld vs. %lld + %ds)",
1058 op->id, other->id, (long long)now, (long long)other->created,
1059 other->total_timeout);
1060 continue;
1061 }
1062
1063
1064
1065
1066 other->duplicates = g_list_append(other->duplicates, op);
1067 if (other->total_timeout == 0) {
1068 other->total_timeout = op->total_timeout =
1069 TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
1070 crm_trace("Best guess as to timeout used for %.8s: %ds",
1071 other->id, other->total_timeout);
1072 }
1073 crm_notice("Merging fencing action '%s' targeting %s originating from "
1074 "client %s with identical request from %s@%s "
1075 CRM_XS " original=%.8s duplicate=%.8s total_timeout=%ds",
1076 op->action, op->target, op->client_name,
1077 other->client_name, other->originator,
1078 op->id, other->id, other->total_timeout);
1079 report_timeout_period(op, other->total_timeout);
1080 op->state = st_duplicate;
1081 }
1082 }
1083
1084 static uint32_t fencing_active_peers(void)
1085 {
1086 uint32_t count = 0;
1087 crm_node_t *entry;
1088 GHashTableIter gIter;
1089
1090 g_hash_table_iter_init(&gIter, crm_peer_cache);
1091 while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
1092 if(fencing_peer_active(entry)) {
1093 count++;
1094 }
1095 }
1096 return count;
1097 }
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108 int
1109 fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg)
1110 {
1111 remote_fencing_op_t *op = NULL;
1112 xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, msg, LOG_ERR);
1113
1114 CRM_CHECK(dev != NULL, return EPROTO);
1115
1116 crm_notice("Received manual confirmation that %s has been fenced",
1117 pcmk__s(crm_element_value(dev, PCMK__XA_ST_TARGET),
1118 "unknown target"));
1119 op = initiate_remote_stonith_op(client, msg, TRUE);
1120 if (op == NULL) {
1121 return EPROTO;
1122 }
1123 op->state = st_done;
1124 set_fencing_completed(op);
1125 op->delegate = pcmk__str_copy("a human");
1126
1127
1128 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1129 finalize_op(op, msg, false);
1130
1131
1132
1133
1134 return EINPROGRESS;
1135 }
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148 void *
1149 create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
1150 {
1151 remote_fencing_op_t *op = NULL;
1152 xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, request,
1153 LOG_NEVER);
1154 int call_options = 0;
1155 const char *operation = NULL;
1156
1157 init_stonith_remote_op_hash_table(&stonith_remote_op_list);
1158
1159
1160
1161 if (peer && dev) {
1162 const char *op_id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
1163
1164 CRM_CHECK(op_id != NULL, return NULL);
1165
1166 op = g_hash_table_lookup(stonith_remote_op_list, op_id);
1167 if (op) {
1168 crm_debug("Reusing existing remote fencing op %.8s for %s",
1169 op_id, ((client == NULL)? "unknown client" : client));
1170 return op;
1171 }
1172 }
1173
1174 op = pcmk__assert_alloc(1, sizeof(remote_fencing_op_t));
1175
1176 crm_element_value_int(request, PCMK__XA_ST_TIMEOUT, &(op->base_timeout));
1177
1178 crm_element_value_int(request, PCMK__XA_ST_DELAY, &(op->client_delay));
1179
1180 if (peer && dev) {
1181 op->id = crm_element_value_copy(dev, PCMK__XA_ST_REMOTE_OP);
1182 } else {
1183 op->id = crm_generate_uuid();
1184 }
1185
1186 g_hash_table_replace(stonith_remote_op_list, op->id, op);
1187
1188 op->state = st_query;
1189 op->replies_expected = fencing_active_peers();
1190 op->action = crm_element_value_copy(dev, PCMK__XA_ST_DEVICE_ACTION);
1191
1192
1193
1194
1195
1196
1197
1198 op->originator = crm_element_value_copy(dev, PCMK__XA_ST_ORIGIN);
1199 if (op->originator == NULL) {
1200
1201 op->originator = pcmk__str_copy(stonith_our_uname);
1202 }
1203
1204
1205 op->delegate = crm_element_value_copy(dev, PCMK__XA_ST_DELEGATE);
1206 op->created = time(NULL);
1207
1208 CRM_LOG_ASSERT(client != NULL);
1209 op->client_id = pcmk__str_copy(client);
1210
1211
1212 operation = crm_element_value(request, PCMK__XA_ST_OP);
1213
1214 if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1215 op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
1216 (unsigned long) getpid());
1217 } else {
1218 op->client_name = crm_element_value_copy(request,
1219 PCMK__XA_ST_CLIENTNAME);
1220 }
1221
1222 op->target = crm_element_value_copy(dev, PCMK__XA_ST_TARGET);
1223
1224
1225 op->request = pcmk__xml_copy(NULL, request);
1226 crm_element_value_int(request, PCMK__XA_ST_CALLOPT, &call_options);
1227 op->call_options = call_options;
1228
1229 crm_element_value_int(request, PCMK__XA_ST_CALLID, &(op->client_callid));
1230
1231 crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
1232 "base timeout %ds, %u %s expected)",
1233 (peer && dev)? "Recorded" : "Generated", op->id, op->action,
1234 op->target, op->client_name, op->base_timeout,
1235 op->replies_expected,
1236 pcmk__plural_alt(op->replies_expected, "reply", "replies"));
1237
1238 if (op->call_options & st_opt_cs_nodeid) {
1239 int nodeid;
1240 crm_node_t *node;
1241
1242 pcmk__scan_min_int(op->target, &nodeid, 0);
1243 node = pcmk__search_node_caches(nodeid, NULL,
1244 pcmk__node_search_any
1245 |pcmk__node_search_cluster_cib);
1246
1247
1248 stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);
1249
1250 if (node && node->uname) {
1251 pcmk__str_update(&(op->target), node->uname);
1252
1253 } else {
1254 crm_warn("Could not expand nodeid '%s' into a host name", op->target);
1255 }
1256 }
1257
1258
1259 merge_duplicates(op);
1260
1261 if (op->state != st_duplicate) {
1262
1263 fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL);
1264 }
1265
1266
1267 stonith_fence_history_trim();
1268
1269 return op;
1270 }
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282 remote_fencing_op_t *
1283 initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request,
1284 gboolean manual_ack)
1285 {
1286 int query_timeout = 0;
1287 xmlNode *query = NULL;
1288 const char *client_id = NULL;
1289 remote_fencing_op_t *op = NULL;
1290 const char *relay_op_id = NULL;
1291 const char *operation = NULL;
1292
1293 if (client) {
1294 client_id = client->id;
1295 } else {
1296 client_id = crm_element_value(request, PCMK__XA_ST_CLIENTID);
1297 }
1298
1299 CRM_LOG_ASSERT(client_id != NULL);
1300 op = create_remote_stonith_op(client_id, request, FALSE);
1301 op->owner = TRUE;
1302 if (manual_ack) {
1303 return op;
1304 }
1305
1306 CRM_CHECK(op->action, return NULL);
1307
1308 if (advance_topology_level(op, true) != pcmk_rc_ok) {
1309 op->state = st_failed;
1310 }
1311
1312 switch (op->state) {
1313 case st_failed:
1314
1315 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR,
1316 "All topology levels failed");
1317 crm_warn("Could not request peer fencing (%s) targeting %s "
1318 CRM_XS " id=%.8s", op->action, op->target, op->id);
1319 finalize_op(op, NULL, false);
1320 return op;
1321
1322 case st_duplicate:
1323 crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
1324 CRM_XS " id=%.8s", op->action, op->target, op->id);
1325 return op;
1326
1327 default:
1328 crm_notice("Requesting peer fencing (%s) targeting %s "
1329 CRM_XS " id=%.8s state=%s base_timeout=%ds",
1330 op->action, op->target, op->id,
1331 stonith_op_state_str(op->state), op->base_timeout);
1332 }
1333
1334 query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
1335 NULL, op->call_options);
1336
1337 crm_xml_add(query, PCMK__XA_ST_REMOTE_OP, op->id);
1338 crm_xml_add(query, PCMK__XA_ST_TARGET, op->target);
1339 crm_xml_add(query, PCMK__XA_ST_DEVICE_ACTION, op_requested_action(op));
1340 crm_xml_add(query, PCMK__XA_ST_ORIGIN, op->originator);
1341 crm_xml_add(query, PCMK__XA_ST_CLIENTID, op->client_id);
1342 crm_xml_add(query, PCMK__XA_ST_CLIENTNAME, op->client_name);
1343 crm_xml_add_int(query, PCMK__XA_ST_TIMEOUT, op->base_timeout);
1344
1345
1346 operation = crm_element_value(request, PCMK__XA_ST_OP);
1347 if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1348 relay_op_id = crm_element_value(request, PCMK__XA_ST_REMOTE_OP);
1349 if (relay_op_id) {
1350 crm_xml_add(query, PCMK__XA_ST_REMOTE_OP_RELAY, relay_op_id);
1351 }
1352 }
1353
1354 pcmk__cluster_send_message(NULL, crm_msg_stonith_ng, query);
1355 free_xml(query);
1356
1357 query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
1358 op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);
1359
1360 return op;
1361 }
1362
/*!
 * \internal
 * \brief Flags controlling which peers find_best_peer() will consider
 */
enum find_best_peer_options {
    //! Skip any peer whose host name matches the fencing target
    FIND_PEER_SKIP_TARGET   = (1 << 0),

    //! Consider only the peer whose host name matches the fencing target
    FIND_PEER_TARGET_ONLY   = (1 << 1),

    //! Request that device lookups consider only verified devices
    FIND_PEER_VERIFIED_ONLY = (1 << 2),
};
1371
1372 static bool
1373 is_watchdog_fencing(const remote_fencing_op_t *op, const char *device)
1374 {
1375 return (stonith_watchdog_timeout_ms > 0
1376
1377 && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_null_matches)
1378 && pcmk__is_fencing_action(op->action)
1379 && node_does_watchdog_fencing(op->target));
1380 }
1381
1382 static peer_device_info_t *
1383 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
1384 {
1385 GList *iter = NULL;
1386 gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1387
1388 if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
1389 return NULL;
1390 }
1391
1392 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1393 peer_device_info_t *peer = iter->data;
1394
1395 crm_trace("Testing result from %s targeting %s with %d device%s: %d %x",
1396 peer->host, op->target, peer->ndevices,
1397 pcmk__plural_s(peer->ndevices), peer->tried, options);
1398 if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1399 continue;
1400 }
1401 if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1402 continue;
1403 }
1404
1405 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1406
1407 if (grab_peer_device(op, peer, device, verified_devices_only)) {
1408 return peer;
1409 }
1410
1411 } else if (!peer->tried
1412 && count_peer_devices(op, peer, verified_devices_only,
1413 fenced_support_flag(op->action))) {
1414
1415 crm_trace("Simple fencing");
1416 return peer;
1417 }
1418 }
1419
1420 return NULL;
1421 }
1422
1423 static peer_device_info_t *
1424 stonith_choose_peer(remote_fencing_op_t * op)
1425 {
1426 const char *device = NULL;
1427 peer_device_info_t *peer = NULL;
1428 uint32_t active = fencing_active_peers();
1429
1430 do {
1431 if (op->devices) {
1432 device = op->devices->data;
1433 crm_trace("Checking for someone to fence (%s) %s using %s",
1434 op->action, op->target, device);
1435 } else {
1436 crm_trace("Checking for someone to fence (%s) %s",
1437 op->action, op->target);
1438 }
1439
1440
1441 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
1442 if (peer) {
1443 crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
1444 return peer;
1445 }
1446
1447 if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
1448 crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
1449 return NULL;
1450 }
1451
1452
1453 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
1454 if (peer) {
1455 crm_trace("Found best unverified peer %s", peer->host);
1456 return peer;
1457 }
1458
1459
1460
1461
1462 if (op->phase != st_phase_on) {
1463 peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
1464 if (peer) {
1465 crm_trace("%s will fence itself", peer->host);
1466 return peer;
1467 }
1468 }
1469
1470
1471
1472
1473 } while ((op->phase != st_phase_on)
1474 && pcmk_is_set(op->call_options, st_opt_topology)
1475 && (advance_topology_level(op, false) == pcmk_rc_ok));
1476
1477
1478
1479
1480 if (is_watchdog_fencing(op, device)) {
1481 crm_info("Couldn't contact watchdog-fencing target-node (%s)",
1482 op->target);
1483
1484 } else {
1485 crm_notice("Couldn't find anyone to fence (%s) %s using %s",
1486 op->action, op->target, (device? device : "any device"));
1487 }
1488 return NULL;
1489 }
1490
1491 static int
1492 valid_fencing_timeout(int specified_timeout, bool action_specific,
1493 const remote_fencing_op_t *op, const char *device)
1494 {
1495 int timeout = specified_timeout;
1496
1497 if (!is_watchdog_fencing(op, device)) {
1498 return timeout;
1499 }
1500
1501 timeout = (int) QB_MIN(QB_MAX(specified_timeout,
1502 stonith_watchdog_timeout_ms / 1000), INT_MAX);
1503
1504 if (timeout > specified_timeout) {
1505 if (action_specific) {
1506 crm_warn("pcmk_%s_timeout %ds for %s is too short (must be >= "
1507 PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds "
1508 "instead",
1509 op->action, specified_timeout, device? device : "watchdog",
1510 timeout, timeout);
1511
1512 } else {
1513 crm_warn("Fencing timeout %ds is too short (must be >= "
1514 PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds "
1515 "instead",
1516 specified_timeout, timeout, timeout);
1517 }
1518 }
1519
1520 return timeout;
1521 }
1522
1523 static int
1524 get_device_timeout(const remote_fencing_op_t *op,
1525 const peer_device_info_t *peer, const char *device,
1526 bool with_delay)
1527 {
1528 int timeout = op->base_timeout;
1529 device_properties_t *props;
1530
1531 timeout = valid_fencing_timeout(op->base_timeout, false, op, device);
1532
1533 if (!peer || !device) {
1534 return timeout;
1535 }
1536
1537 props = g_hash_table_lookup(peer->devices, device);
1538 if (!props) {
1539 return timeout;
1540 }
1541
1542 if (props->custom_action_timeout[op->phase]) {
1543 timeout = valid_fencing_timeout(props->custom_action_timeout[op->phase],
1544 true, op, device);
1545 }
1546
1547
1548 if (with_delay && (op->client_delay >= 0)) {
1549
1550 timeout += (props->delay_max[op->phase] > 0 ?
1551 props->delay_max[op->phase] : props->delay_base[op->phase]);
1552 }
1553
1554 return timeout;
1555 }
1556
1557 struct timeout_data {
1558 const remote_fencing_op_t *op;
1559 const peer_device_info_t *peer;
1560 int total_timeout;
1561 };
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571 static void
1572 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
1573 {
1574 const char *device_id = key;
1575 device_properties_t *props = value;
1576 struct timeout_data *timeout = user_data;
1577
1578 if (!props->executed[timeout->op->phase]
1579 && !props->disallowed[timeout->op->phase]) {
1580 timeout->total_timeout += get_device_timeout(timeout->op, timeout->peer,
1581 device_id, true);
1582 }
1583 }
1584
1585 static int
1586 get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer)
1587 {
1588 struct timeout_data timeout;
1589
1590 timeout.op = op;
1591 timeout.peer = peer;
1592 timeout.total_timeout = 0;
1593
1594 g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1595
1596 return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1597 }
1598
1599 static int
1600 get_op_total_timeout(const remote_fencing_op_t *op,
1601 const peer_device_info_t *chosen_peer)
1602 {
1603 long long total_timeout = 0;
1604 stonith_topology_t *tp = find_topology_for_host(op->target);
1605
1606 if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
1607 int i;
1608 GList *device_list = NULL;
1609 GList *iter = NULL;
1610 GList *auto_list = NULL;
1611
1612 if (pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none)
1613 && (op->automatic_list != NULL)) {
1614 auto_list = g_list_copy(op->automatic_list);
1615 }
1616
1617
1618
1619
1620
1621
1622
1623
1624 for (i = 0; i < ST__LEVEL_COUNT; i++) {
1625 if (!tp->levels[i]) {
1626 continue;
1627 }
1628 for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1629 bool found = false;
1630
1631 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1632 const peer_device_info_t *peer = iter->data;
1633
1634 if (auto_list) {
1635 GList *match = g_list_find_custom(auto_list, device_list->data,
1636 sort_strings);
1637 if (match) {
1638 auto_list = g_list_remove(auto_list, match->data);
1639 }
1640 }
1641
1642 if (find_peer_device(op, peer, device_list->data,
1643 fenced_support_flag(op->action))) {
1644 total_timeout += get_device_timeout(op, peer,
1645 device_list->data,
1646 true);
1647 found = true;
1648 break;
1649 }
1650 }
1651
1652
1653
1654
1655 if (!found && is_watchdog_fencing(op, device_list->data)) {
1656 total_timeout += stonith_watchdog_timeout_ms / 1000;
1657 }
1658 }
1659 }
1660
1661
1662 if (auto_list) {
1663 for (iter = auto_list; iter != NULL; iter = iter->next) {
1664 GList *iter2 = NULL;
1665
1666 for (iter2 = op->query_results; iter2 != NULL; iter = iter2->next) {
1667 peer_device_info_t *peer = iter2->data;
1668 if (find_peer_device(op, peer, iter->data, st_device_supports_on)) {
1669 total_timeout += get_device_timeout(op, peer,
1670 iter->data, true);
1671 break;
1672 }
1673 }
1674 }
1675 }
1676
1677 g_list_free(auto_list);
1678
1679 } else if (chosen_peer) {
1680 total_timeout = get_peer_timeout(op, chosen_peer);
1681
1682 } else {
1683 total_timeout = valid_fencing_timeout(op->base_timeout, false, op,
1684 NULL);
1685 }
1686
1687 if (total_timeout <= 0) {
1688 total_timeout = op->base_timeout;
1689 }
1690
1691
1692
1693
1694 if (op->client_delay > 0) {
1695 total_timeout += op->client_delay;
1696 }
1697 return (int) QB_MIN(total_timeout, INT_MAX);
1698 }
1699
1700 static void
1701 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
1702 {
1703 GList *iter = NULL;
1704 xmlNode *update = NULL;
1705 const char *client_node = NULL;
1706 const char *client_id = NULL;
1707 const char *call_id = NULL;
1708
1709 if (op->call_options & st_opt_sync_call) {
1710
1711
1712
1713
1714 return;
1715 } else if (!op->request) {
1716 return;
1717 }
1718
1719 crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
1720 client_node = crm_element_value(op->request, PCMK__XA_ST_CLIENTNODE);
1721 call_id = crm_element_value(op->request, PCMK__XA_ST_CALLID);
1722 client_id = crm_element_value(op->request, PCMK__XA_ST_CLIENTID);
1723 if (!client_node || !call_id || !client_id) {
1724 return;
1725 }
1726
1727 if (pcmk__str_eq(client_node, stonith_our_uname, pcmk__str_casei)) {
1728
1729 do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1730 return;
1731 }
1732
1733
1734 update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1735 crm_xml_add(update, PCMK__XA_ST_REMOTE_OP, op->id);
1736 crm_xml_add(update, PCMK__XA_ST_CLIENTID, client_id);
1737 crm_xml_add(update, PCMK__XA_ST_CALLID, call_id);
1738 crm_xml_add_int(update, PCMK__XA_ST_TIMEOUT, op_timeout);
1739
1740 pcmk__cluster_send_message(pcmk__get_node(0, client_node, NULL,
1741 pcmk__node_search_cluster_member),
1742 crm_msg_stonith_ng, update);
1743
1744 free_xml(update);
1745
1746 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1747 remote_fencing_op_t *dup = iter->data;
1748
1749 crm_trace("Reporting timeout for duplicate %.8s to client %s",
1750 dup->id, dup->client_name);
1751 report_timeout_period(iter->data, op_timeout);
1752 }
1753 }
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763 static void
1764 advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
1765 xmlNode *msg)
1766 {
1767
1768 if (op->devices) {
1769 op->devices = op->devices->next;
1770 }
1771
1772
1773 if ((op->phase == st_phase_requested)
1774 && pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none)) {
1775
1776 remove_required_device(op, device);
1777
1778
1779
1780
1781 if (op->devices == NULL) {
1782 op->devices = op->automatic_list;
1783 }
1784 }
1785
1786 if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1787
1788
1789
1790
1791 op_phase_on(op);
1792 }
1793
1794
1795 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1796
1797 if (op->devices) {
1798
1799 crm_trace("Next targeting %s on behalf of %s@%s",
1800 op->target, op->client_name, op->originator);
1801
1802
1803 if (op->client_delay > 0) {
1804 op->client_delay = 0;
1805 }
1806
1807 request_peer_fencing(op, NULL);
1808 } else {
1809
1810 crm_trace("Marking complex fencing op targeting %s as complete",
1811 op->target);
1812 op->state = st_done;
1813 finalize_op(op, msg, false);
1814 }
1815 }
1816
1817 static gboolean
1818 check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
1819 {
1820 if (node_does_watchdog_fencing(op->target)) {
1821 guint timeout_ms = QB_MIN(stonith_watchdog_timeout_ms, UINT_MAX);
1822
1823 crm_notice("Waiting %s for %s to self-fence (%s) for "
1824 "client %s " CRM_XS " id=%.8s",
1825 pcmk__readable_interval(timeout_ms), op->target, op->action,
1826 op->client_name, op->id);
1827
1828 if (op->op_timer_one) {
1829 g_source_remove(op->op_timer_one);
1830 }
1831 op->op_timer_one = g_timeout_add(timeout_ms, remote_op_watchdog_done,
1832 op);
1833 return TRUE;
1834 } else {
1835 crm_debug("Skipping fallback to watchdog-fencing as %s is "
1836 "not in host-list", op->target);
1837 }
1838 return FALSE;
1839 }
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849 static void
1850 request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
1851 {
1852 const char *device = NULL;
1853 int timeout;
1854
1855 CRM_CHECK(op != NULL, return);
1856
1857 crm_trace("Action %.8s targeting %s for %s is %s",
1858 op->id, op->target, op->client_name,
1859 stonith_op_state_str(op->state));
1860
1861 if ((op->phase == st_phase_on) && (op->devices != NULL)) {
1862
1863
1864
1865
1866
1867
1868
1869
1870
1871
1872
1873 device = op->devices->data;
1874 if (pcmk__str_eq(fenced_device_reboot_action(device), PCMK_ACTION_OFF,
1875 pcmk__str_none)) {
1876 crm_info("Not turning %s back on using %s because the device is "
1877 "configured to stay off (pcmk_reboot_action='off')",
1878 op->target, device);
1879 advance_topology_device_in_level(op, device, NULL);
1880 return;
1881 }
1882 if (!fenced_device_supports_on(device)) {
1883 crm_info("Not turning %s back on using %s because the agent "
1884 "doesn't support 'on'", op->target, device);
1885 advance_topology_device_in_level(op, device, NULL);
1886 return;
1887 }
1888 }
1889
1890 timeout = op->base_timeout;
1891 if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
1892 peer = stonith_choose_peer(op);
1893 }
1894
1895 if (!op->op_timer_total) {
1896 op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, peer);
1897 op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
1898 report_timeout_period(op, op->total_timeout);
1899 crm_info("Total timeout set to %ds for peer's fencing targeting %s for %s"
1900 CRM_XS "id=%.8s",
1901 op->total_timeout, op->target, op->client_name, op->id);
1902 }
1903
1904 if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
1905
1906
1907
1908
1909
1910
1911
1912
1913 peer = stonith_choose_peer(op);
1914
1915 device = op->devices->data;
1916
1917
1918
1919
1920 timeout = get_device_timeout(op, peer, device, false);
1921 }
1922
1923 if (peer) {
1924 int timeout_one = 0;
1925 xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
1926 const crm_node_t *peer_node =
1927 pcmk__get_node(0, peer->host, NULL,
1928 pcmk__node_search_cluster_member);
1929
1930 if (op->client_delay > 0) {
1931
1932
1933
1934 timeout_one = TIMEOUT_MULTIPLY_FACTOR * op->client_delay;
1935 }
1936
1937 crm_xml_add(remote_op, PCMK__XA_ST_REMOTE_OP, op->id);
1938 crm_xml_add(remote_op, PCMK__XA_ST_TARGET, op->target);
1939 crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ACTION, op->action);
1940 crm_xml_add(remote_op, PCMK__XA_ST_ORIGIN, op->originator);
1941 crm_xml_add(remote_op, PCMK__XA_ST_CLIENTID, op->client_id);
1942 crm_xml_add(remote_op, PCMK__XA_ST_CLIENTNAME, op->client_name);
1943 crm_xml_add_int(remote_op, PCMK__XA_ST_TIMEOUT, timeout);
1944 crm_xml_add_int(remote_op, PCMK__XA_ST_CALLOPT, op->call_options);
1945 crm_xml_add_int(remote_op, PCMK__XA_ST_DELAY, op->client_delay);
1946
1947 if (device) {
1948 timeout_one += TIMEOUT_MULTIPLY_FACTOR *
1949 get_device_timeout(op, peer, device, true);
1950 crm_notice("Requesting that %s perform '%s' action targeting %s "
1951 "using %s " CRM_XS " for client %s (%ds)",
1952 peer->host, op->action, op->target, device,
1953 op->client_name, timeout_one);
1954 crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ID, device);
1955
1956 } else {
1957 timeout_one += TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
1958 crm_notice("Requesting that %s perform '%s' action targeting %s "
1959 CRM_XS " for client %s (%ds, %s)",
1960 peer->host, op->action, op->target, op->client_name,
1961 timeout_one,
1962 pcmk__readable_interval(stonith_watchdog_timeout_ms));
1963 }
1964
1965 op->state = st_exec;
1966 if (op->op_timer_one) {
1967 g_source_remove(op->op_timer_one);
1968 op->op_timer_one = 0;
1969 }
1970
1971 if (!is_watchdog_fencing(op, device)
1972 || !check_watchdog_fencing_and_wait(op)) {
1973
1974
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987
1988
1989
1990
1991
1992
1993
1994 op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
1995 }
1996
1997 pcmk__cluster_send_message(peer_node, crm_msg_stonith_ng, remote_op);
1998 peer->tried = TRUE;
1999 free_xml(remote_op);
2000 return;
2001
2002 } else if (op->phase == st_phase_on) {
2003
2004
2005
2006 crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
2007 "after successful 'off'", device, op->target);
2008 advance_topology_device_in_level(op, device, NULL);
2009 return;
2010
2011 } else if (op->owner == FALSE) {
2012 crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
2013 op->action, op->target, op->client_name);
2014
2015 } else if (op->query_timer == 0) {
2016
2017 crm_info("No remaining peers capable of fencing (%s) %s for client %s "
2018 CRM_XS " state=%s", op->action, op->target, op->client_name,
2019 stonith_op_state_str(op->state));
2020 CRM_CHECK(op->state < st_done, return);
2021 finalize_timed_out_op(op, "All nodes failed, or are unable, to "
2022 "fence target");
2023
2024 } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
2025
2026
2027
2028
2029 if (is_watchdog_fencing(op, device)
2030 && check_watchdog_fencing_and_wait(op)) {
2031
2032
2033
2034
2035
2036 op->state = st_exec;
2037 return;
2038 }
2039
2040 if (op->state == st_query) {
2041 crm_info("No peers (out of %d) have devices capable of fencing "
2042 "(%s) %s for client %s " CRM_XS " state=%s",
2043 op->replies, op->action, op->target, op->client_name,
2044 stonith_op_state_str(op->state));
2045
2046 pcmk__reset_result(&op->result);
2047 pcmk__set_result(&op->result, CRM_EX_ERROR,
2048 PCMK_EXEC_NO_FENCE_DEVICE, NULL);
2049 } else {
2050 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2051 pcmk__reset_result(&op->result);
2052 pcmk__set_result(&op->result, CRM_EX_ERROR,
2053 PCMK_EXEC_NO_FENCE_DEVICE, NULL);
2054 }
2055
2056
2057
2058
2059
2060
2061
2062 crm_info("No peers (out of %d) are capable of fencing (%s) %s "
2063 "for client %s " CRM_XS " state=%s",
2064 op->replies, op->action, op->target, op->client_name,
2065 stonith_op_state_str(op->state));
2066 }
2067
2068 op->state = st_failed;
2069 finalize_op(op, NULL, false);
2070
2071 } else {
2072 crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
2073 "for client %s " CRM_XS " id=%.8s",
2074 op->action, op->target, (device? " using " : ""),
2075 (device? device : ""), op->client_name, op->id);
2076 }
2077 }
2078
2079
2080
2081
2082
2083
2084
2085
2086
2087
2088
2089
2090 static gint
2091 sort_peers(gconstpointer a, gconstpointer b)
2092 {
2093 const peer_device_info_t *peer_a = a;
2094 const peer_device_info_t *peer_b = b;
2095
2096 return (peer_b->ndevices - peer_a->ndevices);
2097 }
2098
2099
2100
2101
2102
2103
2104
2105 static gboolean
2106 all_topology_devices_found(const remote_fencing_op_t *op)
2107 {
2108 GList *device = NULL;
2109 GList *iter = NULL;
2110 device_properties_t *match = NULL;
2111 stonith_topology_t *tp = NULL;
2112 gboolean skip_target = FALSE;
2113 int i;
2114
2115 tp = find_topology_for_host(op->target);
2116 if (!tp) {
2117 return FALSE;
2118 }
2119 if (pcmk__is_fencing_action(op->action)) {
2120
2121
2122 skip_target = TRUE;
2123 }
2124
2125 for (i = 0; i < ST__LEVEL_COUNT; i++) {
2126 for (device = tp->levels[i]; device; device = device->next) {
2127 match = NULL;
2128 for (iter = op->query_results; iter && !match; iter = iter->next) {
2129 peer_device_info_t *peer = iter->data;
2130
2131 if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
2132 continue;
2133 }
2134 match = find_peer_device(op, peer, device->data, st_device_supports_none);
2135 }
2136 if (!match) {
2137 return FALSE;
2138 }
2139 }
2140 }
2141
2142 return TRUE;
2143 }
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
2154
2155
2156
2157 static void
2158 parse_action_specific(const xmlNode *xml, const char *peer, const char *device,
2159 const char *action, remote_fencing_op_t *op,
2160 enum st_remap_phase phase, device_properties_t *props)
2161 {
2162 props->custom_action_timeout[phase] = 0;
2163 crm_element_value_int(xml, PCMK__XA_ST_ACTION_TIMEOUT,
2164 &props->custom_action_timeout[phase]);
2165 if (props->custom_action_timeout[phase]) {
2166 crm_trace("Peer %s with device %s returned %s action timeout %ds",
2167 peer, device, action, props->custom_action_timeout[phase]);
2168 }
2169
2170 props->delay_max[phase] = 0;
2171 crm_element_value_int(xml, PCMK__XA_ST_DELAY_MAX, &props->delay_max[phase]);
2172 if (props->delay_max[phase]) {
2173 crm_trace("Peer %s with device %s returned maximum of random delay %ds for %s",
2174 peer, device, props->delay_max[phase], action);
2175 }
2176
2177 props->delay_base[phase] = 0;
2178 crm_element_value_int(xml, PCMK__XA_ST_DELAY_BASE,
2179 &props->delay_base[phase]);
2180 if (props->delay_base[phase]) {
2181 crm_trace("Peer %s with device %s returned base delay %ds for %s",
2182 peer, device, props->delay_base[phase], action);
2183 }
2184
2185
2186 if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none)) {
2187 int required = 0;
2188
2189 crm_element_value_int(xml, PCMK__XA_ST_REQUIRED, &required);
2190 if (required) {
2191 crm_trace("Peer %s requires device %s to execute for action %s",
2192 peer, device, action);
2193 add_required_device(op, device);
2194 }
2195 }
2196
2197
2198
2199
2200 if (pcmk__xe_attr_is_true(xml, PCMK__XA_ST_ACTION_DISALLOWED)) {
2201 props->disallowed[phase] = TRUE;
2202 crm_trace("Peer %s is disallowed from executing %s for device %s",
2203 peer, action, device);
2204 }
2205 }
2206
2207
2208
2209
2210
2211
2212
2213
2214
2215
2216 static void
2217 add_device_properties(const xmlNode *xml, remote_fencing_op_t *op,
2218 peer_device_info_t *peer, const char *device)
2219 {
2220 xmlNode *child;
2221 int verified = 0;
2222 device_properties_t *props =
2223 pcmk__assert_alloc(1, sizeof(device_properties_t));
2224 int flags = st_device_supports_on;
2225
2226
2227 g_hash_table_insert(peer->devices, pcmk__str_copy(device), props);
2228
2229
2230 crm_element_value_int(xml, PCMK__XA_ST_MONITOR_VERIFIED, &verified);
2231 if (verified) {
2232 crm_trace("Peer %s has confirmed a verified device %s",
2233 peer->host, device);
2234 props->verified = TRUE;
2235 }
2236
2237 crm_element_value_int(xml, PCMK__XA_ST_DEVICE_SUPPORT_FLAGS, &flags);
2238 props->device_support_flags = flags;
2239
2240
2241 parse_action_specific(xml, peer->host, device, op_requested_action(op),
2242 op, st_phase_requested, props);
2243 for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL;
2244 child = pcmk__xe_next(child)) {
2245
2246
2247
2248
2249 if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_OFF, pcmk__str_none)) {
2250 parse_action_specific(child, peer->host, device, PCMK_ACTION_OFF,
2251 op, st_phase_off, props);
2252
2253 } else if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_ON,
2254 pcmk__str_none)) {
2255 parse_action_specific(child, peer->host, device, PCMK_ACTION_ON,
2256 op, st_phase_on, props);
2257 }
2258 }
2259 }
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270
2271
2272 static peer_device_info_t *
2273 add_result(remote_fencing_op_t *op, const char *host, int ndevices,
2274 const xmlNode *xml)
2275 {
2276 peer_device_info_t *peer = pcmk__assert_alloc(1,
2277 sizeof(peer_device_info_t));
2278 xmlNode *child;
2279
2280 peer->host = pcmk__str_copy(host);
2281 peer->devices = pcmk__strkey_table(free, free);
2282
2283
2284 for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL;
2285 child = pcmk__xe_next(child)) {
2286 const char *device = pcmk__xe_id(child);
2287
2288 if (device) {
2289 add_device_properties(child, op, peer, device);
2290 }
2291 }
2292
2293 peer->ndevices = g_hash_table_size(peer->devices);
2294 CRM_CHECK(ndevices == peer->ndevices,
2295 crm_err("Query claimed to have %d device%s but %d found",
2296 ndevices, pcmk__plural_s(ndevices), peer->ndevices));
2297
2298 op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers);
2299 return peer;
2300 }
2301
2302
2303
2304
2305
2306
2307
2308
2309
2310
2311
2312
2313
2314
2315
2316 int
2317 process_remote_stonith_query(xmlNode *msg)
2318 {
2319 int ndevices = 0;
2320 gboolean host_is_target = FALSE;
2321 gboolean have_all_replies = FALSE;
2322 const char *id = NULL;
2323 const char *host = NULL;
2324 remote_fencing_op_t *op = NULL;
2325 peer_device_info_t *peer = NULL;
2326 uint32_t replies_expected;
2327 xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_REMOTE_OP, msg, LOG_ERR);
2328
2329 CRM_CHECK(dev != NULL, return -EPROTO);
2330
2331 id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
2332 CRM_CHECK(id != NULL, return -EPROTO);
2333
2334 dev = get_xpath_object("//@" PCMK__XA_ST_AVAILABLE_DEVICES, msg, LOG_ERR);
2335 CRM_CHECK(dev != NULL, return -EPROTO);
2336 crm_element_value_int(dev, PCMK__XA_ST_AVAILABLE_DEVICES, &ndevices);
2337
2338 op = g_hash_table_lookup(stonith_remote_op_list, id);
2339 if (op == NULL) {
2340 crm_debug("Received query reply for unknown or expired operation %s",
2341 id);
2342 return -EOPNOTSUPP;
2343 }
2344
2345 replies_expected = fencing_active_peers();
2346 if (op->replies_expected < replies_expected) {
2347 replies_expected = op->replies_expected;
2348 }
2349 if ((++op->replies >= replies_expected) && (op->state == st_query)) {
2350 have_all_replies = TRUE;
2351 }
2352 host = crm_element_value(msg, PCMK__XA_SRC);
2353 host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);
2354
2355 crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
2356 op->replies, replies_expected, host,
2357 op->target, op->action, ndevices, pcmk__plural_s(ndevices), id);
2358 if (ndevices > 0) {
2359 peer = add_result(op, host, ndevices, dev);
2360 }
2361
2362 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2363
2364 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2365
2366
2367
2368 if (op->state == st_query && all_topology_devices_found(op)) {
2369
2370 crm_trace("All topology devices found");
2371 request_peer_fencing(op, peer);
2372
2373 } else if (have_all_replies) {
2374 crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
2375 replies_expected, op->replies);
2376 request_peer_fencing(op, NULL);
2377 }
2378
2379 } else if (op->state == st_query) {
2380 int nverified = count_peer_devices(op, peer, TRUE,
2381 fenced_support_flag(op->action));
2382
2383
2384
2385 if ((peer != NULL) && !host_is_target && nverified) {
2386
2387 crm_trace("Found %d verified device%s",
2388 nverified, pcmk__plural_s(nverified));
2389 request_peer_fencing(op, peer);
2390
2391 } else if (have_all_replies) {
2392 crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
2393 replies_expected, op->replies);
2394 request_peer_fencing(op, NULL);
2395
2396 } else {
2397 crm_trace("Waiting for more peer results before launching fencing operation");
2398 }
2399
2400 } else if ((peer != NULL) && (op->state == st_done)) {
2401 crm_info("Discarding query result from %s (%d device%s): "
2402 "Operation is %s", peer->host,
2403 peer->ndevices, pcmk__plural_s(peer->ndevices),
2404 stonith_op_state_str(op->state));
2405 }
2406
2407 return pcmk_ok;
2408 }
2409
2410
2411
2412
2413
2414
2415
2416
2417
2418
2419 void
2420 fenced_process_fencing_reply(xmlNode *msg)
2421 {
2422 const char *id = NULL;
2423 const char *device = NULL;
2424 remote_fencing_op_t *op = NULL;
2425 xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_REMOTE_OP, msg, LOG_ERR);
2426 pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
2427
2428 CRM_CHECK(dev != NULL, return);
2429
2430 id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
2431 CRM_CHECK(id != NULL, return);
2432
2433 dev = stonith__find_xe_with_result(msg);
2434 CRM_CHECK(dev != NULL, return);
2435
2436 stonith__xe_get_result(dev, &result);
2437
2438 device = crm_element_value(dev, PCMK__XA_ST_DEVICE_ID);
2439
2440 if (stonith_remote_op_list) {
2441 op = g_hash_table_lookup(stonith_remote_op_list, id);
2442 }
2443
2444 if ((op == NULL) && pcmk__result_ok(&result)) {
2445
2446 const char *client_id = crm_element_value(dev, PCMK__XA_ST_CLIENTID);
2447
2448 op = create_remote_stonith_op(client_id, dev, TRUE);
2449 }
2450
2451 if (op == NULL) {
2452
2453
2454 crm_info("Received peer result of unknown or expired operation %s", id);
2455 pcmk__reset_result(&result);
2456 return;
2457 }
2458
2459 pcmk__reset_result(&op->result);
2460 op->result = result;
2461
2462 if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
2463 crm_err("Received outdated reply for device %s (instead of %s) to "
2464 "fence (%s) %s. Operation already timed out at peer level.",
2465 device, (const char *) op->devices->data, op->action, op->target);
2466 return;
2467 }
2468
2469 if (pcmk__str_eq(crm_element_value(msg, PCMK__XA_SUBT),
2470 PCMK__VALUE_BROADCAST, pcmk__str_none)) {
2471
2472 if (pcmk__result_ok(&op->result)) {
2473 op->state = st_done;
2474 } else {
2475 op->state = st_failed;
2476 }
2477 finalize_op(op, msg, false);
2478 return;
2479
2480 } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
2481
2482
2483 crm_err("Received non-broadcast fencing result for operation %.8s "
2484 "we do not own (device %s targeting %s)",
2485 op->id, device, op->target);
2486 return;
2487 }
2488
2489 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2490 const char *device = NULL;
2491 const char *reason = op->result.exit_reason;
2492
2493
2494
2495 if (op->state == st_done) {
2496 finalize_op(op, msg, false);
2497 return;
2498 }
2499
2500 device = crm_element_value(msg, PCMK__XA_ST_DEVICE_ID);
2501
2502 if ((op->phase == 2) && !pcmk__result_ok(&op->result)) {
2503
2504
2505
2506 crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s "
2507 "after successful 'off'",
2508 device, pcmk_exec_status_str(op->result.execution_status),
2509 (reason == NULL)? "" : ": ",
2510 (reason == NULL)? "" : reason,
2511 op->target);
2512 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2513 } else {
2514 crm_notice("Action '%s' targeting %s%s%s on behalf of %s@%s: "
2515 "%s%s%s%s",
2516 op->action, op->target,
2517 ((device == NULL)? "" : " using "),
2518 ((device == NULL)? "" : device),
2519 op->client_name,
2520 op->originator,
2521 pcmk_exec_status_str(op->result.execution_status),
2522 (reason == NULL)? "" : " (",
2523 (reason == NULL)? "" : reason,
2524 (reason == NULL)? "" : ")");
2525 }
2526
2527 if (pcmk__result_ok(&op->result)) {
2528
2529
2530 advance_topology_device_in_level(op, device, msg);
2531 return;
2532 } else {
2533
2534
2535 if (advance_topology_level(op, false) != pcmk_rc_ok) {
2536 op->state = st_failed;
2537 finalize_op(op, msg, false);
2538 return;
2539 }
2540 }
2541
2542 } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) {
2543 op->state = st_done;
2544 finalize_op(op, msg, false);
2545 return;
2546
2547 } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT)
2548 && (op->devices == NULL)) {
2549
2550 op->state = st_failed;
2551 finalize_op(op, msg, false);
2552 return;
2553
2554 } else {
2555
2556 }
2557
2558
2559 crm_trace("Next for %s on behalf of %s@%s (result was: %s)",
2560 op->target, op->originator, op->client_name,
2561 pcmk_exec_status_str(op->result.execution_status));
2562 request_peer_fencing(op, NULL);
2563 }
2564
2565 gboolean
2566 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
2567 {
2568 GHashTableIter iter;
2569 time_t now = time(NULL);
2570 remote_fencing_op_t *rop = NULL;
2571
2572 if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
2573 action == NULL) {
2574 return FALSE;
2575 }
2576
2577 g_hash_table_iter_init(&iter, stonith_remote_op_list);
2578 while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2579 if (strcmp(rop->target, target) != 0) {
2580 continue;
2581 } else if (rop->state != st_done) {
2582 continue;
2583
2584
2585
2586 } else if (strcmp(rop->action, action) != 0) {
2587 continue;
2588 } else if ((rop->completed + tolerance) < now) {
2589 continue;
2590 }
2591
2592 crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2593 target, action, tolerance, rop->delegate, rop->originator);
2594 return TRUE;
2595 }
2596 return FALSE;
2597 }