This source file includes the following definitions.
- sort_strings
- free_remote_query
- free_stonith_remote_op_list
- count_peer_device
- count_peer_devices
- find_peer_device
- grab_peer_device
- clear_remote_op_timers
- free_remote_op
- init_stonith_remote_op_hash_table
- op_requested_action
- op_phase_off
- op_phase_on
- undo_op_remap
- fencing_result2xml
- fenced_broadcast_op_result
- handle_local_reply_and_notify
- finalize_op_duplicates
- delegate_from_xml
- finalize_op
- remote_op_watchdog_done
- remote_op_timeout_one
- finalize_timed_out_op
- remote_op_timeout
- remote_op_query_timeout
- topology_is_empty
- add_required_device
- remove_required_device
- set_op_device_list
- topology_matches
- find_topology_for_host
- advance_topology_level
- merge_duplicates
- fencing_active_peers
- fenced_handle_manual_confirmation
- create_remote_stonith_op
- initiate_remote_stonith_op
- is_watchdog_fencing
- find_best_peer
- stonith_choose_peer
- valid_fencing_timeout
- get_device_timeout
- add_device_timeout
- get_peer_timeout
- get_op_total_timeout
- report_timeout_period
- advance_topology_device_in_level
- check_watchdog_fencing_and_wait
- request_peer_fencing
- sort_peers
- all_topology_devices_found
- parse_action_specific
- add_device_properties
- add_result
- process_remote_stonith_query
- fenced_process_fencing_reply
- stonith_check_fence_tolerance
1
2
3
4
5
6
7
8
9
10 #include <crm_internal.h>
11
12 #include <sys/param.h>
13 #include <stdio.h>
14 #include <sys/types.h>
15 #include <sys/wait.h>
16 #include <sys/stat.h>
17 #include <unistd.h>
18 #include <sys/utsname.h>
19
20 #include <stdlib.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <ctype.h>
24 #include <regex.h>
25
26 #include <crm/crm.h>
27 #include <crm/common/ipc.h>
28 #include <crm/common/ipc_internal.h>
29 #include <crm/cluster/internal.h>
30
31 #include <crm/stonith-ng.h>
32 #include <crm/fencing/internal.h>
33 #include <crm/common/xml.h>
34 #include <crm/common/xml_internal.h>
35
36 #include <crm/common/util.h>
37 #include <pacemaker-fenced.h>
38
/* Safety margin multiplied into an operation's computed total timeout (used,
 * for example, by merge_duplicates() when guessing the timeout of an
 * operation that a duplicate is merged into)
 */
#define TIMEOUT_MULTIPLY_FACTOR 1.2
40
41
42
43
44
45
46
/*!
 * \brief Per-device state tracked for one peer during a remote fencing
 *        operation
 *
 * Objects of this type are the values of peer_device_info_t:devices, looked
 * up by device ID (see find_peer_device()). The per-phase arrays are indexed
 * by the operation's remap phase (op->phase).
 */
typedef struct device_properties_s {

    /* Whether the device is verified; consulted when a caller asks for
     * verified devices only (see count_peer_device(), grab_peer_device())
     */
    gboolean verified;



    /* Whether the device has already been claimed for each phase (set by
     * grab_peer_device())
     */
    gboolean executed[st_phase_max];

    /* Whether the device must not be used in each phase (checked by
     * find_peer_device())
     */
    gboolean disallowed[st_phase_max];

    /* NOTE(review): presumably an action-specific timeout for each phase --
     * not referenced in this part of the file; confirm against its users
     */
    int custom_action_timeout[st_phase_max];

    /* NOTE(review): presumably a maximum delay for each phase -- not
     * referenced in this part of the file; confirm against its users
     */
    int delay_max[st_phase_max];

    /* NOTE(review): presumably a base delay for each phase -- not referenced
     * in this part of the file; confirm against its users
     */
    int delay_base[st_phase_max];

    /* Bit flags tested with pcmk_is_set() against a required-support mask */
    uint32_t device_support_flags;
} device_properties_t;
66
/*!
 * \brief Devices reported by one peer for a remote fencing operation
 *
 * Objects of this type are the elements of an operation's query_results list
 * and are released with free_remote_query().
 */
typedef struct {

    /* Name of the peer (heap-allocated; freed together with the object) */
    char *host;

    /* NOTE(review): presumably whether this peer has already been tried for
     * the operation -- not referenced in this part of the file
     */
    gboolean tried;

    /* NOTE(review): presumably the number of entries in "devices" -- not
     * referenced in this part of the file
     */
    int ndevices;

    /* Device ID -> device_properties_t; destroyed together with the object */
    GHashTable *devices;
} peer_device_info_t;
77
/* Remote fencing operations keyed by operation ID (entries are added by
 * create_remote_stonith_op()); the table itself is created on demand by
 * init_stonith_remote_op_hash_table()
 */
GHashTable *stonith_remote_op_list = NULL;

/* Defined outside this file */
extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
                                  int call_options);

/* Forward declarations for functions that are used before they are defined */
static void request_peer_fencing(remote_fencing_op_t *op,
                                 peer_device_info_t *peer);
static void finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup);
static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
static int get_op_total_timeout(const remote_fencing_op_t *op,
                                const peer_device_info_t *chosen_peer);
89
90 static gint
91 sort_strings(gconstpointer a, gconstpointer b)
92 {
93 return strcmp(a, b);
94 }
95
96 static void
97 free_remote_query(gpointer data)
98 {
99 if (data != NULL) {
100 peer_device_info_t *peer = data;
101
102 g_hash_table_destroy(peer->devices);
103 free(peer->host);
104 free(peer);
105 }
106 }
107
108 void
109 free_stonith_remote_op_list(void)
110 {
111 if (stonith_remote_op_list != NULL) {
112 g_hash_table_destroy(stonith_remote_op_list);
113 stonith_remote_op_list = NULL;
114 }
115 }
116
/* Accumulator handed to count_peer_device() through g_hash_table_foreach() */
struct peer_count_data {
    const remote_fencing_op_t *op;  // Operation whose phase selects the "executed" slot
    gboolean verified_only;         // If set, count only verified devices
    uint32_t support_action_only;   // Required support flag(s), or st_device_supports_none for no filter
    int count;                      // Running total of matching devices
};
123
124
125
126
127
128
129
130
131
132 static void
133 count_peer_device(gpointer key, gpointer value, gpointer user_data)
134 {
135 device_properties_t *props = (device_properties_t*)value;
136 struct peer_count_data *data = user_data;
137
138 if (!props->executed[data->op->phase]
139 && (!data->verified_only || props->verified)
140 && ((data->support_action_only == st_device_supports_none) || pcmk_is_set(props->device_support_flags, data->support_action_only))) {
141 ++(data->count);
142 }
143 }
144
145
146
147
148
149
150
151
152
153
154
155
156 static int
157 count_peer_devices(const remote_fencing_op_t *op,
158 const peer_device_info_t *peer, gboolean verified_only, uint32_t support_on_action_only)
159 {
160 struct peer_count_data data;
161
162 data.op = op;
163 data.verified_only = verified_only;
164 data.support_action_only = support_on_action_only;
165 data.count = 0;
166 if (peer) {
167 g_hash_table_foreach(peer->devices, count_peer_device, &data);
168 }
169 return data.count;
170 }
171
172
173
174
175
176
177
178
179
180
181
182 static device_properties_t *
183 find_peer_device(const remote_fencing_op_t *op, const peer_device_info_t *peer,
184 const char *device, uint32_t support_action_only)
185 {
186 device_properties_t *props = g_hash_table_lookup(peer->devices, device);
187
188 if (props && support_action_only != st_device_supports_none && !pcmk_is_set(props->device_support_flags, support_action_only)) {
189 return NULL;
190 }
191 return (props && !props->executed[op->phase]
192 && !props->disallowed[op->phase])? props : NULL;
193 }
194
195
196
197
198
199
200
201
202
203
204
205
206 static gboolean
207 grab_peer_device(const remote_fencing_op_t *op, peer_device_info_t *peer,
208 const char *device, gboolean verified_devices_only)
209 {
210 device_properties_t *props = find_peer_device(op, peer, device,
211 fenced_support_flag(op->action));
212
213 if ((props == NULL) || (verified_devices_only && !props->verified)) {
214 return FALSE;
215 }
216
217 crm_trace("Removing %s from %s (%d remaining)",
218 device, peer->host, count_peer_devices(op, peer, FALSE, st_device_supports_none));
219 props->executed[op->phase] = TRUE;
220 return TRUE;
221 }
222
223 static void
224 clear_remote_op_timers(remote_fencing_op_t * op)
225 {
226 if (op->query_timer) {
227 g_source_remove(op->query_timer);
228 op->query_timer = 0;
229 }
230 if (op->op_timer_total) {
231 g_source_remove(op->op_timer_total);
232 op->op_timer_total = 0;
233 }
234 if (op->op_timer_one) {
235 g_source_remove(op->op_timer_one);
236 op->op_timer_one = 0;
237 }
238 }
239
240 static void
241 free_remote_op(gpointer data)
242 {
243 remote_fencing_op_t *op = data;
244
245 crm_log_xml_debug(op->request, "Destroying");
246
247 clear_remote_op_timers(op);
248
249 free(op->id);
250 free(op->action);
251 free(op->delegate);
252 free(op->target);
253 free(op->client_id);
254 free(op->client_name);
255 free(op->originator);
256
257 if (op->query_results) {
258 g_list_free_full(op->query_results, free_remote_query);
259 }
260 if (op->request) {
261 pcmk__xml_free(op->request);
262 op->request = NULL;
263 }
264 if (op->devices_list) {
265 g_list_free_full(op->devices_list, free);
266 op->devices_list = NULL;
267 }
268 g_list_free_full(op->automatic_list, free);
269 g_list_free(op->duplicates);
270
271 pcmk__reset_result(&op->result);
272 free(op);
273 }
274
275 void
276 init_stonith_remote_op_hash_table(GHashTable **table)
277 {
278 if (*table == NULL) {
279 *table = pcmk__strkey_table(NULL, free_remote_op);
280 }
281 }
282
283
284
285
286
287
288
289
290
291 static const char *
292 op_requested_action(const remote_fencing_op_t *op)
293 {
294 return ((op->phase > st_phase_requested)? PCMK_ACTION_REBOOT : op->action);
295 }
296
297
298
299
300
301
302
303 static void
304 op_phase_off(remote_fencing_op_t *op)
305 {
306 crm_info("Remapping multiple-device reboot targeting %s to 'off' "
307 QB_XS " id=%.8s", op->target, op->id);
308 op->phase = st_phase_off;
309
310
311
312
313 strcpy(op->action, PCMK_ACTION_OFF);
314 }
315
316
317
318
319
320
321
322 static void
323 op_phase_on(remote_fencing_op_t *op)
324 {
325 GList *iter = NULL;
326
327 crm_info("Remapped 'off' targeting %s complete, "
328 "remapping to 'on' for %s " QB_XS " id=%.8s",
329 op->target, op->client_name, op->id);
330 op->phase = st_phase_on;
331 strcpy(op->action, PCMK_ACTION_ON);
332
333
334
335
336 for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
337 GList *match = g_list_find_custom(op->devices_list, iter->data,
338 sort_strings);
339
340 if (match) {
341 op->devices_list = g_list_remove(op->devices_list, match->data);
342 }
343 }
344 g_list_free_full(op->automatic_list, free);
345 op->automatic_list = NULL;
346
347
348 op->devices = op->devices_list;
349 }
350
351
352
353
354
355
356
357 static void
358 undo_op_remap(remote_fencing_op_t *op)
359 {
360 if (op->phase > 0) {
361 crm_info("Undoing remap of reboot targeting %s for %s "
362 QB_XS " id=%.8s", op->target, op->client_name, op->id);
363 op->phase = st_phase_requested;
364 strcpy(op->action, PCMK_ACTION_REBOOT);
365 }
366 }
367
368
369
370
371
372
373
374
375
376
377
378 static xmlNode *
379 fencing_result2xml(xmlNode *parent, const remote_fencing_op_t *op)
380 {
381 xmlNode *notify_data = pcmk__xe_create(parent, PCMK__XE_ST_NOTIFY_FENCE);
382
383 crm_xml_add_int(notify_data, PCMK_XA_STATE, op->state);
384 crm_xml_add(notify_data, PCMK__XA_ST_TARGET, op->target);
385 crm_xml_add(notify_data, PCMK__XA_ST_DEVICE_ACTION, op->action);
386 crm_xml_add(notify_data, PCMK__XA_ST_DELEGATE, op->delegate);
387 crm_xml_add(notify_data, PCMK__XA_ST_REMOTE_OP, op->id);
388 crm_xml_add(notify_data, PCMK__XA_ST_ORIGIN, op->originator);
389 crm_xml_add(notify_data, PCMK__XA_ST_CLIENTID, op->client_id);
390 crm_xml_add(notify_data, PCMK__XA_ST_CLIENTNAME, op->client_name);
391
392 return notify_data;
393 }
394
395
396
397
398
399
400
401
402 void
403 fenced_broadcast_op_result(const remote_fencing_op_t *op, bool op_merged)
404 {
405 static int count = 0;
406 xmlNode *bcast = pcmk__xe_create(NULL, PCMK__XE_ST_REPLY);
407 xmlNode *wrapper = NULL;
408 xmlNode *notify_data = NULL;
409
410 count++;
411 crm_trace("Broadcasting result to peers");
412 crm_xml_add(bcast, PCMK__XA_T, PCMK__VALUE_ST_NOTIFY);
413 crm_xml_add(bcast, PCMK__XA_SUBT, PCMK__VALUE_BROADCAST);
414 crm_xml_add(bcast, PCMK__XA_ST_OP, STONITH_OP_NOTIFY);
415 crm_xml_add_int(bcast, PCMK_XA_COUNT, count);
416
417 if (op_merged) {
418 pcmk__xe_set_bool_attr(bcast, PCMK__XA_ST_OP_MERGED, true);
419 }
420
421 wrapper = pcmk__xe_create(bcast, PCMK__XE_ST_CALLDATA);
422 notify_data = fencing_result2xml(wrapper, op);
423 stonith__xe_set_result(notify_data, &op->result);
424
425 pcmk__cluster_send_message(NULL, pcmk_ipc_fenced, bcast);
426 pcmk__xml_free(bcast);
427
428 return;
429 }
430
431
432
433
434
435
436
437
438 static void
439 handle_local_reply_and_notify(remote_fencing_op_t *op, xmlNode *data)
440 {
441 xmlNode *notify_data = NULL;
442 xmlNode *reply = NULL;
443 pcmk__client_t *client = NULL;
444
445 if (op->notify_sent == TRUE) {
446
447 return;
448 }
449
450
451 crm_xml_add_int(data, PCMK_XA_STATE, op->state);
452 crm_xml_add(data, PCMK__XA_ST_TARGET, op->target);
453 crm_xml_add(data, PCMK__XA_ST_OP, op->action);
454
455 reply = fenced_construct_reply(op->request, data, &op->result);
456 crm_xml_add(reply, PCMK__XA_ST_DELEGATE, op->delegate);
457
458
459 client = pcmk__find_client_by_id(op->client_id);
460 if (client == NULL) {
461 crm_trace("Skipping reply to %s: no longer a client", op->client_id);
462 } else {
463 do_local_reply(reply, client, op->call_options);
464 }
465
466
467 notify_data = fencing_result2xml(NULL, op);
468 fenced_send_notification(PCMK__VALUE_ST_NOTIFY_FENCE, &op->result,
469 notify_data);
470 pcmk__xml_free(notify_data);
471 fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL);
472
473
474 op->notify_sent = TRUE;
475 pcmk__xml_free(reply);
476 }
477
478
479
480
481
482
483
484
485 static void
486 finalize_op_duplicates(remote_fencing_op_t *op, xmlNode *data)
487 {
488 for (GList *iter = op->duplicates; iter != NULL; iter = iter->next) {
489 remote_fencing_op_t *other = iter->data;
490
491 if (other->state == st_duplicate) {
492 other->state = op->state;
493 crm_debug("Performing duplicate notification for %s@%s: %s "
494 QB_XS " id=%.8s",
495 other->client_name, other->originator,
496 pcmk_exec_status_str(op->result.execution_status),
497 other->id);
498 pcmk__copy_result(&op->result, &other->result);
499 finalize_op(other, data, true);
500
501 } else {
502
503 crm_err("Skipping duplicate notification for %s@%s "
504 QB_XS " state=%s id=%.8s",
505 other->client_name, other->originator,
506 stonith_op_state_str(other->state), other->id);
507 }
508 }
509 }
510
511 static char *
512 delegate_from_xml(xmlNode *xml)
513 {
514 xmlNode *match = get_xpath_object("//@" PCMK__XA_ST_DELEGATE, xml,
515 LOG_NEVER);
516
517 if (match == NULL) {
518 return crm_element_value_copy(xml, PCMK__XA_SRC);
519 } else {
520 return crm_element_value_copy(match, PCMK__XA_ST_DELEGATE);
521 }
522 }
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540 static void
541 finalize_op(remote_fencing_op_t *op, xmlNode *data, bool dup)
542 {
543 int level = LOG_ERR;
544 const char *subt = NULL;
545 xmlNode *local_data = NULL;
546 gboolean op_merged = FALSE;
547
548 CRM_CHECK((op != NULL), return);
549
550
551 clear_remote_op_timers(op);
552
553 if (op->notify_sent) {
554
555 crm_notice("Operation '%s'%s%s by %s for %s@%s%s: "
556 "Result arrived too late " QB_XS " id=%.8s",
557 op->action, (op->target? " targeting " : ""),
558 (op->target? op->target : ""),
559 (op->delegate? op->delegate : "unknown node"),
560 op->client_name, op->originator,
561 (op_merged? " (merged)" : ""),
562 op->id);
563 return;
564 }
565
566 set_fencing_completed(op);
567 undo_op_remap(op);
568
569 if (data == NULL) {
570 data = pcmk__xe_create(NULL, "remote-op");
571 local_data = data;
572
573 } else if (op->delegate == NULL) {
574 switch (op->result.execution_status) {
575 case PCMK_EXEC_NO_FENCE_DEVICE:
576 break;
577
578 case PCMK_EXEC_INVALID:
579 if (op->result.exit_status != CRM_EX_EXPIRED) {
580 op->delegate = delegate_from_xml(data);
581 }
582 break;
583
584 default:
585 op->delegate = delegate_from_xml(data);
586 break;
587 }
588 }
589
590 if (dup || (crm_element_value(data, PCMK__XA_ST_OP_MERGED) != NULL)) {
591 op_merged = true;
592 }
593
594
595
596
597 subt = crm_element_value(data, PCMK__XA_SUBT);
598 if (!dup && !pcmk__str_eq(subt, PCMK__VALUE_BROADCAST, pcmk__str_none)) {
599
600 fenced_broadcast_op_result(op, op_merged);
601 pcmk__xml_free(local_data);
602 return;
603 }
604
605 if (pcmk__result_ok(&op->result) || dup
606 || !pcmk__str_eq(op->originator, fenced_get_local_node(),
607 pcmk__str_casei)) {
608 level = LOG_NOTICE;
609 }
610 do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s (%s%s%s) "
611 QB_XS " id=%.8s", op->action, (op->target? " targeting " : ""),
612 (op->target? op->target : ""),
613 (op->delegate? op->delegate : "unknown node"),
614 op->client_name, op->originator,
615 (op_merged? " (merged)" : ""),
616 crm_exit_str(op->result.exit_status),
617 pcmk_exec_status_str(op->result.execution_status),
618 ((op->result.exit_reason == NULL)? "" : ": "),
619 ((op->result.exit_reason == NULL)? "" : op->result.exit_reason),
620 op->id);
621
622 handle_local_reply_and_notify(op, data);
623
624 if (!dup) {
625 finalize_op_duplicates(op, data);
626 }
627
628
629
630
631 if (op->query_results) {
632 g_list_free_full(op->query_results, free_remote_query);
633 op->query_results = NULL;
634 }
635 if (op->request) {
636 pcmk__xml_free(op->request);
637 op->request = NULL;
638 }
639
640 pcmk__xml_free(local_data);
641 }
642
643
644
645
646
647
648
649
650
651 static gboolean
652 remote_op_watchdog_done(gpointer userdata)
653 {
654 remote_fencing_op_t *op = userdata;
655
656 op->op_timer_one = 0;
657
658 crm_notice("Self-fencing (%s) by %s for %s assumed complete "
659 QB_XS " id=%.8s",
660 op->action, op->target, op->client_name, op->id);
661 op->state = st_done;
662 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
663 finalize_op(op, NULL, false);
664 return G_SOURCE_REMOVE;
665 }
666
667 static gboolean
668 remote_op_timeout_one(gpointer userdata)
669 {
670 remote_fencing_op_t *op = userdata;
671
672 op->op_timer_one = 0;
673
674 crm_notice("Peer's '%s' action targeting %s for client %s timed out " QB_XS
675 " id=%.8s", op->action, op->target, op->client_name, op->id);
676 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT,
677 "Peer did not return fence result within timeout");
678
679
680 if (op->client_delay > 0) {
681 op->client_delay = 0;
682 crm_trace("Try another device for '%s' action targeting %s "
683 "for client %s without delay " QB_XS " id=%.8s",
684 op->action, op->target, op->client_name, op->id);
685 }
686
687
688 request_peer_fencing(op, NULL);
689 return G_SOURCE_REMOVE;
690 }
691
692
693
694
695
696
697
698
699 static void
700 finalize_timed_out_op(remote_fencing_op_t *op, const char *reason)
701 {
702 crm_debug("Action '%s' targeting %s for client %s timed out "
703 QB_XS " id=%.8s",
704 op->action, op->target, op->client_name, op->id);
705
706 if (op->phase == st_phase_on) {
707
708
709
710
711 op->state = st_done;
712 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
713 } else {
714 op->state = st_failed;
715 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_TIMEOUT, reason);
716 }
717 finalize_op(op, NULL, false);
718 }
719
720
721
722
723
724
725
726
727
728 static gboolean
729 remote_op_timeout(gpointer userdata)
730 {
731 remote_fencing_op_t *op = userdata;
732
733 op->op_timer_total = 0;
734
735 if (op->state == st_done) {
736 crm_debug("Action '%s' targeting %s for client %s already completed "
737 QB_XS " id=%.8s",
738 op->action, op->target, op->client_name, op->id);
739 } else {
740 finalize_timed_out_op(userdata, "Fencing did not complete within a "
741 "total timeout based on the "
742 "configured timeout and retries for "
743 "any devices attempted");
744 }
745 return G_SOURCE_REMOVE;
746 }
747
748 static gboolean
749 remote_op_query_timeout(gpointer data)
750 {
751 remote_fencing_op_t *op = data;
752
753 op->query_timer = 0;
754
755 if (op->state == st_done) {
756 crm_debug("Operation %.8s targeting %s already completed",
757 op->id, op->target);
758 } else if (op->state == st_exec) {
759 crm_debug("Operation %.8s targeting %s already in progress",
760 op->id, op->target);
761 } else if (op->query_results) {
762
763 crm_debug("Query %.8s targeting %s complete (state=%s)",
764 op->id, op->target, stonith_op_state_str(op->state));
765 request_peer_fencing(op, NULL);
766 } else {
767 crm_debug("Query %.8s targeting %s timed out (state=%s)",
768 op->id, op->target, stonith_op_state_str(op->state));
769 finalize_timed_out_op(op, "No capable peers replied to device query "
770 "within timeout");
771 }
772
773 return G_SOURCE_REMOVE;
774 }
775
776 static gboolean
777 topology_is_empty(stonith_topology_t *tp)
778 {
779 int i;
780
781 if (tp == NULL) {
782 return TRUE;
783 }
784
785 for (i = 0; i < ST__LEVEL_COUNT; i++) {
786 if (tp->levels[i] != NULL) {
787 return FALSE;
788 }
789 }
790 return TRUE;
791 }
792
793
794
795
796
797
798
799
800 static void
801 add_required_device(remote_fencing_op_t *op, const char *device)
802 {
803 GList *match = g_list_find_custom(op->automatic_list, device,
804 sort_strings);
805
806 if (!match) {
807 op->automatic_list = g_list_prepend(op->automatic_list,
808 pcmk__str_copy(device));
809 }
810 }
811
812
813
814
815
816
817
818
819 static void
820 remove_required_device(remote_fencing_op_t *op, const char *device)
821 {
822 GList *match = g_list_find_custom(op->automatic_list, device,
823 sort_strings);
824
825 if (match) {
826 op->automatic_list = g_list_remove(op->automatic_list, match->data);
827 }
828 }
829
830
831 static void
832 set_op_device_list(remote_fencing_op_t * op, GList *devices)
833 {
834 GList *lpc = NULL;
835
836 if (op->devices_list) {
837 g_list_free_full(op->devices_list, free);
838 op->devices_list = NULL;
839 }
840 for (lpc = devices; lpc != NULL; lpc = lpc->next) {
841 const char *device = lpc->data;
842
843 op->devices_list = g_list_append(op->devices_list,
844 pcmk__str_copy(device));
845 }
846 op->devices = op->devices_list;
847 }
848
849
850
851
852
853
854
855
856
857
858 static gboolean
859 topology_matches(const stonith_topology_t *tp, const char *node)
860 {
861 regex_t r_patt;
862
863 CRM_CHECK(node && tp && tp->target, return FALSE);
864 switch (tp->kind) {
865 case fenced_target_by_attribute:
866
867
868
869
870
871
872 if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
873 crm_notice("Matched %s with %s by attribute", node, tp->target);
874 return TRUE;
875 }
876 break;
877
878 case fenced_target_by_pattern:
879
880
881
882 if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
883 crm_info("Bad regex '%s' for fencing level", tp->target);
884 } else {
885 int status = regexec(&r_patt, node, 0, NULL, 0);
886
887 regfree(&r_patt);
888 if (status == 0) {
889 crm_notice("Matched %s with %s by name", node, tp->target);
890 return TRUE;
891 }
892 }
893 break;
894
895 case fenced_target_by_name:
896 crm_trace("Testing %s against %s", node, tp->target);
897 return pcmk__str_eq(tp->target, node, pcmk__str_casei);
898
899 default:
900 break;
901 }
902 crm_trace("No match for %s with %s", node, tp->target);
903 return FALSE;
904 }
905
906 stonith_topology_t *
907 find_topology_for_host(const char *host)
908 {
909 GHashTableIter tIter;
910 stonith_topology_t *tp = g_hash_table_lookup(topology, host);
911
912 if(tp != NULL) {
913 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
914 return tp;
915 }
916
917 g_hash_table_iter_init(&tIter, topology);
918 while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
919 if (topology_matches(tp, host)) {
920 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
921 return tp;
922 }
923 }
924
925 crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
926 return NULL;
927 }
928
929
930
931
932
933
934
935
936
937
938
939
940 static int
941 advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
942 {
943 stonith_topology_t *tp = NULL;
944
945 if (op->target) {
946 tp = find_topology_for_host(op->target);
947 }
948 if (topology_is_empty(tp)) {
949 return empty_ok? pcmk_rc_ok : ENODEV;
950 }
951
952 pcmk__assert(tp->levels != NULL);
953
954 stonith__set_call_options(op->call_options, op->id, st_opt_topology);
955
956
957 undo_op_remap(op);
958
959 do {
960 op->level++;
961
962 } while (op->level < ST__LEVEL_COUNT && tp->levels[op->level] == NULL);
963
964 if (op->level < ST__LEVEL_COUNT) {
965 crm_trace("Attempting fencing level %d targeting %s (%d devices) "
966 "for client %s@%s (id=%.8s)",
967 op->level, op->target, g_list_length(tp->levels[op->level]),
968 op->client_name, op->originator, op->id);
969 set_op_device_list(op, tp->levels[op->level]);
970
971
972 if ((op->level > 1) && (op->client_delay > 0)) {
973 op->client_delay = 0;
974 }
975
976 if ((g_list_next(op->devices_list) != NULL)
977 && pcmk__str_eq(op->action, PCMK_ACTION_REBOOT, pcmk__str_none)) {
978
979
980
981
982
983 op_phase_off(op);
984 }
985 return pcmk_rc_ok;
986 }
987
988 crm_info("All %sfencing options targeting %s for client %s@%s failed "
989 QB_XS " id=%.8s",
990 (stonith_watchdog_timeout_ms > 0)?"non-watchdog ":"",
991 op->target, op->client_name, op->originator, op->id);
992 return ENODEV;
993 }
994
995
996
997
998
999
1000
/*!
 * \internal
 * \brief Merge a new operation into equivalent operations already in flight
 *
 * Every other operation in the global table is compared with \p op. When one
 * qualifies, \p op is appended to that operation's duplicates list and put in
 * the st_duplicate state.
 *
 * As a side effect, a candidate whose originator is no longer an active peer
 * is marked st_failed.
 *
 * \param[in,out] op  Newly created operation
 */
static void
merge_duplicates(remote_fencing_op_t *op)
{
    GHashTableIter iter;
    remote_fencing_op_t *other = NULL;

    time_t now = time(NULL);

    g_hash_table_iter_init(&iter, stonith_remote_op_list);
    while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
        // Compare against what the other op was asked to do, ignoring remaps
        const char *other_action = op_requested_action(other);
        pcmk__node_status_t *node = NULL;

        // An operation is never a duplicate of itself
        if (!strcmp(op->id, other->id)) {
            continue;
        }
        // Only operations that are still querying or executing qualify
        if (other->state > st_exec) {
            crm_trace("%.8s not duplicate of %.8s: not in progress",
                      op->id, other->id);
            continue;
        }
        if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
            crm_trace("%.8s not duplicate of %.8s: node %s vs. %s",
                      op->id, other->id, op->target, other->target);
            continue;
        }
        if (!pcmk__str_eq(op->action, other_action, pcmk__str_none)) {
            crm_trace("%.8s not duplicate of %.8s: action %s vs. %s",
                      op->id, other->id, op->action, other_action);
            continue;
        }
        // Requests from the same client name are never merged
        if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
            crm_trace("%.8s not duplicate of %.8s: same client %s",
                      op->id, other->id, op->client_name);
            continue;
        }
        // Nor is anything merged into an operation whose originator is its own target
        if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
            crm_trace("%.8s not duplicate of %.8s: self-fencing for %s",
                      op->id, other->id, other->target);
            continue;
        }

        node = pcmk__get_node(0, other->originator, NULL,
                              pcmk__node_search_cluster_member);

        // The other operation is failed if its originator is not active
        if (!fencing_peer_active(node)) {
            crm_notice("Failing action '%s' targeting %s originating from "
                       "client %s@%s: Originator is dead " QB_XS " id=%.8s",
                       other->action, other->target, other->client_name,
                       other->originator, other->id);
            crm_trace("%.8s not duplicate of %.8s: originator dead",
                      op->id, other->id);
            other->state = st_failed;
            continue;
        }
        // Skip operations that have outlived their own total timeout
        if ((other->total_timeout > 0)
            && (now > (other->total_timeout + other->created))) {
            crm_trace("%.8s not duplicate of %.8s: old (%lld vs. %lld + %ds)",
                      op->id, other->id, (long long)now, (long long)other->created,
                      other->total_timeout);
            continue;
        }

        /* An equivalent request is already in flight, so attach this one to
         * it.
         *
         * NOTE(review): the loop keeps going after a merge, so op could be
         * appended to the duplicates list of more than one operation --
         * confirm whether that is intended.
         */
        other->duplicates = g_list_append(other->duplicates, op);
        if (other->total_timeout == 0) {
            // The floating-point product is implicitly converted on assignment
            other->total_timeout = op->total_timeout =
                TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
            crm_trace("Best guess as to timeout used for %.8s: %ds",
                      other->id, other->total_timeout);
        }
        crm_notice("Merging fencing action '%s' targeting %s originating from "
                   "client %s with identical request from %s@%s "
                   QB_XS " original=%.8s duplicate=%.8s total_timeout=%ds",
                   op->action, op->target, op->client_name,
                   other->client_name, other->originator,
                   op->id, other->id, other->total_timeout);
        report_timeout_period(op, other->total_timeout);
        op->state = st_duplicate;
    }
}
1084
1085 static uint32_t fencing_active_peers(void)
1086 {
1087 uint32_t count = 0;
1088 pcmk__node_status_t *entry = NULL;
1089 GHashTableIter gIter;
1090
1091 g_hash_table_iter_init(&gIter, pcmk__peer_cache);
1092 while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
1093 if(fencing_peer_active(entry)) {
1094 count++;
1095 }
1096 }
1097 return count;
1098 }
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109 int
1110 fenced_handle_manual_confirmation(const pcmk__client_t *client, xmlNode *msg)
1111 {
1112 remote_fencing_op_t *op = NULL;
1113 xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, msg, LOG_ERR);
1114
1115 CRM_CHECK(dev != NULL, return EPROTO);
1116
1117 crm_notice("Received manual confirmation that %s has been fenced",
1118 pcmk__s(crm_element_value(dev, PCMK__XA_ST_TARGET),
1119 "unknown target"));
1120 op = initiate_remote_stonith_op(client, msg, TRUE);
1121 if (op == NULL) {
1122 return EPROTO;
1123 }
1124 op->state = st_done;
1125 set_fencing_completed(op);
1126 op->delegate = pcmk__str_copy("a human");
1127
1128
1129 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1130 finalize_op(op, msg, false);
1131
1132
1133
1134
1135 return EINPROGRESS;
1136 }
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
/*!
 * \brief Create (or find) the record of a remote fencing operation
 *
 * For a peer request that names an operation already in the global table, the
 * existing operation is returned. Otherwise a new operation is allocated,
 * filled in from the request, added to the global table, and checked against
 * in-flight operations for duplicates.
 *
 * \param[in] client   ID of the requesting client
 * \param[in] request  Request XML
 * \param[in] peer     Whether the request came from a peer (in which case the
 *                     operation ID is taken from the request rather than
 *                     generated)
 *
 * \return The operation (a remote_fencing_op_t), or NULL if a peer request
 *         with a target lacks an operation ID
 */
void *
create_remote_stonith_op(const char *client, xmlNode *request, gboolean peer)
{
    remote_fencing_op_t *op = NULL;
    xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_TARGET, request,
                                    LOG_NEVER);
    int rc = pcmk_rc_ok;
    const char *operation = NULL;

    // Make sure the global table exists before it is used
    init_stonith_remote_op_hash_table(&stonith_remote_op_list);

    // A peer's request may refer to an operation that is already recorded
    if (peer && dev) {
        const char *op_id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);

        CRM_CHECK(op_id != NULL, return NULL);

        op = g_hash_table_lookup(stonith_remote_op_list, op_id);
        if (op) {
            crm_debug("Reusing existing remote fencing op %.8s for %s",
                      op_id, ((client == NULL)? "unknown client" : client));
            return op;
        }
    }

    op = pcmk__assert_alloc(1, sizeof(remote_fencing_op_t));

    crm_element_value_int(request, PCMK__XA_ST_TIMEOUT, &(op->base_timeout));

    crm_element_value_int(request, PCMK__XA_ST_DELAY, &(op->client_delay));

    // Peers supply the operation ID; local requests get a fresh UUID
    if (peer && dev) {
        op->id = crm_element_value_copy(dev, PCMK__XA_ST_REMOTE_OP);
    } else {
        op->id = crm_generate_uuid();
    }

    // The table key is op->id itself, which free_remote_op() frees with the op
    g_hash_table_replace(stonith_remote_op_list, op->id, op);

    op->state = st_query;
    op->replies_expected = fencing_active_peers();

    /* NOTE(review): dev comes from a lookup that may return NULL, yet it is
     * read here and below without a check -- confirm that every caller
     * supplies a target
     */
    op->action = crm_element_value_copy(dev, PCMK__XA_ST_DEVICE_ACTION);







    op->originator = crm_element_value_copy(dev, PCMK__XA_ST_ORIGIN);
    if (op->originator == NULL) {
        // No originator in the request, so the local node is recorded
        op->originator = pcmk__str_copy(fenced_get_local_node());
    }


    op->delegate = crm_element_value_copy(dev, PCMK__XA_ST_DELEGATE);
    op->created = time(NULL);

    CRM_LOG_ASSERT(client != NULL);
    op->client_id = pcmk__str_copy(client);


    operation = crm_element_value(request, PCMK__XA_ST_OP);

    // For a relay request, the client name is built from this process's name and PID
    if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
        op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
                                            (unsigned long) getpid());
    } else {
        op->client_name = crm_element_value_copy(request,
                                                 PCMK__XA_ST_CLIENTNAME);
    }

    op->target = crm_element_value_copy(dev, PCMK__XA_ST_TARGET);

    // Keep a private copy of the request
    op->request = pcmk__xml_copy(NULL, request);

    rc = pcmk__xe_get_flags(request, PCMK__XA_ST_CALLOPT, &(op->call_options),
                            0U);
    if (rc != pcmk_rc_ok) {
        crm_warn("Couldn't parse options from request %s: %s",
                 op->id, pcmk_rc_str(rc));
    }

    crm_element_value_int(request, PCMK__XA_ST_CALLID, &(op->client_callid));

    crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
              "base timeout %ds, %u %s expected)",
              (peer && dev)? "Recorded" : "Generated", op->id, op->action,
              op->target, op->client_name, op->base_timeout,
              op->replies_expected,
              pcmk__plural_alt(op->replies_expected, "reply", "replies"));

    // With st_opt_cs_nodeid set, the target is a node ID to turn into a name
    if (op->call_options & st_opt_cs_nodeid) {
        int nodeid;
        pcmk__node_status_t *node = NULL;

        pcmk__scan_min_int(op->target, &nodeid, 0);
        node = pcmk__search_node_caches(nodeid, NULL,
                                        pcmk__node_search_any
                                        |pcmk__node_search_cluster_cib);

        // The option is cleared whether or not the lookup succeeds
        stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);

        if ((node != NULL) && (node->name != NULL)) {
            pcmk__str_update(&(op->target), node->name);

        } else {
            crm_warn("Could not expand nodeid '%s' into a host name", op->target);
        }
    }

    // Attach this operation to an equivalent one already in flight, if any
    merge_duplicates(op);

    if (op->state != st_duplicate) {
        // Only a non-duplicate operation triggers a history notification here
        fenced_send_notification(PCMK__VALUE_ST_NOTIFY_HISTORY, NULL, NULL);
    }

    // Trim the fencing history
    stonith_fence_history_trim();

    return op;
}
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288 remote_fencing_op_t *
1289 initiate_remote_stonith_op(const pcmk__client_t *client, xmlNode *request,
1290 gboolean manual_ack)
1291 {
1292 int query_timeout = 0;
1293 xmlNode *query = NULL;
1294 const char *client_id = NULL;
1295 remote_fencing_op_t *op = NULL;
1296 const char *relay_op_id = NULL;
1297 const char *operation = NULL;
1298
1299 if (client) {
1300 client_id = client->id;
1301 } else {
1302 client_id = crm_element_value(request, PCMK__XA_ST_CLIENTID);
1303 }
1304
1305 CRM_LOG_ASSERT(client_id != NULL);
1306 op = create_remote_stonith_op(client_id, request, FALSE);
1307 op->owner = TRUE;
1308 if (manual_ack) {
1309 return op;
1310 }
1311
1312 CRM_CHECK(op->action, return NULL);
1313
1314 if (advance_topology_level(op, true) != pcmk_rc_ok) {
1315 op->state = st_failed;
1316 }
1317
1318 switch (op->state) {
1319 case st_failed:
1320
1321 pcmk__set_result(&op->result, CRM_EX_ERROR, PCMK_EXEC_ERROR,
1322 "All topology levels failed");
1323 crm_warn("Could not request peer fencing (%s) targeting %s "
1324 QB_XS " id=%.8s", op->action, op->target, op->id);
1325 finalize_op(op, NULL, false);
1326 return op;
1327
1328 case st_duplicate:
1329 crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
1330 QB_XS " id=%.8s", op->action, op->target, op->id);
1331 return op;
1332
1333 default:
1334 crm_notice("Requesting peer fencing (%s) targeting %s "
1335 QB_XS " id=%.8s state=%s base_timeout=%ds",
1336 op->action, op->target, op->id,
1337 stonith_op_state_str(op->state), op->base_timeout);
1338 }
1339
1340 query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
1341 NULL, op->call_options);
1342
1343 crm_xml_add(query, PCMK__XA_ST_REMOTE_OP, op->id);
1344 crm_xml_add(query, PCMK__XA_ST_TARGET, op->target);
1345 crm_xml_add(query, PCMK__XA_ST_DEVICE_ACTION, op_requested_action(op));
1346 crm_xml_add(query, PCMK__XA_ST_ORIGIN, op->originator);
1347 crm_xml_add(query, PCMK__XA_ST_CLIENTID, op->client_id);
1348 crm_xml_add(query, PCMK__XA_ST_CLIENTNAME, op->client_name);
1349 crm_xml_add_int(query, PCMK__XA_ST_TIMEOUT, op->base_timeout);
1350
1351
1352 operation = crm_element_value(request, PCMK__XA_ST_OP);
1353 if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1354 relay_op_id = crm_element_value(request, PCMK__XA_ST_REMOTE_OP);
1355 if (relay_op_id) {
1356 crm_xml_add(query, PCMK__XA_ST_REMOTE_OP_RELAY, relay_op_id);
1357 }
1358 }
1359
1360 pcmk__cluster_send_message(NULL, pcmk_ipc_fenced, query);
1361 pcmk__xml_free(query);
1362
1363 query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
1364 op->query_timer = pcmk__create_timer((1000 * query_timeout), remote_op_query_timeout, op);
1365
1366 return op;
1367 }
1368
/* Bit flags restricting which query results find_best_peer() may return */
enum find_best_peer_options {
    /* Skip any peer whose host name matches the fencing target */
    FIND_PEER_SKIP_TARGET   = (1 << 0),

    /* Consider only the peer whose host name matches the fencing target */
    FIND_PEER_TARGET_ONLY   = (1 << 1),

    /* Request that device lookups be limited to verified devices */
    FIND_PEER_VERIFIED_ONLY = (1 << 2),
};
1377
1378 static bool
1379 is_watchdog_fencing(const remote_fencing_op_t *op, const char *device)
1380 {
1381 return (stonith_watchdog_timeout_ms > 0
1382
1383 && pcmk__str_eq(device, STONITH_WATCHDOG_ID, pcmk__str_null_matches)
1384 && pcmk__is_fencing_action(op->action)
1385 && node_does_watchdog_fencing(op->target));
1386 }
1387
1388 static peer_device_info_t *
1389 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
1390 {
1391 GList *iter = NULL;
1392 gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1393
1394 if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
1395 return NULL;
1396 }
1397
1398 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1399 peer_device_info_t *peer = iter->data;
1400
1401 crm_trace("Testing result from %s targeting %s with %d device%s: %d %x",
1402 peer->host, op->target, peer->ndevices,
1403 pcmk__plural_s(peer->ndevices), peer->tried, options);
1404 if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1405 continue;
1406 }
1407 if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1408 continue;
1409 }
1410
1411 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1412
1413 if (grab_peer_device(op, peer, device, verified_devices_only)) {
1414 return peer;
1415 }
1416
1417 } else if (!peer->tried
1418 && count_peer_devices(op, peer, verified_devices_only,
1419 fenced_support_flag(op->action))) {
1420
1421 crm_trace("Simple fencing");
1422 return peer;
1423 }
1424 }
1425
1426 return NULL;
1427 }
1428
1429 static peer_device_info_t *
1430 stonith_choose_peer(remote_fencing_op_t * op)
1431 {
1432 const char *device = NULL;
1433 peer_device_info_t *peer = NULL;
1434 uint32_t active = fencing_active_peers();
1435
1436 do {
1437 if (op->devices) {
1438 device = op->devices->data;
1439 crm_trace("Checking for someone to fence (%s) %s using %s",
1440 op->action, op->target, device);
1441 } else {
1442 crm_trace("Checking for someone to fence (%s) %s",
1443 op->action, op->target);
1444 }
1445
1446
1447 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
1448 if (peer) {
1449 crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
1450 return peer;
1451 }
1452
1453 if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
1454 crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
1455 return NULL;
1456 }
1457
1458
1459 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
1460 if (peer) {
1461 crm_trace("Found best unverified peer %s", peer->host);
1462 return peer;
1463 }
1464
1465
1466
1467
1468 if (op->phase != st_phase_on) {
1469 peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
1470 if (peer) {
1471 crm_trace("%s will fence itself", peer->host);
1472 return peer;
1473 }
1474 }
1475
1476
1477
1478
1479 } while ((op->phase != st_phase_on)
1480 && pcmk_is_set(op->call_options, st_opt_topology)
1481 && (advance_topology_level(op, false) == pcmk_rc_ok));
1482
1483
1484
1485
1486 if (is_watchdog_fencing(op, device)) {
1487 crm_info("Couldn't contact watchdog-fencing target-node (%s)",
1488 op->target);
1489
1490 } else {
1491 crm_notice("Couldn't find anyone to fence (%s) %s using %s",
1492 op->action, op->target, (device? device : "any device"));
1493 }
1494 return NULL;
1495 }
1496
1497 static int
1498 valid_fencing_timeout(int specified_timeout, bool action_specific,
1499 const remote_fencing_op_t *op, const char *device)
1500 {
1501 int timeout = specified_timeout;
1502
1503 if (!is_watchdog_fencing(op, device)) {
1504 return timeout;
1505 }
1506
1507 timeout = (int) QB_MIN(QB_MAX(specified_timeout,
1508 pcmk__timeout_ms2s(stonith_watchdog_timeout_ms)),
1509 INT_MAX);
1510
1511 if (timeout > specified_timeout) {
1512 if (action_specific) {
1513 crm_warn("pcmk_%s_timeout %ds for %s is too short (must be >= "
1514 PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds "
1515 "instead",
1516 op->action, specified_timeout, device? device : "watchdog",
1517 timeout, timeout);
1518
1519 } else {
1520 crm_warn("Fencing timeout %ds is too short (must be >= "
1521 PCMK_OPT_STONITH_WATCHDOG_TIMEOUT " %ds), using %ds "
1522 "instead",
1523 specified_timeout, timeout, timeout);
1524 }
1525 }
1526
1527 return timeout;
1528 }
1529
1530 static int
1531 get_device_timeout(const remote_fencing_op_t *op,
1532 const peer_device_info_t *peer, const char *device,
1533 bool with_delay)
1534 {
1535 int timeout = op->base_timeout;
1536 device_properties_t *props;
1537
1538 timeout = valid_fencing_timeout(op->base_timeout, false, op, device);
1539
1540 if (!peer || !device) {
1541 return timeout;
1542 }
1543
1544 props = g_hash_table_lookup(peer->devices, device);
1545 if (!props) {
1546 return timeout;
1547 }
1548
1549 if (props->custom_action_timeout[op->phase]) {
1550 timeout = valid_fencing_timeout(props->custom_action_timeout[op->phase],
1551 true, op, device);
1552 }
1553
1554
1555 if (with_delay && (op->client_delay >= 0)) {
1556
1557 timeout += (props->delay_max[op->phase] > 0 ?
1558 props->delay_max[op->phase] : props->delay_base[op->phase]);
1559 }
1560
1561 return timeout;
1562 }
1563
/* Accumulator handed to add_device_timeout() as its user data */
struct timeout_data {
    const remote_fencing_op_t *op;      // operation whose phase selects per-device values
    const peer_device_info_t *peer;     // peer whose devices are being summed
    int total_timeout;                  // running sum of the device timeouts
};
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578 static void
1579 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
1580 {
1581 const char *device_id = key;
1582 device_properties_t *props = value;
1583 struct timeout_data *timeout = user_data;
1584
1585 if (!props->executed[timeout->op->phase]
1586 && !props->disallowed[timeout->op->phase]) {
1587 timeout->total_timeout += get_device_timeout(timeout->op, timeout->peer,
1588 device_id, true);
1589 }
1590 }
1591
1592 static int
1593 get_peer_timeout(const remote_fencing_op_t *op, const peer_device_info_t *peer)
1594 {
1595 struct timeout_data timeout;
1596
1597 timeout.op = op;
1598 timeout.peer = peer;
1599 timeout.total_timeout = 0;
1600
1601 g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1602
1603 return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1604 }
1605
1606 static int
1607 get_op_total_timeout(const remote_fencing_op_t *op,
1608 const peer_device_info_t *chosen_peer)
1609 {
1610 long long total_timeout = 0;
1611 stonith_topology_t *tp = find_topology_for_host(op->target);
1612
1613 if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
1614 int i;
1615 GList *device_list = NULL;
1616 GList *iter = NULL;
1617 GList *auto_list = NULL;
1618
1619 if (pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none)
1620 && (op->automatic_list != NULL)) {
1621 auto_list = g_list_copy(op->automatic_list);
1622 }
1623
1624
1625
1626
1627
1628
1629
1630
1631 for (i = 0; i < ST__LEVEL_COUNT; i++) {
1632 if (!tp->levels[i]) {
1633 continue;
1634 }
1635 for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1636 bool found = false;
1637
1638 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1639 const peer_device_info_t *peer = iter->data;
1640
1641 if (auto_list) {
1642 GList *match = g_list_find_custom(auto_list, device_list->data,
1643 sort_strings);
1644 if (match) {
1645 auto_list = g_list_remove(auto_list, match->data);
1646 }
1647 }
1648
1649 if (find_peer_device(op, peer, device_list->data,
1650 fenced_support_flag(op->action))) {
1651 total_timeout += get_device_timeout(op, peer,
1652 device_list->data,
1653 true);
1654 found = true;
1655 break;
1656 }
1657 }
1658
1659
1660
1661
1662 if (!found && is_watchdog_fencing(op, device_list->data)) {
1663 total_timeout += pcmk__timeout_ms2s(stonith_watchdog_timeout_ms);
1664 }
1665 }
1666 }
1667
1668
1669 if (auto_list) {
1670 for (iter = auto_list; iter != NULL; iter = iter->next) {
1671 GList *iter2 = NULL;
1672
1673 for (iter2 = op->query_results; iter2 != NULL; iter = iter2->next) {
1674 peer_device_info_t *peer = iter2->data;
1675 if (find_peer_device(op, peer, iter->data, st_device_supports_on)) {
1676 total_timeout += get_device_timeout(op, peer,
1677 iter->data, true);
1678 break;
1679 }
1680 }
1681 }
1682 }
1683
1684 g_list_free(auto_list);
1685
1686 } else if (chosen_peer) {
1687 total_timeout = get_peer_timeout(op, chosen_peer);
1688
1689 } else {
1690 total_timeout = valid_fencing_timeout(op->base_timeout, false, op,
1691 NULL);
1692 }
1693
1694 if (total_timeout <= 0) {
1695 total_timeout = op->base_timeout;
1696 }
1697
1698
1699
1700
1701 if (op->client_delay > 0) {
1702 total_timeout += op->client_delay;
1703 }
1704 return (int) QB_MIN(total_timeout, INT_MAX);
1705 }
1706
1707 static void
1708 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
1709 {
1710 GList *iter = NULL;
1711 xmlNode *update = NULL;
1712 const char *client_node = NULL;
1713 const char *client_id = NULL;
1714 const char *call_id = NULL;
1715
1716 if (op->call_options & st_opt_sync_call) {
1717
1718
1719
1720
1721 return;
1722 } else if (!op->request) {
1723 return;
1724 }
1725
1726 crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
1727 client_node = crm_element_value(op->request, PCMK__XA_ST_CLIENTNODE);
1728 call_id = crm_element_value(op->request, PCMK__XA_ST_CALLID);
1729 client_id = crm_element_value(op->request, PCMK__XA_ST_CLIENTID);
1730 if (!client_node || !call_id || !client_id) {
1731 return;
1732 }
1733
1734 if (pcmk__str_eq(client_node, fenced_get_local_node(), pcmk__str_casei)) {
1735
1736 do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1737 return;
1738 }
1739
1740
1741 update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1742 crm_xml_add(update, PCMK__XA_ST_REMOTE_OP, op->id);
1743 crm_xml_add(update, PCMK__XA_ST_CLIENTID, client_id);
1744 crm_xml_add(update, PCMK__XA_ST_CALLID, call_id);
1745 crm_xml_add_int(update, PCMK__XA_ST_TIMEOUT, op_timeout);
1746
1747 pcmk__cluster_send_message(pcmk__get_node(0, client_node, NULL,
1748 pcmk__node_search_cluster_member),
1749 pcmk_ipc_fenced, update);
1750
1751 pcmk__xml_free(update);
1752
1753 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1754 remote_fencing_op_t *dup = iter->data;
1755
1756 crm_trace("Reporting timeout for duplicate %.8s to client %s",
1757 dup->id, dup->client_name);
1758 report_timeout_period(iter->data, op_timeout);
1759 }
1760 }
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770 static void
1771 advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
1772 xmlNode *msg)
1773 {
1774
1775 if (op->devices) {
1776 op->devices = op->devices->next;
1777 }
1778
1779
1780 if ((op->phase == st_phase_requested)
1781 && pcmk__str_eq(op->action, PCMK_ACTION_ON, pcmk__str_none)) {
1782
1783 remove_required_device(op, device);
1784
1785
1786
1787
1788 if (op->devices == NULL) {
1789 op->devices = op->automatic_list;
1790 }
1791 }
1792
1793 if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1794
1795
1796
1797
1798 op_phase_on(op);
1799 }
1800
1801
1802 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
1803
1804 if (op->devices) {
1805
1806 crm_trace("Next targeting %s on behalf of %s@%s",
1807 op->target, op->client_name, op->originator);
1808
1809
1810 if (op->client_delay > 0) {
1811 op->client_delay = 0;
1812 }
1813
1814 request_peer_fencing(op, NULL);
1815 } else {
1816
1817 crm_trace("Marking complex fencing op targeting %s as complete",
1818 op->target);
1819 op->state = st_done;
1820 finalize_op(op, msg, false);
1821 }
1822 }
1823
1824 static gboolean
1825 check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
1826 {
1827 if (node_does_watchdog_fencing(op->target)) {
1828 guint timeout_ms = QB_MIN(stonith_watchdog_timeout_ms, UINT_MAX);
1829
1830 crm_notice("Waiting %s for %s to self-fence (%s) for "
1831 "client %s " QB_XS " id=%.8s",
1832 pcmk__readable_interval(timeout_ms), op->target, op->action,
1833 op->client_name, op->id);
1834
1835 if (op->op_timer_one) {
1836 g_source_remove(op->op_timer_one);
1837 }
1838 op->op_timer_one = pcmk__create_timer(timeout_ms, remote_op_watchdog_done,
1839 op);
1840 return TRUE;
1841 } else {
1842 crm_debug("Skipping fallback to watchdog-fencing as %s is "
1843 "not in host-list", op->target);
1844 }
1845 return FALSE;
1846 }
1847
1848
1849
1850
1851
1852
1853
1854
1855
/*!
 * \internal
 * \brief Ask a peer to execute the current step of a fencing operation
 *
 * Sets up the operation's total timer on first use, picks a peer (and, with a
 * topology, the current device), sends the fence request, and arms a
 * per-request timer. If no peer is available, the operation is either
 * advanced, failed, or left waiting for more query replies.
 *
 * \param[in,out] op    Fencing operation (nothing is done if NULL)
 * \param[in,out] peer  Peer to ask, or NULL to choose one; when a topology is
 *                      in use and devices remain, the peer is always chosen
 *                      by stonith_choose_peer() instead
 */
static void
request_peer_fencing(remote_fencing_op_t *op, peer_device_info_t *peer)
{
    const char *device = NULL;
    int timeout;

    CRM_CHECK(op != NULL, return);

    crm_trace("Action %.8s targeting %s for %s is %s",
              op->id, op->target, op->client_name,
              stonith_op_state_str(op->state));

    if ((op->phase == st_phase_on) && (op->devices != NULL)) {
        /* In the "on" phase, skip (by advancing past) any device that is
         * configured to stay off or whose agent can't do "on"
         */
        device = op->devices->data;
        if (pcmk__str_eq(fenced_device_reboot_action(device), PCMK_ACTION_OFF,
                         pcmk__str_none)) {
            crm_info("Not turning %s back on using %s because the device is "
                     "configured to stay off (pcmk_reboot_action='off')",
                     op->target, device);
            advance_topology_device_in_level(op, device, NULL);
            return;
        }
        if (!fenced_device_supports_on(device)) {
            crm_info("Not turning %s back on using %s because the agent "
                     "doesn't support 'on'", op->target, device);
            advance_topology_device_in_level(op, device, NULL);
            return;
        }
    }

    timeout = op->base_timeout;
    // Without a topology, pick a peer only if the caller didn't supply one
    if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
        peer = stonith_choose_peer(op);
    }

    // The total timer is created once per operation and reported to the client
    if (!op->op_timer_total) {
        op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, peer);
        op->op_timer_total = pcmk__create_timer(1000 * op->total_timeout, remote_op_timeout, op);
        report_timeout_period(op, op->total_timeout);
        crm_info("Total timeout set to %ds for peer's fencing targeting %s for %s "
                 QB_XS " id=%.8s",
                 op->total_timeout, op->target, op->client_name, op->id);
    }

    if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
        /* With a topology, the caller's peer is overridden: choose a peer for
         * the current device (stonith_choose_peer() may advance the level, so
         * the device is read afterward)
         */
        peer = stonith_choose_peer(op);

        device = op->devices->data;

        // The timeout sent to the peer excludes the device's delay
        timeout = get_device_timeout(op, peer, device, false);
    }

    if (peer) {
        int timeout_one = 0;
        xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
        const pcmk__node_status_t *peer_node =
            pcmk__get_node(0, peer->host, NULL,
                           pcmk__node_search_cluster_member);

        if (op->client_delay > 0) {
            // Budget for the client-requested delay in the per-request timer
            timeout_one = TIMEOUT_MULTIPLY_FACTOR * op->client_delay;
        }

        crm_xml_add(remote_op, PCMK__XA_ST_REMOTE_OP, op->id);
        crm_xml_add(remote_op, PCMK__XA_ST_TARGET, op->target);
        crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ACTION, op->action);
        crm_xml_add(remote_op, PCMK__XA_ST_ORIGIN, op->originator);
        crm_xml_add(remote_op, PCMK__XA_ST_CLIENTID, op->client_id);
        crm_xml_add(remote_op, PCMK__XA_ST_CLIENTNAME, op->client_name);
        crm_xml_add_int(remote_op, PCMK__XA_ST_TIMEOUT, timeout);
        crm_xml_add_int(remote_op, PCMK__XA_ST_CALLOPT, op->call_options);
        crm_xml_add_int(remote_op, PCMK__XA_ST_DELAY, op->client_delay);

        if (device) {
            // Specific device: its own timeout (with delay) bounds the request
            timeout_one += TIMEOUT_MULTIPLY_FACTOR *
                           get_device_timeout(op, peer, device, true);
            crm_notice("Requesting that %s perform '%s' action targeting %s "
                       "using %s " QB_XS " for client %s (%ds)",
                       peer->host, op->action, op->target, device,
                       op->client_name, timeout_one);
            crm_xml_add(remote_op, PCMK__XA_ST_DEVICE_ID, device);

        } else {
            // No specific device: budget for all of the peer's eligible devices
            timeout_one += TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
            crm_notice("Requesting that %s perform '%s' action targeting %s "
                       QB_XS " for client %s (%ds, %s)",
                       peer->host, op->action, op->target, op->client_name,
                       timeout_one,
                       pcmk__readable_interval(stonith_watchdog_timeout_ms));
        }

        op->state = st_exec;
        if (op->op_timer_one) {
            g_source_remove(op->op_timer_one);
            op->op_timer_one = 0;
        }

        /* Arm the ordinary per-request timer unless a watchdog wait timer was
         * started instead by check_watchdog_fencing_and_wait()
         */
        if (!is_watchdog_fencing(op, device)
            || !check_watchdog_fencing_and_wait(op)) {

            op->op_timer_one = pcmk__create_timer((1000 * timeout_one), remote_op_timeout_one, op);
        }

        pcmk__cluster_send_message(peer_node, pcmk_ipc_fenced, remote_op);
        peer->tried = TRUE;
        pcmk__xml_free(remote_op);
        return;

    } else if (op->phase == st_phase_on) {
        /* No peer can run "on" in the "on" phase: warn and move on to the
         * next device rather than failing the operation
         */
        crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
                 "after successful 'off'", device, op->target);
        advance_topology_device_in_level(op, device, NULL);
        return;

    } else if (op->owner == FALSE) {
        crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
                op->action, op->target, op->client_name);

    } else if (op->query_timer == 0) {
        // No query timer is set and no peer is left to try
        crm_info("No remaining peers capable of fencing (%s) %s for client %s "
                 QB_XS " state=%s", op->action, op->target, op->client_name,
                 stonith_op_state_str(op->state));
        CRM_CHECK(op->state < st_done, return);
        finalize_timed_out_op(op, "All nodes failed, or are unable, to "
                                  "fence target");

    } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
        // Every expected (or active) peer has replied, yet none is usable

        if (is_watchdog_fencing(op, device)
            && check_watchdog_fencing_and_wait(op)) {
            /* A watchdog wait was started, so mark the operation as executing
             * and leave it to the watchdog timer
             */
            op->state = st_exec;
            return;
        }

        if (op->state == st_query) {
            crm_info("No peers (out of %d) have devices capable of fencing "
                     "(%s) %s for client %s " QB_XS " state=%s",
                     op->replies, op->action, op->target, op->client_name,
                     stonith_op_state_str(op->state));

            pcmk__reset_result(&op->result);
            pcmk__set_result(&op->result, CRM_EX_ERROR,
                             PCMK_EXEC_NO_FENCE_DEVICE, NULL);
        } else {
            if (pcmk_is_set(op->call_options, st_opt_topology)) {
                pcmk__reset_result(&op->result);
                pcmk__set_result(&op->result, CRM_EX_ERROR,
                                 PCMK_EXEC_NO_FENCE_DEVICE, NULL);
            }
            // Without a topology, the result already stored in op is kept

            crm_info("No peers (out of %d) are capable of fencing (%s) %s "
                     "for client %s " QB_XS " state=%s",
                     op->replies, op->action, op->target, op->client_name,
                     stonith_op_state_str(op->state));
        }

        op->state = st_failed;
        finalize_op(op, NULL, false);

    } else {
        // Some replies are still outstanding, so keep waiting
        crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
                 "for client %s " QB_XS " id=%.8s",
                 op->action, op->target, (device? " using " : ""),
                 (device? device : ""), op->client_name, op->id);
    }
}
2085
2086
2087
2088
2089
2090
2091
2092
2093
2094
2095
2096
2097 static gint
2098 sort_peers(gconstpointer a, gconstpointer b)
2099 {
2100 const peer_device_info_t *peer_a = a;
2101 const peer_device_info_t *peer_b = b;
2102
2103 return (peer_b->ndevices - peer_a->ndevices);
2104 }
2105
2106
2107
2108
2109
2110
2111
2112 static gboolean
2113 all_topology_devices_found(const remote_fencing_op_t *op)
2114 {
2115 GList *device = NULL;
2116 GList *iter = NULL;
2117 device_properties_t *match = NULL;
2118 stonith_topology_t *tp = NULL;
2119 gboolean skip_target = FALSE;
2120 int i;
2121
2122 tp = find_topology_for_host(op->target);
2123 if (!tp) {
2124 return FALSE;
2125 }
2126 if (pcmk__is_fencing_action(op->action)) {
2127
2128
2129 skip_target = TRUE;
2130 }
2131
2132 for (i = 0; i < ST__LEVEL_COUNT; i++) {
2133 for (device = tp->levels[i]; device; device = device->next) {
2134 match = NULL;
2135 for (iter = op->query_results; iter && !match; iter = iter->next) {
2136 peer_device_info_t *peer = iter->data;
2137
2138 if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
2139 continue;
2140 }
2141 match = find_peer_device(op, peer, device->data, st_device_supports_none);
2142 }
2143 if (!match) {
2144 return FALSE;
2145 }
2146 }
2147 }
2148
2149 return TRUE;
2150 }
2151
2152
2153
2154
2155
2156
2157
2158
2159
2160
2161
2162
2163
2164 static void
2165 parse_action_specific(const xmlNode *xml, const char *peer, const char *device,
2166 const char *action, remote_fencing_op_t *op,
2167 enum st_remap_phase phase, device_properties_t *props)
2168 {
2169 props->custom_action_timeout[phase] = 0;
2170 crm_element_value_int(xml, PCMK__XA_ST_ACTION_TIMEOUT,
2171 &props->custom_action_timeout[phase]);
2172 if (props->custom_action_timeout[phase]) {
2173 crm_trace("Peer %s with device %s returned %s action timeout %ds",
2174 peer, device, action, props->custom_action_timeout[phase]);
2175 }
2176
2177 props->delay_max[phase] = 0;
2178 crm_element_value_int(xml, PCMK__XA_ST_DELAY_MAX, &props->delay_max[phase]);
2179 if (props->delay_max[phase]) {
2180 crm_trace("Peer %s with device %s returned maximum of random delay %ds for %s",
2181 peer, device, props->delay_max[phase], action);
2182 }
2183
2184 props->delay_base[phase] = 0;
2185 crm_element_value_int(xml, PCMK__XA_ST_DELAY_BASE,
2186 &props->delay_base[phase]);
2187 if (props->delay_base[phase]) {
2188 crm_trace("Peer %s with device %s returned base delay %ds for %s",
2189 peer, device, props->delay_base[phase], action);
2190 }
2191
2192
2193 if (pcmk__str_eq(action, PCMK_ACTION_ON, pcmk__str_none)) {
2194 int required = 0;
2195
2196 crm_element_value_int(xml, PCMK__XA_ST_REQUIRED, &required);
2197 if (required) {
2198 crm_trace("Peer %s requires device %s to execute for action %s",
2199 peer, device, action);
2200 add_required_device(op, device);
2201 }
2202 }
2203
2204
2205
2206
2207 if (pcmk__xe_attr_is_true(xml, PCMK__XA_ST_ACTION_DISALLOWED)) {
2208 props->disallowed[phase] = TRUE;
2209 crm_trace("Peer %s is disallowed from executing %s for device %s",
2210 peer, action, device);
2211 }
2212 }
2213
2214
2215
2216
2217
2218
2219
2220
2221
2222
2223 static void
2224 add_device_properties(const xmlNode *xml, remote_fencing_op_t *op,
2225 peer_device_info_t *peer, const char *device)
2226 {
2227 xmlNode *child;
2228 int verified = 0;
2229 device_properties_t *props =
2230 pcmk__assert_alloc(1, sizeof(device_properties_t));
2231 int rc = pcmk_rc_ok;
2232
2233
2234 g_hash_table_insert(peer->devices, pcmk__str_copy(device), props);
2235
2236
2237 crm_element_value_int(xml, PCMK__XA_ST_MONITOR_VERIFIED, &verified);
2238 if (verified) {
2239 crm_trace("Peer %s has confirmed a verified device %s",
2240 peer->host, device);
2241 props->verified = TRUE;
2242 }
2243
2244
2245 rc = pcmk__xe_get_flags(xml, PCMK__XA_ST_DEVICE_SUPPORT_FLAGS,
2246 &(props->device_support_flags),
2247 st_device_supports_on);
2248 if (rc != pcmk_rc_ok) {
2249 crm_warn("Couldn't determine device support for %s "
2250 "(assuming unfencing): %s", device, pcmk_rc_str(rc));
2251 }
2252
2253
2254 parse_action_specific(xml, peer->host, device, op_requested_action(op),
2255 op, st_phase_requested, props);
2256 for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL;
2257 child = pcmk__xe_next(child, NULL)) {
2258
2259
2260
2261
2262 if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_OFF, pcmk__str_none)) {
2263 parse_action_specific(child, peer->host, device, PCMK_ACTION_OFF,
2264 op, st_phase_off, props);
2265
2266 } else if (pcmk__str_eq(pcmk__xe_id(child), PCMK_ACTION_ON,
2267 pcmk__str_none)) {
2268 parse_action_specific(child, peer->host, device, PCMK_ACTION_ON,
2269 op, st_phase_on, props);
2270 }
2271 }
2272 }
2273
2274
2275
2276
2277
2278
2279
2280
2281
2282
2283
2284
2285 static peer_device_info_t *
2286 add_result(remote_fencing_op_t *op, const char *host, int ndevices,
2287 const xmlNode *xml)
2288 {
2289 peer_device_info_t *peer = pcmk__assert_alloc(1,
2290 sizeof(peer_device_info_t));
2291 xmlNode *child;
2292
2293 peer->host = pcmk__str_copy(host);
2294 peer->devices = pcmk__strkey_table(free, free);
2295
2296
2297 for (child = pcmk__xe_first_child(xml, NULL, NULL, NULL); child != NULL;
2298 child = pcmk__xe_next(child, NULL)) {
2299 const char *device = pcmk__xe_id(child);
2300
2301 if (device) {
2302 add_device_properties(child, op, peer, device);
2303 }
2304 }
2305
2306 peer->ndevices = g_hash_table_size(peer->devices);
2307 CRM_CHECK(ndevices == peer->ndevices,
2308 crm_err("Query claimed to have %d device%s but %d found",
2309 ndevices, pcmk__plural_s(ndevices), peer->ndevices));
2310
2311 op->query_results = g_list_insert_sorted(op->query_results, peer, sort_peers);
2312 return peer;
2313 }
2314
2315
2316
2317
2318
2319
2320
2321
2322
2323
2324
2325
2326
2327
2328
2329 int
2330 process_remote_stonith_query(xmlNode *msg)
2331 {
2332 int ndevices = 0;
2333 gboolean host_is_target = FALSE;
2334 gboolean have_all_replies = FALSE;
2335 const char *id = NULL;
2336 const char *host = NULL;
2337 remote_fencing_op_t *op = NULL;
2338 peer_device_info_t *peer = NULL;
2339 uint32_t replies_expected;
2340 xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_REMOTE_OP, msg, LOG_ERR);
2341
2342 CRM_CHECK(dev != NULL, return -EPROTO);
2343
2344 id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
2345 CRM_CHECK(id != NULL, return -EPROTO);
2346
2347 dev = get_xpath_object("//@" PCMK__XA_ST_AVAILABLE_DEVICES, msg, LOG_ERR);
2348 CRM_CHECK(dev != NULL, return -EPROTO);
2349 crm_element_value_int(dev, PCMK__XA_ST_AVAILABLE_DEVICES, &ndevices);
2350
2351 op = g_hash_table_lookup(stonith_remote_op_list, id);
2352 if (op == NULL) {
2353 crm_debug("Received query reply for unknown or expired operation %s",
2354 id);
2355 return -EOPNOTSUPP;
2356 }
2357
2358 replies_expected = fencing_active_peers();
2359 if (op->replies_expected < replies_expected) {
2360 replies_expected = op->replies_expected;
2361 }
2362 if ((++op->replies >= replies_expected) && (op->state == st_query)) {
2363 have_all_replies = TRUE;
2364 }
2365 host = crm_element_value(msg, PCMK__XA_SRC);
2366 host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);
2367
2368 crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
2369 op->replies, replies_expected, host,
2370 op->target, op->action, ndevices, pcmk__plural_s(ndevices), id);
2371 if (ndevices > 0) {
2372 peer = add_result(op, host, ndevices, dev);
2373 }
2374
2375 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2376
2377 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2378
2379
2380
2381 if (op->state == st_query && all_topology_devices_found(op)) {
2382
2383 crm_trace("All topology devices found");
2384 request_peer_fencing(op, peer);
2385
2386 } else if (have_all_replies) {
2387 crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
2388 replies_expected, op->replies);
2389 request_peer_fencing(op, NULL);
2390 }
2391
2392 } else if (op->state == st_query) {
2393 int nverified = count_peer_devices(op, peer, TRUE,
2394 fenced_support_flag(op->action));
2395
2396
2397
2398 if ((peer != NULL) && !host_is_target && nverified) {
2399
2400 crm_trace("Found %d verified device%s",
2401 nverified, pcmk__plural_s(nverified));
2402 request_peer_fencing(op, peer);
2403
2404 } else if (have_all_replies) {
2405 crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
2406 replies_expected, op->replies);
2407 request_peer_fencing(op, NULL);
2408
2409 } else {
2410 crm_trace("Waiting for more peer results before launching fencing operation");
2411 }
2412
2413 } else if ((peer != NULL) && (op->state == st_done)) {
2414 crm_info("Discarding query result from %s (%d device%s): "
2415 "Operation is %s", peer->host,
2416 peer->ndevices, pcmk__plural_s(peer->ndevices),
2417 stonith_op_state_str(op->state));
2418 }
2419
2420 return pcmk_ok;
2421 }
2422
2423
2424
2425
2426
2427
2428
2429
2430
2431
2432 void
2433 fenced_process_fencing_reply(xmlNode *msg)
2434 {
2435 const char *id = NULL;
2436 const char *device = NULL;
2437 remote_fencing_op_t *op = NULL;
2438 xmlNode *dev = get_xpath_object("//@" PCMK__XA_ST_REMOTE_OP, msg, LOG_ERR);
2439 pcmk__action_result_t result = PCMK__UNKNOWN_RESULT;
2440
2441 CRM_CHECK(dev != NULL, return);
2442
2443 id = crm_element_value(dev, PCMK__XA_ST_REMOTE_OP);
2444 CRM_CHECK(id != NULL, return);
2445
2446 dev = stonith__find_xe_with_result(msg);
2447 CRM_CHECK(dev != NULL, return);
2448
2449 stonith__xe_get_result(dev, &result);
2450
2451 device = crm_element_value(dev, PCMK__XA_ST_DEVICE_ID);
2452
2453 if (stonith_remote_op_list) {
2454 op = g_hash_table_lookup(stonith_remote_op_list, id);
2455 }
2456
2457 if ((op == NULL) && pcmk__result_ok(&result)) {
2458
2459 const char *client_id = crm_element_value(dev, PCMK__XA_ST_CLIENTID);
2460
2461 op = create_remote_stonith_op(client_id, dev, TRUE);
2462 }
2463
2464 if (op == NULL) {
2465
2466
2467 crm_info("Received peer result of unknown or expired operation %s", id);
2468 pcmk__reset_result(&result);
2469 return;
2470 }
2471
2472 pcmk__reset_result(&op->result);
2473 op->result = result;
2474
2475 if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
2476 crm_err("Received outdated reply for device %s (instead of %s) to "
2477 "fence (%s) %s. Operation already timed out at peer level.",
2478 device, (const char *) op->devices->data, op->action, op->target);
2479 return;
2480 }
2481
2482 if (pcmk__str_eq(crm_element_value(msg, PCMK__XA_SUBT),
2483 PCMK__VALUE_BROADCAST, pcmk__str_none)) {
2484
2485 if (pcmk__result_ok(&op->result)) {
2486 op->state = st_done;
2487 } else {
2488 op->state = st_failed;
2489 }
2490 finalize_op(op, msg, false);
2491 return;
2492
2493 } else if (!pcmk__str_eq(op->originator, fenced_get_local_node(),
2494 pcmk__str_casei)) {
2495
2496
2497 crm_err("Received non-broadcast fencing result for operation %.8s "
2498 "we do not own (device %s targeting %s)",
2499 op->id, device, op->target);
2500 return;
2501 }
2502
2503 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2504 const char *device = NULL;
2505 const char *reason = op->result.exit_reason;
2506
2507
2508
2509 if (op->state == st_done) {
2510 finalize_op(op, msg, false);
2511 return;
2512 }
2513
2514 device = crm_element_value(msg, PCMK__XA_ST_DEVICE_ID);
2515
2516 if ((op->phase == 2) && !pcmk__result_ok(&op->result)) {
2517
2518
2519
2520 crm_warn("Ignoring %s 'on' failure (%s%s%s) targeting %s "
2521 "after successful 'off'",
2522 device, pcmk_exec_status_str(op->result.execution_status),
2523 (reason == NULL)? "" : ": ",
2524 (reason == NULL)? "" : reason,
2525 op->target);
2526 pcmk__set_result(&op->result, CRM_EX_OK, PCMK_EXEC_DONE, NULL);
2527 } else {
2528 crm_notice("Action '%s' targeting %s%s%s on behalf of %s@%s: "
2529 "%s%s%s%s",
2530 op->action, op->target,
2531 ((device == NULL)? "" : " using "),
2532 ((device == NULL)? "" : device),
2533 op->client_name,
2534 op->originator,
2535 pcmk_exec_status_str(op->result.execution_status),
2536 (reason == NULL)? "" : " (",
2537 (reason == NULL)? "" : reason,
2538 (reason == NULL)? "" : ")");
2539 }
2540
2541 if (pcmk__result_ok(&op->result)) {
2542
2543
2544 advance_topology_device_in_level(op, device, msg);
2545 return;
2546 } else {
2547
2548
2549 if (advance_topology_level(op, false) != pcmk_rc_ok) {
2550 op->state = st_failed;
2551 finalize_op(op, msg, false);
2552 return;
2553 }
2554 }
2555
2556 } else if (pcmk__result_ok(&op->result) && (op->devices == NULL)) {
2557 op->state = st_done;
2558 finalize_op(op, msg, false);
2559 return;
2560
2561 } else if ((op->result.execution_status == PCMK_EXEC_TIMEOUT)
2562 && (op->devices == NULL)) {
2563
2564 op->state = st_failed;
2565 finalize_op(op, msg, false);
2566 return;
2567
2568 } else {
2569
2570 }
2571
2572
2573 crm_trace("Next for %s on behalf of %s@%s (result was: %s)",
2574 op->target, op->originator, op->client_name,
2575 pcmk_exec_status_str(op->result.execution_status));
2576 request_peer_fencing(op, NULL);
2577 }
2578
2579 gboolean
2580 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
2581 {
2582 GHashTableIter iter;
2583 time_t now = time(NULL);
2584 remote_fencing_op_t *rop = NULL;
2585
2586 if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
2587 action == NULL) {
2588 return FALSE;
2589 }
2590
2591 g_hash_table_iter_init(&iter, stonith_remote_op_list);
2592 while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2593 if (strcmp(rop->target, target) != 0) {
2594 continue;
2595 } else if (rop->state != st_done) {
2596 continue;
2597
2598
2599
2600 } else if (strcmp(rop->action, action) != 0) {
2601 continue;
2602 } else if ((rop->completed + tolerance) < now) {
2603 continue;
2604 }
2605
2606 crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2607 target, action, tolerance, rop->delegate, rop->originator);
2608 return TRUE;
2609 }
2610 return FALSE;
2611 }