This source file includes the following definitions.
- sort_strings
- free_remote_query
- count_peer_device
- count_peer_devices
- find_peer_device
- grab_peer_device
- clear_remote_op_timers
- free_remote_op
- op_requested_action
- op_phase_off
- op_phase_on
- undo_op_remap
- create_op_done_notify
- bcast_result_to_peers
- handle_local_reply_and_notify
- handle_duplicates
- remote_op_done
- remote_op_watchdog_done
- remote_op_timeout_one
- remote_op_timeout
- remote_op_query_timeout
- topology_is_empty
- add_required_device
- remove_required_device
- set_op_device_list
- topology_matches
- find_topology_for_host
- stonith_topology_next
- merge_duplicates
- fencing_active_peers
- stonith_manual_ack
- stonith_get_peer_name
- create_remote_stonith_op
- initiate_remote_stonith_op
- find_best_peer
- stonith_choose_peer
- get_device_timeout
- add_device_timeout
- get_peer_timeout
- get_op_total_timeout
- report_timeout_period
- advance_op_topology
- call_remote_stonith
- sort_peers
- all_topology_devices_found
- parse_action_specific
- add_device_properties
- add_result
- process_remote_stonith_query
- process_remote_stonith_exec
- stonith_fence_history
- stonith_check_fence_tolerance
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19 #include <crm_internal.h>
20
21 #include <sys/param.h>
22 #include <stdio.h>
23 #include <sys/types.h>
24 #include <sys/wait.h>
25 #include <sys/stat.h>
26 #include <unistd.h>
27 #include <sys/utsname.h>
28
29 #include <stdlib.h>
30 #include <errno.h>
31 #include <fcntl.h>
32 #include <ctype.h>
33 #include <regex.h>
34
35 #include <crm/crm.h>
36 #include <crm/msg_xml.h>
37 #include <crm/common/ipc.h>
38 #include <crm/common/ipcs.h>
39 #include <crm/cluster/internal.h>
40
41 #include <crm/stonith-ng.h>
42 #include <crm/fencing/internal.h>
43 #include <crm/common/xml.h>
44
45 #include <crm/common/util.h>
46 #include <internal.h>
47
/* Safety margin multiplied into timeouts in this file: used when guessing the
 * total timeout of a merged operation and when arming the query timer. */
#define TIMEOUT_MULTIPLY_FACTOR 1.2
49
50
51
52
53
54
55
/*
 * Bookkeeping for one fence device, stored as a value in a peer's
 * st_query_result_t.devices table. The array members are indexed by the
 * operation's phase (op->phase) wherever this file reads or writes them.
 */
typedef struct device_properties_s {
    /* Consulted when a caller asks for verified devices only
     * (count_peer_device(), grab_peer_device()) */
    gboolean verified;

    /* Set to TRUE for a phase by grab_peer_device() when the device is
     * claimed for that phase; such devices are no longer counted or found */
    gboolean executed[st_phase_max];
    /* When set for a phase, find_peer_device() refuses to return the device */
    gboolean disallowed[st_phase_max];
    /* Per-phase action timeout; a value of 0 makes get_device_timeout()
     * fall back to the operation's base timeout */
    int custom_action_timeout[st_phase_max];
    /* Per-phase amount added to the device timeout by get_device_timeout() */
    int delay_max[st_phase_max];
    /* Not read in the visible part of this file; presumably the base delay
     * configured for the device -- TODO confirm */
    int delay_base[st_phase_max];
} device_properties_t;
73
/*
 * One entry of a fencing operation's query_results list: what a single host
 * reported in reply to the device query. Released by free_remote_query().
 */
typedef struct st_query_result_s {
    /* Name of the replying host (heap-allocated; freed with the entry) */
    char *host;
    /* Checked by find_best_peer() so a peer is not picked again for simple
     * (non-topology) fencing once it has been tried */
    gboolean tried;
    /* Device count for this host; only logged in the visible code */
    int ndevices;
    /* Table looked up by device name, yielding device_properties_t values */
    GHashTable *devices;
} st_query_result_t;
84
/* All known remote fencing operations, keyed by operation ID. Created lazily
 * in create_remote_stonith_op() with free_remote_op() as value destructor. */
GHashTable *remote_op_list = NULL;
/* Forward declarations for functions used before their definition */
void call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer);
static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup);
/* Declared extern here; not defined in the visible part of this file */
extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
                                  int call_options);

static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
static int get_op_total_timeout(const remote_fencing_op_t *op,
                                const st_query_result_t *chosen_peer);
94
95 static gint
96 sort_strings(gconstpointer a, gconstpointer b)
97 {
98 return strcmp(a, b);
99 }
100
101 static void
102 free_remote_query(gpointer data)
103 {
104 if (data) {
105 st_query_result_t *query = data;
106
107 crm_trace("Free'ing query result from %s", query->host);
108 g_hash_table_destroy(query->devices);
109 free(query->host);
110 free(query);
111 }
112 }
113
/* Accumulator passed as user data to count_peer_device() */
struct peer_count_data {
    /* Operation whose current phase selects the "executed" slot to test */
    const remote_fencing_op_t *op;
    /* When TRUE, only devices flagged as verified are counted */
    gboolean verified_only;
    /* Running total of matching devices */
    int count;
};
119
120
121
122
123
124
125
126
127
128 static void
129 count_peer_device(gpointer key, gpointer value, gpointer user_data)
130 {
131 device_properties_t *props = (device_properties_t*)value;
132 struct peer_count_data *data = user_data;
133
134 if (!props->executed[data->op->phase]
135 && (!data->verified_only || props->verified)) {
136 ++(data->count);
137 }
138 }
139
140
141
142
143
144
145
146
147
148
149
150 static int
151 count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer,
152 gboolean verified_only)
153 {
154 struct peer_count_data data;
155
156 data.op = op;
157 data.verified_only = verified_only;
158 data.count = 0;
159 if (peer) {
160 g_hash_table_foreach(peer->devices, count_peer_device, &data);
161 }
162 return data.count;
163 }
164
165
166
167
168
169
170
171
172
173
174
175 static device_properties_t *
176 find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer,
177 const char *device)
178 {
179 device_properties_t *props = g_hash_table_lookup(peer->devices, device);
180
181 return (props && !props->executed[op->phase]
182 && !props->disallowed[op->phase])? props : NULL;
183 }
184
185
186
187
188
189
190
191
192
193
194
195
196 static gboolean
197 grab_peer_device(const remote_fencing_op_t *op, st_query_result_t *peer,
198 const char *device, gboolean verified_devices_only)
199 {
200 device_properties_t *props = find_peer_device(op, peer, device);
201
202 if ((props == NULL) || (verified_devices_only && !props->verified)) {
203 return FALSE;
204 }
205
206 crm_trace("Removing %s from %s (%d remaining)",
207 device, peer->host, count_peer_devices(op, peer, FALSE));
208 props->executed[op->phase] = TRUE;
209 return TRUE;
210 }
211
212 static void
213 clear_remote_op_timers(remote_fencing_op_t * op)
214 {
215 if (op->query_timer) {
216 g_source_remove(op->query_timer);
217 op->query_timer = 0;
218 }
219 if (op->op_timer_total) {
220 g_source_remove(op->op_timer_total);
221 op->op_timer_total = 0;
222 }
223 if (op->op_timer_one) {
224 g_source_remove(op->op_timer_one);
225 op->op_timer_one = 0;
226 }
227 }
228
229 static void
230 free_remote_op(gpointer data)
231 {
232 remote_fencing_op_t *op = data;
233
234 crm_trace("Free'ing op %s for %s", op->id, op->target);
235 crm_log_xml_debug(op->request, "Destroying");
236
237 clear_remote_op_timers(op);
238
239 free(op->id);
240 free(op->action);
241 free(op->target);
242 free(op->client_id);
243 free(op->client_name);
244 free(op->originator);
245
246 if (op->query_results) {
247 g_list_free_full(op->query_results, free_remote_query);
248 }
249 if (op->request) {
250 free_xml(op->request);
251 op->request = NULL;
252 }
253 if (op->devices_list) {
254 g_list_free_full(op->devices_list, free);
255 op->devices_list = NULL;
256 }
257 g_list_free_full(op->automatic_list, free);
258 free(op);
259 }
260
261
262
263
264
265
266
267
268
269 static const char *
270 op_requested_action(const remote_fencing_op_t *op)
271 {
272 return ((op->phase > st_phase_requested)? "reboot" : op->action);
273 }
274
275
276
277
278
279
280
281 static void
282 op_phase_off(remote_fencing_op_t *op)
283 {
284 crm_info("Remapping multiple-device reboot of %s (%s) to off",
285 op->target, op->id);
286 op->phase = st_phase_off;
287
288
289
290
291 strcpy(op->action, "off");
292 }
293
294
295
296
297
298
299
300 static void
301 op_phase_on(remote_fencing_op_t *op)
302 {
303 GListPtr iter = NULL;
304
305 crm_info("Remapped off of %s complete, remapping to on for %s.%.8s",
306 op->target, op->client_name, op->id);
307 op->phase = st_phase_on;
308 strcpy(op->action, "on");
309
310
311
312
313 for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
314 GListPtr match = g_list_find_custom(op->devices_list, iter->data,
315 sort_strings);
316
317 if (match) {
318 op->devices_list = g_list_remove(op->devices_list, match->data);
319 }
320 }
321 g_list_free_full(op->automatic_list, free);
322 op->automatic_list = NULL;
323
324
325 op->devices = op->devices_list;
326 }
327
328
329
330
331
332
333
334 static void
335 undo_op_remap(remote_fencing_op_t *op)
336 {
337 if (op->phase > 0) {
338 crm_info("Undoing remap of reboot of %s for %s.%.8s",
339 op->target, op->client_name, op->id);
340 op->phase = st_phase_requested;
341 strcpy(op->action, "reboot");
342 }
343 }
344
345 static xmlNode *
346 create_op_done_notify(remote_fencing_op_t * op, int rc)
347 {
348 xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
349
350 crm_xml_add_int(notify_data, "state", op->state);
351 crm_xml_add_int(notify_data, F_STONITH_RC, rc);
352 crm_xml_add(notify_data, F_STONITH_TARGET, op->target);
353 crm_xml_add(notify_data, F_STONITH_ACTION, op->action);
354 crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate);
355 crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id);
356 crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator);
357 crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id);
358 crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name);
359
360 return notify_data;
361 }
362
363 static void
364 bcast_result_to_peers(remote_fencing_op_t * op, int rc)
365 {
366 static int count = 0;
367 xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY);
368 xmlNode *notify_data = create_op_done_notify(op, rc);
369
370 count++;
371 crm_trace("Broadcasting result to peers");
372 crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY);
373 crm_xml_add(bcast, F_SUBTYPE, "broadcast");
374 crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY);
375 crm_xml_add_int(bcast, "count", count);
376 add_message_xml(bcast, F_STONITH_CALLDATA, notify_data);
377 send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE);
378 free_xml(notify_data);
379 free_xml(bcast);
380
381 return;
382 }
383
384 static void
385 handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc)
386 {
387 xmlNode *notify_data = NULL;
388 xmlNode *reply = NULL;
389
390 if (op->notify_sent == TRUE) {
391
392 return;
393 }
394
395
396 notify_data = create_op_done_notify(op, rc);
397 crm_xml_add_int(data, "state", op->state);
398 crm_xml_add(data, F_STONITH_TARGET, op->target);
399 crm_xml_add(data, F_STONITH_OPERATION, op->action);
400
401 reply = stonith_construct_reply(op->request, NULL, data, rc);
402 crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate);
403
404
405 do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE);
406
407
408 do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data);
409
410
411 op->notify_sent = TRUE;
412 free_xml(reply);
413 free_xml(notify_data);
414 }
415
416 static void
417 handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc)
418 {
419 GListPtr iter = NULL;
420
421 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
422 remote_fencing_op_t *other = iter->data;
423
424 if (other->state == st_duplicate) {
425
426 other->state = op->state;
427 crm_debug("Peforming duplicate notification for %s@%s.%.8s = %s", other->client_name,
428 other->originator, other->id, pcmk_strerror(rc));
429 remote_op_done(other, data, rc, TRUE);
430
431 } else {
432 crm_err("Skipping duplicate notification for %s@%s - %d", other->client_name,
433 other->originator, other->state);
434 }
435 }
436 }
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
/*
 * Finalize a remote fencing operation.
 *
 * Records the completion time, cancels the operation's timers and undoes any
 * reboot remap. Unless notifications were already sent, the result is either
 * broadcast to the cluster (first pass) or delivered locally (reply to the
 * client, fence notification, completion of merged duplicates).
 *
 * \param[in,out] op    Operation being finalized
 * \param[in,out] data  Result XML; may be NULL, in which case a temporary
 *                      "remote-op" element is created and freed here
 * \param[in]     rc    Result code of the operation
 * \param[in]     dup   Non-zero when called for a merged duplicate
 */
static void
remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup)
{
    int level = LOG_ERR;
    const char *subt = NULL;
    xmlNode *local_data = NULL;

    op->completed = time(NULL);
    clear_remote_op_timers(op);
    undo_op_remap(op);

    if (op->notify_sent == TRUE) {
        crm_err("Already sent notifications for '%s of %s by %s' (for=%s@%s.%.8s, state=%d): %s",
                op->action, op->target, op->delegate ? op->delegate : "<no-one>",
                op->client_name, op->originator, op->id, op->state, pcmk_strerror(rc));
        goto remote_op_done_cleanup;
    }

    /* Work out who carried out the action, unless already known or the
     * result says no device/host was reachable */
    if (!op->delegate && data && rc != -ENODEV && rc != -EHOSTUNREACH) {
        xmlNode *ndata = get_xpath_object("//@" F_STONITH_DELEGATE, data, LOG_TRACE);
        if(ndata) {
            op->delegate = crm_element_value_copy(ndata, F_STONITH_DELEGATE);
        } else {
            /* No explicit delegate in the result: fall back to F_ORIG */
            op->delegate = crm_element_value_copy(data, F_ORIG);
        }
    }

    if (data == NULL) {
        /* Placeholder so the code below always has XML to work with;
         * tracked in local_data so it is freed on exit */
        data = create_xml_node(NULL, "remote-op");
        local_data = data;
    }

    /* A result that is neither a duplicate nor already a "broadcast" is only
     * announced to the cluster here; local delivery is skipped on this pass.
     * NOTE(review): presumably local delivery happens when the broadcast is
     * received back -- confirm against the message handler. */
    subt = crm_element_value(data, F_SUBTYPE);
    if (dup == FALSE && safe_str_neq(subt, "broadcast")) {
        /* Defer notification until the bcast message arrives */
        bcast_result_to_peers(op, rc);
        goto remote_op_done_cleanup;
    }

    /* Failures are logged as errors only when this node originated the
     * request and the operation is not a duplicate */
    if (rc == pcmk_ok || dup) {
        level = LOG_NOTICE;
    } else if (safe_str_neq(op->originator, stonith_our_uname)) {
        level = LOG_NOTICE;
    }

    do_crm_log(level,
               "Operation %s of %s by %s for %s@%s.%.8s: %s",
               op->action, op->target, op->delegate ? op->delegate : "<no-one>",
               op->client_name, op->originator, op->id, pcmk_strerror(rc));

    handle_local_reply_and_notify(op, data, rc);

    if (dup == FALSE) {
        handle_duplicates(op, data, rc);
    }

    /* The operation stays in remote_op_list, but its query results and
     * request copy are released here */
    if (op->query_results) {
        g_list_free_full(op->query_results, free_remote_query);
        op->query_results = NULL;
    }

    if (op->request) {
        free_xml(op->request);
        op->request = NULL;
    }

  remote_op_done_cleanup:
    free_xml(local_data);
}
538
539 static gboolean
540 remote_op_watchdog_done(gpointer userdata)
541 {
542 remote_fencing_op_t *op = userdata;
543
544 op->op_timer_one = 0;
545
546 crm_notice("Self-fencing (%s) by %s for %s.%8s assumed complete",
547 op->action, op->target, op->client_name, op->id);
548 op->state = st_done;
549 remote_op_done(op, NULL, pcmk_ok, FALSE);
550 return FALSE;
551 }
552
553 static gboolean
554 remote_op_timeout_one(gpointer userdata)
555 {
556 remote_fencing_op_t *op = userdata;
557
558 op->op_timer_one = 0;
559
560 crm_notice("Peer's fencing (%s) of %s for %s timed out" CRM_XS "id=%s",
561 op->action, op->target, op->client_name, op->id);
562 call_remote_stonith(op, NULL);
563 return FALSE;
564 }
565
566 static gboolean
567 remote_op_timeout(gpointer userdata)
568 {
569 remote_fencing_op_t *op = userdata;
570
571 op->op_timer_total = 0;
572
573 if (op->state == st_done) {
574 crm_debug("Action %s (%s) for %s (%s) already completed",
575 op->action, op->id, op->target, op->client_name);
576 return FALSE;
577 }
578
579 crm_debug("Action %s (%s) for %s (%s) timed out",
580 op->action, op->id, op->target, op->client_name);
581
582 if (op->phase == st_phase_on) {
583
584
585
586
587 remote_op_done(op, NULL, pcmk_ok, FALSE);
588 return FALSE;
589 }
590
591 op->state = st_failed;
592
593 remote_op_done(op, NULL, -ETIME, FALSE);
594
595 return FALSE;
596 }
597
598 static gboolean
599 remote_op_query_timeout(gpointer data)
600 {
601 remote_fencing_op_t *op = data;
602
603 op->query_timer = 0;
604 if (op->state == st_done) {
605 crm_debug("Operation %s for %s already completed", op->id, op->target);
606 } else if (op->state == st_exec) {
607 crm_debug("Operation %s for %s already in progress", op->id, op->target);
608 } else if (op->query_results) {
609 crm_debug("Query %s for %s complete: %d", op->id, op->target, op->state);
610 call_remote_stonith(op, NULL);
611 } else {
612 crm_debug("Query %s for %s timed out: %d", op->id, op->target, op->state);
613 if (op->op_timer_total) {
614 g_source_remove(op->op_timer_total);
615 op->op_timer_total = 0;
616 }
617 remote_op_timeout(op);
618 }
619
620 return FALSE;
621 }
622
623 static gboolean
624 topology_is_empty(stonith_topology_t *tp)
625 {
626 int i;
627
628 if (tp == NULL) {
629 return TRUE;
630 }
631
632 for (i = 0; i < ST_LEVEL_MAX; i++) {
633 if (tp->levels[i] != NULL) {
634 return FALSE;
635 }
636 }
637 return TRUE;
638 }
639
640
641
642
643
644
645
646
647 static void
648 add_required_device(remote_fencing_op_t *op, const char *device)
649 {
650 GListPtr match = g_list_find_custom(op->automatic_list, device,
651 sort_strings);
652
653 if (!match) {
654 op->automatic_list = g_list_prepend(op->automatic_list, strdup(device));
655 }
656 }
657
658
659
660
661
662
663
664
665 static void
666 remove_required_device(remote_fencing_op_t *op, const char *device)
667 {
668 GListPtr match = g_list_find_custom(op->automatic_list, device,
669 sort_strings);
670
671 if (match) {
672 op->automatic_list = g_list_remove(op->automatic_list, match->data);
673 }
674 }
675
676
677 static void
678 set_op_device_list(remote_fencing_op_t * op, GListPtr devices)
679 {
680 GListPtr lpc = NULL;
681
682 if (op->devices_list) {
683 g_list_free_full(op->devices_list, free);
684 op->devices_list = NULL;
685 }
686 for (lpc = devices; lpc != NULL; lpc = lpc->next) {
687 op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
688 }
689 op->devices = op->devices_list;
690 }
691
692
693
694
695
696
697
698
699
700
701 static gboolean
702 topology_matches(const stonith_topology_t *tp, const char *node)
703 {
704 regex_t r_patt;
705
706 CRM_CHECK(node && tp && tp->target, return FALSE);
707 switch(tp->kind) {
708 case 2:
709
710
711
712
713
714
715 if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
716 crm_notice("Matched %s with %s by attribute", node, tp->target);
717 return TRUE;
718 }
719 break;
720 case 1:
721
722
723
724
725 if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
726 crm_info("Bad regex '%s' for fencing level", tp->target);
727 } else {
728 int status = regexec(&r_patt, node, 0, NULL, 0);
729
730 regfree(&r_patt);
731 if (status == 0) {
732 crm_notice("Matched %s with %s by name", node, tp->target);
733 return TRUE;
734 }
735 }
736 break;
737 case 0:
738 crm_trace("Testing %s against %s", node, tp->target);
739 return safe_str_eq(tp->target, node);
740 }
741 crm_trace("No match for %s with %s", node, tp->target);
742 return FALSE;
743 }
744
745 stonith_topology_t *
746 find_topology_for_host(const char *host)
747 {
748 GHashTableIter tIter;
749 stonith_topology_t *tp = g_hash_table_lookup(topology, host);
750
751 if(tp != NULL) {
752 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
753 return tp;
754 }
755
756 g_hash_table_iter_init(&tIter, topology);
757 while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
758 if (topology_matches(tp, host)) {
759 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
760 return tp;
761 }
762 }
763
764 crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
765 return NULL;
766 }
767
768
769
770
771
772
773
774
775
776
777 static int
778 stonith_topology_next(remote_fencing_op_t * op)
779 {
780 stonith_topology_t *tp = NULL;
781
782 if (op->target) {
783
784 tp = find_topology_for_host(op->target);
785 }
786 if (topology_is_empty(tp)) {
787 return pcmk_ok;
788 }
789
790 set_bit(op->call_options, st_opt_topology);
791
792
793 undo_op_remap(op);
794
795 do {
796 op->level++;
797
798 } while (op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL);
799
800 if (op->level < ST_LEVEL_MAX) {
801 crm_trace("Attempting fencing level %d for %s (%d devices) - %s@%s.%.8s",
802 op->level, op->target, g_list_length(tp->levels[op->level]),
803 op->client_name, op->originator, op->id);
804 set_op_device_list(op, tp->levels[op->level]);
805
806 if (g_list_next(op->devices_list) && safe_str_eq(op->action, "reboot")) {
807
808
809
810
811
812 op_phase_off(op);
813 }
814 return pcmk_ok;
815 }
816
817 crm_notice("All fencing options to fence %s for %s@%s.%.8s failed",
818 op->target, op->client_name, op->originator, op->id);
819 return -EINVAL;
820 }
821
822
823
824
825
826
827 static void
828 merge_duplicates(remote_fencing_op_t * op)
829 {
830 GHashTableIter iter;
831 remote_fencing_op_t *other = NULL;
832
833 time_t now = time(NULL);
834
835 g_hash_table_iter_init(&iter, remote_op_list);
836 while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
837 crm_node_t *peer = NULL;
838 const char *other_action = op_requested_action(other);
839
840 if (other->state > st_exec) {
841
842 continue;
843 } else if (safe_str_neq(op->target, other->target)) {
844
845 continue;
846 } else if (safe_str_neq(op->action, other_action)) {
847 crm_trace("Must be for the same action: %s vs. %s",
848 op->action, other_action);
849 continue;
850 } else if (safe_str_eq(op->client_name, other->client_name)) {
851 crm_trace("Must be for different clients: %s", op->client_name);
852 continue;
853 } else if (safe_str_eq(other->target, other->originator)) {
854 crm_trace("Can't be a suicide operation: %s", other->target);
855 continue;
856 }
857
858 peer = crm_get_peer(0, other->originator);
859 if(fencing_peer_active(peer) == FALSE) {
860 crm_notice("Failing stonith action %s for node %s originating from %s@%s.%.8s: Originator is dead",
861 other->action, other->target, other->client_name, other->originator, other->id);
862 other->state = st_failed;
863 continue;
864
865 } else if(other->total_timeout > 0 && now > (other->total_timeout + other->created)) {
866 crm_info("Stonith action %s for node %s originating from %s@%s.%.8s is too old: %d vs. %d + %d",
867 other->action, other->target, other->client_name, other->originator, other->id,
868 now, other->created, other->total_timeout);
869 continue;
870 }
871
872
873
874
875 other->duplicates = g_list_append(other->duplicates, op);
876 if (other->total_timeout == 0) {
877 crm_trace("Making a best-guess as to the timeout used");
878 other->total_timeout = op->total_timeout =
879 TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
880 }
881 crm_notice
882 ("Merging stonith action %s for node %s originating from client %s.%.8s with identical request from %s@%s.%.8s (%ds)",
883 op->action, op->target, op->client_name, op->id, other->client_name, other->originator,
884 other->id, other->total_timeout);
885 report_timeout_period(op, other->total_timeout);
886 op->state = st_duplicate;
887 }
888 }
889
890 static uint32_t fencing_active_peers(void)
891 {
892 uint32_t count = 0;
893 crm_node_t *entry;
894 GHashTableIter gIter;
895
896 g_hash_table_iter_init(&gIter, crm_peer_cache);
897 while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
898 if(fencing_peer_active(entry)) {
899 count++;
900 }
901 }
902 return count;
903 }
904
905 int
906 stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op)
907 {
908 xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
909
910 op->state = st_done;
911 op->completed = time(NULL);
912 op->delegate = strdup("a human");
913
914 crm_notice("Injecting manual confirmation that %s is safely off/down",
915 crm_element_value(dev, F_STONITH_TARGET));
916
917 remote_op_done(op, msg, pcmk_ok, FALSE);
918
919
920 return -EINPROGRESS;
921 }
922
923 char *
924 stonith_get_peer_name(unsigned int nodeid)
925 {
926 crm_node_t *node = crm_find_peer(nodeid, NULL);
927 char *nodename = NULL;
928
929 if (node && node->uname) {
930 return strdup(node->uname);
931
932 } else if ((nodename = get_node_name(nodeid))) {
933 return nodename;
934
935 } else {
936 const char *last_known_name = g_hash_table_lookup(known_peer_names, GUINT_TO_POINTER(nodeid));
937
938 if (last_known_name) {
939 crm_debug("Use the last known name %s for nodeid %u", last_known_name, nodeid);
940 return strdup(last_known_name);
941 }
942 }
943
944 return NULL;
945 }
946
947
948
949
950
951
952
953
954
955
956
957
/*
 * Create a remote fencing operation from a request and register it in
 * remote_op_list (which is created on first use).
 *
 * For a request relayed by a peer that carries a target, the operation ID in
 * the request is reused, and an operation already registered under that ID is
 * returned as-is. Otherwise a fresh ID is generated.
 *
 * \param[in] client   ID of the requesting client (copied if non-NULL)
 * \param[in] request  Request XML; a copy is stored in the operation
 * \param[in] peer     TRUE if the request was relayed by another node
 *
 * \return The new or existing remote_fencing_op_t, or NULL when a peer
 *         request with a target has no operation ID
 */
void *
create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer)
{
    remote_fencing_op_t *op = NULL;
    xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_TRACE);
    int call_options = 0;

    if (remote_op_list == NULL) {
        /* Keys are op->id and are released together with the value by
         * free_remote_op(), hence no separate key destructor */
        remote_op_list = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, free_remote_op);
    }

    /* A peer request names an operation that may already be known here */
    if (peer && dev) {
        const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);

        CRM_CHECK(op_id != NULL, return NULL);

        op = g_hash_table_lookup(remote_op_list, op_id);
        if (op) {
            crm_debug("%s already exists", op_id);
            return op;
        }
    }

    /* NOTE(review): the calloc() result is used without a NULL check */
    op = calloc(1, sizeof(remote_fencing_op_t));

    crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout));

    if (peer && dev) {
        op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID);
    } else {
        op->id = crm_generate_uuid();
    }

    /* Register first; the remaining fields are filled in afterwards */
    g_hash_table_replace(remote_op_list, op->id, op);
    CRM_LOG_ASSERT(g_hash_table_lookup(remote_op_list, op->id) != NULL);
    crm_trace("Created %s", op->id);

    op->state = st_query;
    op->replies_expected = fencing_active_peers();
    op->action = crm_element_value_copy(dev, F_STONITH_ACTION);
    op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN);
    op->delegate = crm_element_value_copy(dev, F_STONITH_DELEGATE);
    op->created = time(NULL);

    if (op->originator == NULL) {
        /* No origin recorded in the request: this node is the originator */
        op->originator = strdup(stonith_our_uname);
    }

    CRM_LOG_ASSERT(client != NULL);
    if (client) {
        op->client_id = strdup(client);
    }

    op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME);

    op->target = crm_element_value_copy(dev, F_STONITH_TARGET);
    op->request = copy_xml(request);
    crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
    op->call_options = call_options;

    crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid));

    crm_trace("%s new stonith op: %s - %s of %s for %s",
              (peer
               && dev) ? "Recorded" : "Generated", op->id, op->action, op->target, op->client_name);

    if (op->call_options & st_opt_cs_nodeid) {
        /* The target was given as a node ID: translate it into a name */
        int nodeid = crm_atoi(op->target, NULL);
        char *nodename = stonith_get_peer_name(nodeid);

        /* Clear the flag whether or not the translation succeeded */
        op->call_options &= ~st_opt_cs_nodeid;

        if (nodename) {
            free(op->target);
            op->target = nodename;

        } else {
            crm_warn("Could not expand nodeid '%s' into a host name", op->target);
        }
    }

    /* May switch the new operation to st_duplicate if an equivalent one is
     * already running */
    merge_duplicates(op);

    return op;
}
1048
1049 remote_fencing_op_t *
1050 initiate_remote_stonith_op(crm_client_t * client, xmlNode * request, gboolean manual_ack)
1051 {
1052 int query_timeout = 0;
1053 xmlNode *query = NULL;
1054 const char *client_id = NULL;
1055 remote_fencing_op_t *op = NULL;
1056
1057 if (client) {
1058 client_id = client->id;
1059 } else {
1060 client_id = crm_element_value(request, F_STONITH_CLIENTID);
1061 }
1062
1063 CRM_LOG_ASSERT(client_id != NULL);
1064 op = create_remote_stonith_op(client_id, request, FALSE);
1065 op->owner = TRUE;
1066 if (manual_ack) {
1067 crm_notice("Initiating manual confirmation for %s: %s",
1068 op->target, op->id);
1069 return op;
1070 }
1071
1072 CRM_CHECK(op->action, return NULL);
1073
1074 if (stonith_topology_next(op) != pcmk_ok) {
1075 op->state = st_failed;
1076 }
1077
1078 switch (op->state) {
1079 case st_failed:
1080 crm_warn("Could not request peer fencing (%s) of %s "
1081 CRM_XS " id=%s", op->action, op->target, op->id);
1082 remote_op_done(op, NULL, -EINVAL, FALSE);
1083 return op;
1084
1085 case st_duplicate:
1086 crm_info("Requesting peer fencing (%s) of %s (duplicate) "
1087 CRM_XS " id=%s", op->action, op->target, op->id);
1088 return op;
1089
1090 default:
1091 crm_notice("Requesting peer fencing (%s) of %s "
1092 CRM_XS " id=%s state=%d",
1093 op->action, op->target, op->id, op->state);
1094 }
1095
1096 query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
1097 NULL, op->call_options);
1098
1099 crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
1100 crm_xml_add(query, F_STONITH_TARGET, op->target);
1101 crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
1102 crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
1103 crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
1104 crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
1105 crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);
1106
1107 send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
1108 free_xml(query);
1109
1110 query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
1111 op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);
1112
1113 return op;
1114 }
1115
/* Bit flags steering the peer search in find_best_peer() */
enum find_best_peer_options {
    /* Skip query results whose host is the fencing target itself */
    FIND_PEER_SKIP_TARGET = 0x0001,
    /* Consider only the query result whose host is the fencing target */
    FIND_PEER_TARGET_ONLY = 0x0002,
    /* Only accept devices that are flagged as verified */
    FIND_PEER_VERIFIED_ONLY = 0x0004,
};
1124
1125 static st_query_result_t *
1126 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
1127 {
1128 GListPtr iter = NULL;
1129 gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1130
1131 if (!device && is_set(op->call_options, st_opt_topology)) {
1132 return NULL;
1133 }
1134
1135 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1136 st_query_result_t *peer = iter->data;
1137
1138 crm_trace("Testing result from %s for %s with %d devices: %d %x",
1139 peer->host, op->target, peer->ndevices, peer->tried, options);
1140 if ((options & FIND_PEER_SKIP_TARGET) && safe_str_eq(peer->host, op->target)) {
1141 continue;
1142 }
1143 if ((options & FIND_PEER_TARGET_ONLY) && safe_str_neq(peer->host, op->target)) {
1144 continue;
1145 }
1146
1147 if (is_set(op->call_options, st_opt_topology)) {
1148
1149 if (grab_peer_device(op, peer, device, verified_devices_only)) {
1150 return peer;
1151 }
1152
1153 } else if ((peer->tried == FALSE)
1154 && count_peer_devices(op, peer, verified_devices_only)) {
1155
1156
1157 crm_trace("Simple fencing");
1158 return peer;
1159 }
1160 }
1161
1162 return NULL;
1163 }
1164
/*
 * Choose the peer that should carry out the next fencing action.
 *
 * Preference order for the current device: a peer other than the target with
 * a verified device, then any peer other than the target, then (except in the
 * "on" phase) the target itself. If nobody qualifies and topology is in use,
 * the next topology level is tried, again except in the "on" phase.
 *
 * \param[in,out] op  Operation needing a peer
 *
 * \return The chosen peer, or NULL if none was found or more query replies
 *         should be awaited first
 */
static st_query_result_t *
stonith_choose_peer(remote_fencing_op_t * op)
{
    const char *device = NULL;
    st_query_result_t *peer = NULL;
    uint32_t active = fencing_active_peers();

    do {
        if (op->devices) {
            device = op->devices->data;
            crm_trace("Checking for someone to fence (%s) %s with %s",
                      op->action, op->target, device);
        } else {
            crm_trace("Checking for someone to fence (%s) %s",
                      op->action, op->target);
        }

        /* Best choice is a peer other than the target with verified access */
        peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
        if (peer) {
            crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
            return peer;
        }

        /* While the query timer is still running and replies are
         * outstanding, wait rather than settle for an unverified device */
        if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
            crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
            return NULL;
        }

        /* Next best: a peer other than the target, verified or not */
        peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
        if (peer) {
            crm_trace("Found best unverified peer %s", peer->host);
            return peer;
        }

        /* Last resort: let the target fence itself, but never in the "on"
         * phase of a remapped reboot */
        if (op->phase != st_phase_on) {
            peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
            if (peer) {
                crm_trace("%s will fence itself", peer->host);
                return peer;
            }
        }

        /* Nobody can run this device: move to the next topology level. The
         * loop condition itself advances the level via
         * stonith_topology_next(). */
    } while ((op->phase != st_phase_on)
             && is_set(op->call_options, st_opt_topology)
             && stonith_topology_next(op) == pcmk_ok);

    crm_notice("Couldn't find anyone to fence (%s) %s with %s",
               op->action, op->target, (device? device : "any device"));
    return NULL;
}
1223
1224 static int
1225 get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer,
1226 const char *device)
1227 {
1228 device_properties_t *props;
1229
1230 if (!peer || !device) {
1231 return op->base_timeout;
1232 }
1233
1234 props = g_hash_table_lookup(peer->devices, device);
1235 if (!props) {
1236 return op->base_timeout;
1237 }
1238
1239 return (props->custom_action_timeout[op->phase]?
1240 props->custom_action_timeout[op->phase] : op->base_timeout)
1241 + props->delay_max[op->phase];
1242 }
1243
/* Accumulator passed as user data to add_device_timeout() */
struct timeout_data {
    /* Operation supplying the phase and base timeout */
    const remote_fencing_op_t *op;
    /* Peer whose devices are being summed */
    const st_query_result_t *peer;
    /* Running sum of device timeouts */
    int total_timeout;
};
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258 static void
1259 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
1260 {
1261 const char *device_id = key;
1262 device_properties_t *props = value;
1263 struct timeout_data *timeout = user_data;
1264
1265 if (!props->executed[timeout->op->phase]
1266 && !props->disallowed[timeout->op->phase]) {
1267 timeout->total_timeout += get_device_timeout(timeout->op,
1268 timeout->peer, device_id);
1269 }
1270 }
1271
1272 static int
1273 get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer)
1274 {
1275 struct timeout_data timeout;
1276
1277 timeout.op = op;
1278 timeout.peer = peer;
1279 timeout.total_timeout = 0;
1280
1281 g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1282
1283 return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1284 }
1285
1286 static int
1287 get_op_total_timeout(const remote_fencing_op_t *op,
1288 const st_query_result_t *chosen_peer)
1289 {
1290 int total_timeout = 0;
1291 stonith_topology_t *tp = find_topology_for_host(op->target);
1292
1293 if (is_set(op->call_options, st_opt_topology) && tp) {
1294 int i;
1295 GListPtr device_list = NULL;
1296 GListPtr iter = NULL;
1297
1298
1299
1300
1301
1302
1303
1304
1305 for (i = 0; i < ST_LEVEL_MAX; i++) {
1306 if (!tp->levels[i]) {
1307 continue;
1308 }
1309 for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1310 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1311 const st_query_result_t *peer = iter->data;
1312
1313 if (find_peer_device(op, peer, device_list->data)) {
1314 total_timeout += get_device_timeout(op, peer,
1315 device_list->data);
1316 break;
1317 }
1318 }
1319 }
1320 }
1321
1322 } else if (chosen_peer) {
1323 total_timeout = get_peer_timeout(op, chosen_peer);
1324 } else {
1325 total_timeout = op->base_timeout;
1326 }
1327
1328 return total_timeout ? total_timeout : op->base_timeout;
1329 }
1330
1331 static void
1332 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
1333 {
1334 GListPtr iter = NULL;
1335 xmlNode *update = NULL;
1336 const char *client_node = NULL;
1337 const char *client_id = NULL;
1338 const char *call_id = NULL;
1339
1340 if (op->call_options & st_opt_sync_call) {
1341
1342
1343
1344
1345 return;
1346 } else if (!op->request) {
1347 return;
1348 }
1349
1350 crm_trace("Reporting timeout for %s.%.8s", op->client_name, op->id);
1351 client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE);
1352 call_id = crm_element_value(op->request, F_STONITH_CALLID);
1353 client_id = crm_element_value(op->request, F_STONITH_CLIENTID);
1354 if (!client_node || !call_id || !client_id) {
1355 return;
1356 }
1357
1358 if (safe_str_eq(client_node, stonith_our_uname)) {
1359
1360 do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1361 return;
1362 }
1363
1364
1365 update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1366 crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id);
1367 crm_xml_add(update, F_STONITH_CLIENTID, client_id);
1368 crm_xml_add(update, F_STONITH_CALLID, call_id);
1369 crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout);
1370
1371 send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE);
1372
1373 free_xml(update);
1374
1375 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1376 remote_fencing_op_t *dup = iter->data;
1377
1378 crm_trace("Reporting timeout for duplicate %s.%.8s", dup->client_name, dup->id);
1379 report_timeout_period(iter->data, op_timeout);
1380 }
1381 }
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392 static void
1393 advance_op_topology(remote_fencing_op_t *op, const char *device, xmlNode *msg,
1394 int rc)
1395 {
1396
1397 if (op->devices) {
1398 op->devices = op->devices->next;
1399 }
1400
1401
1402 if ((op->phase == st_phase_requested) && safe_str_eq(op->action, "on")) {
1403
1404 remove_required_device(op, device);
1405
1406
1407
1408
1409 if (op->devices == NULL) {
1410 op->devices = op->automatic_list;
1411 }
1412 }
1413
1414 if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1415
1416
1417
1418
1419 op_phase_on(op);
1420 }
1421
1422 if (op->devices) {
1423
1424 crm_trace("Next for %s on behalf of %s@%s (rc was %d)",
1425 op->target, op->originator, op->client_name, rc);
1426 call_remote_stonith(op, NULL);
1427 } else {
1428
1429 crm_trace("Marking complex fencing op for %s as complete", op->target);
1430 op->state = st_done;
1431 remote_op_done(op, msg, rc, FALSE);
1432 }
1433 }
1434
1435 void
1436 call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer)
1437 {
1438 const char *device = NULL;
1439 int timeout = op->base_timeout;
1440
1441 crm_trace("State for %s.%.8s: %s %d", op->target, op->client_name, op->id, op->state);
1442 if (peer == NULL && !is_set(op->call_options, st_opt_topology)) {
1443 peer = stonith_choose_peer(op);
1444 }
1445
1446 if (!op->op_timer_total) {
1447 int total_timeout = get_op_total_timeout(op, peer);
1448
1449 op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * total_timeout;
1450 op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
1451 report_timeout_period(op, op->total_timeout);
1452 crm_info("Total timeout set to %d for peer's fencing of %s for %s"
1453 CRM_XS "id=%s",
1454 total_timeout, op->target, op->client_name, op->id);
1455 }
1456
1457 if (is_set(op->call_options, st_opt_topology) && op->devices) {
1458
1459
1460
1461 peer = stonith_choose_peer(op);
1462
1463 device = op->devices->data;
1464 timeout = get_device_timeout(op, peer, device);
1465 }
1466
1467 if (peer) {
1468 int timeout_one = 0;
1469 xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
1470
1471 crm_xml_add(remote_op, F_STONITH_REMOTE_OP_ID, op->id);
1472 crm_xml_add(remote_op, F_STONITH_TARGET, op->target);
1473 crm_xml_add(remote_op, F_STONITH_ACTION, op->action);
1474 crm_xml_add(remote_op, F_STONITH_ORIGIN, op->originator);
1475 crm_xml_add(remote_op, F_STONITH_CLIENTID, op->client_id);
1476 crm_xml_add(remote_op, F_STONITH_CLIENTNAME, op->client_name);
1477 crm_xml_add_int(remote_op, F_STONITH_TIMEOUT, timeout);
1478 crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
1479
1480 if (device) {
1481 timeout_one = TIMEOUT_MULTIPLY_FACTOR *
1482 get_device_timeout(op, peer, device);
1483 crm_info("Requesting that '%s' perform op '%s %s' with '%s' for %s (%ds)", peer->host,
1484 op->target, op->action, device, op->client_name, timeout_one);
1485 crm_xml_add(remote_op, F_STONITH_DEVICE, device);
1486 crm_xml_add(remote_op, F_STONITH_MODE, "slave");
1487
1488 } else {
1489 timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
1490 crm_info("Requesting that '%s' perform op '%s %s' for %s (%ds, %ds)",
1491 peer->host, op->target, op->action, op->client_name, timeout_one, stonith_watchdog_timeout_ms);
1492 crm_xml_add(remote_op, F_STONITH_MODE, "smart");
1493
1494 }
1495
1496 op->state = st_exec;
1497 if (op->op_timer_one) {
1498 g_source_remove(op->op_timer_one);
1499 }
1500
1501 if(stonith_watchdog_timeout_ms > 0 && device && safe_str_eq(device, "watchdog")) {
1502 crm_notice("Waiting %ds for %s to self-fence (%s) for %s.%.8s (%p)",
1503 stonith_watchdog_timeout_ms/1000, op->target,
1504 op->action, op->client_name, op->id, device);
1505 op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
1506
1507
1508 } else if(stonith_watchdog_timeout_ms > 0
1509 && safe_str_eq(peer->host, op->target)
1510 && safe_str_neq(op->action, "on")) {
1511 crm_notice("Waiting %ds for %s to self-fence (%s) for %s.%.8s (%p)",
1512 stonith_watchdog_timeout_ms/1000, op->target,
1513 op->action, op->client_name, op->id, device);
1514 op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
1515
1516 } else {
1517 op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
1518 }
1519
1520
1521 send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE);
1522 peer->tried = TRUE;
1523 free_xml(remote_op);
1524 return;
1525
1526 } else if (op->phase == st_phase_on) {
1527
1528
1529
1530 crm_warn("Ignoring %s 'on' failure (no capable peers) for %s after successful 'off'",
1531 device, op->target);
1532 advance_op_topology(op, device, NULL, pcmk_ok);
1533 return;
1534
1535 } else if (op->owner == FALSE) {
1536 crm_err("Fencing (%s) of %s for %s is not ours to control",
1537 op->action, op->target, op->client_name);
1538
1539 } else if (op->query_timer == 0) {
1540
1541 crm_info("No remaining peers capable of fencing (%s) %s for %s (%d)",
1542 op->target, op->action, op->client_name, op->state);
1543 CRM_LOG_ASSERT(op->state < st_done);
1544 remote_op_timeout(op);
1545
1546 } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
1547 int rc = -EHOSTUNREACH;
1548
1549
1550
1551
1552
1553 if(stonith_watchdog_timeout_ms && (device == NULL || safe_str_eq(device, "watchdog"))) {
1554 crm_notice("Waiting %ds for %s to self-fence (%s) for %s.%.8s (%p)",
1555 stonith_watchdog_timeout_ms/1000, op->target,
1556 op->action, op->client_name, op->id, device);
1557
1558 op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
1559 return;
1560 }
1561
1562 if (op->state == st_query) {
1563 crm_info("No peers (out of %d) have devices capable of fencing (%s) %s for %s (%d)",
1564 op->replies, op->action, op->target, op->client_name,
1565 op->state);
1566
1567 rc = -ENODEV;
1568 } else {
1569 crm_info("No peers (out of %d) are capable of fencing (%s) %s for %s (%d)",
1570 op->replies, op->action, op->target, op->client_name,
1571 op->state);
1572 }
1573
1574 op->state = st_failed;
1575 remote_op_done(op, NULL, rc, FALSE);
1576
1577 } else if (device) {
1578 crm_info("Waiting for additional peers capable of fencing (%s) %s with %s for %s.%.8s",
1579 op->action, op->target, device, op->client_name, op->id);
1580 } else {
1581 crm_info("Waiting for additional peers capable of fencing (%s) %s for %s%.8s",
1582 op->action, op->target, op->client_name, op->id);
1583 }
1584 }
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597 static gint
1598 sort_peers(gconstpointer a, gconstpointer b)
1599 {
1600 const st_query_result_t *peer_a = a;
1601 const st_query_result_t *peer_b = b;
1602
1603 return (peer_b->ndevices - peer_a->ndevices);
1604 }
1605
1606
1607
1608
1609
1610 static gboolean
1611 all_topology_devices_found(remote_fencing_op_t * op)
1612 {
1613 GListPtr device = NULL;
1614 GListPtr iter = NULL;
1615 device_properties_t *match = NULL;
1616 stonith_topology_t *tp = NULL;
1617 gboolean skip_target = FALSE;
1618 int i;
1619
1620 tp = find_topology_for_host(op->target);
1621 if (!tp) {
1622 return FALSE;
1623 }
1624 if (safe_str_eq(op->action, "off") || safe_str_eq(op->action, "reboot")) {
1625
1626
1627 skip_target = TRUE;
1628 }
1629
1630 for (i = 0; i < ST_LEVEL_MAX; i++) {
1631 for (device = tp->levels[i]; device; device = device->next) {
1632 match = NULL;
1633 for (iter = op->query_results; iter && !match; iter = iter->next) {
1634 st_query_result_t *peer = iter->data;
1635
1636 if (skip_target && safe_str_eq(peer->host, op->target)) {
1637 continue;
1638 }
1639 match = find_peer_device(op, peer, device->data);
1640 }
1641 if (!match) {
1642 return FALSE;
1643 }
1644 }
1645 }
1646
1647 return TRUE;
1648 }
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661 static void
1662 parse_action_specific(xmlNode *xml, const char *peer, const char *device,
1663 const char *action, remote_fencing_op_t *op,
1664 enum st_remap_phase phase, device_properties_t *props)
1665 {
1666 props->custom_action_timeout[phase] = 0;
1667 crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
1668 &props->custom_action_timeout[phase]);
1669 if (props->custom_action_timeout[phase]) {
1670 crm_trace("Peer %s with device %s returned %s action timeout %d",
1671 peer, device, action, props->custom_action_timeout[phase]);
1672 }
1673
1674 props->delay_max[phase] = 0;
1675 crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
1676 if (props->delay_max[phase]) {
1677 crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
1678 peer, device, props->delay_max[phase], action);
1679 }
1680
1681 props->delay_base[phase] = 0;
1682 crm_element_value_int(xml, F_STONITH_DELAY_BASE, &props->delay_base[phase]);
1683 if (props->delay_base[phase]) {
1684 crm_trace("Peer %s with device %s returned base delay %d for %s",
1685 peer, device, props->delay_base[phase], action);
1686 }
1687
1688
1689 if (safe_str_eq(action, "on")) {
1690 int required = 0;
1691
1692 crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
1693 if (required) {
1694 crm_trace("Peer %s requires device %s to execute for action %s",
1695 peer, device, action);
1696 add_required_device(op, device);
1697 }
1698 }
1699
1700
1701
1702
1703 if (crm_is_true(crm_element_value(xml, F_STONITH_ACTION_DISALLOWED))) {
1704 props->disallowed[phase] = TRUE;
1705 crm_trace("Peer %s is disallowed from executing %s for device %s",
1706 peer, action, device);
1707 }
1708 }
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719 static void
1720 add_device_properties(xmlNode *xml, remote_fencing_op_t *op,
1721 st_query_result_t *result, const char *device)
1722 {
1723 xmlNode *child;
1724 int verified = 0;
1725 device_properties_t *props = calloc(1, sizeof(device_properties_t));
1726
1727
1728 CRM_ASSERT(props != NULL);
1729 g_hash_table_insert(result->devices, strdup(device), props);
1730
1731
1732 crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
1733 if (verified) {
1734 crm_trace("Peer %s has confirmed a verified device %s",
1735 result->host, device);
1736 props->verified = TRUE;
1737 }
1738
1739
1740 parse_action_specific(xml, result->host, device, op_requested_action(op),
1741 op, st_phase_requested, props);
1742 for (child = __xml_first_child(xml); child != NULL; child = __xml_next(child)) {
1743
1744
1745
1746
1747 if (safe_str_eq(ID(child), "off")) {
1748 parse_action_specific(child, result->host, device, "off",
1749 op, st_phase_off, props);
1750 } else if (safe_str_eq(ID(child), "on")) {
1751 parse_action_specific(child, result->host, device, "on",
1752 op, st_phase_on, props);
1753 }
1754 }
1755 }
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768 static st_query_result_t *
1769 add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml)
1770 {
1771 st_query_result_t *result = calloc(1, sizeof(st_query_result_t));
1772 xmlNode *child;
1773
1774 CRM_CHECK(result != NULL, return NULL);
1775 result->host = strdup(host);
1776 result->devices = crm_str_table_new();
1777
1778
1779 for (child = __xml_first_child(xml); child != NULL; child = __xml_next(child)) {
1780 const char *device = ID(child);
1781
1782 if (device) {
1783 add_device_properties(child, op, result, device);
1784 }
1785 }
1786
1787 result->ndevices = g_hash_table_size(result->devices);
1788 CRM_CHECK(ndevices == result->ndevices,
1789 crm_err("Query claimed to have %d devices but %d found",
1790 ndevices, result->ndevices));
1791
1792 op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers);
1793 return result;
1794 }
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810 int
1811 process_remote_stonith_query(xmlNode * msg)
1812 {
1813 int ndevices = 0;
1814 gboolean host_is_target = FALSE;
1815 gboolean have_all_replies = FALSE;
1816 const char *id = NULL;
1817 const char *host = NULL;
1818 remote_fencing_op_t *op = NULL;
1819 st_query_result_t *result = NULL;
1820 uint32_t replies_expected;
1821 xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
1822
1823 CRM_CHECK(dev != NULL, return -EPROTO);
1824
1825 id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
1826 CRM_CHECK(id != NULL, return -EPROTO);
1827
1828 dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
1829 CRM_CHECK(dev != NULL, return -EPROTO);
1830 crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);
1831
1832 op = g_hash_table_lookup(remote_op_list, id);
1833 if (op == NULL) {
1834 crm_debug("Received query reply for unknown or expired operation %s",
1835 id);
1836 return -EOPNOTSUPP;
1837 }
1838
1839 replies_expected = QB_MIN(op->replies_expected, fencing_active_peers());
1840 if ((++op->replies >= replies_expected) && (op->state == st_query)) {
1841 have_all_replies = TRUE;
1842 }
1843 host = crm_element_value(msg, F_ORIG);
1844 host_is_target = safe_str_eq(host, op->target);
1845
1846 crm_info("Query result %d of %d from %s for %s/%s (%d devices) %s",
1847 op->replies, replies_expected, host,
1848 op->target, op->action, ndevices, id);
1849 if (ndevices > 0) {
1850 result = add_result(op, host, ndevices, dev);
1851 }
1852
1853 if (is_set(op->call_options, st_opt_topology)) {
1854
1855
1856
1857 if (op->state == st_query && all_topology_devices_found(op)) {
1858
1859 crm_trace("All topology devices found");
1860 call_remote_stonith(op, result);
1861
1862 } else if (have_all_replies) {
1863 crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
1864 replies_expected, op->replies);
1865 call_remote_stonith(op, NULL);
1866 }
1867
1868 } else if (op->state == st_query) {
1869 int nverified = count_peer_devices(op, result, TRUE);
1870
1871
1872
1873 if (result && (host_is_target == FALSE) && nverified) {
1874
1875 crm_trace("Found %d verified devices", nverified);
1876 call_remote_stonith(op, result);
1877
1878 } else if (have_all_replies) {
1879 crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
1880 replies_expected, op->replies);
1881 call_remote_stonith(op, NULL);
1882
1883 } else {
1884 crm_trace("Waiting for more peer results before launching fencing operation");
1885 }
1886
1887 } else if (result && (op->state == st_done)) {
1888 crm_info("Discarding query result from %s (%d devices): Operation is in state %d",
1889 result->host, result->ndevices, op->state);
1890 }
1891
1892 return pcmk_ok;
1893 }
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906 int
1907 process_remote_stonith_exec(xmlNode * msg)
1908 {
1909 int rc = 0;
1910 const char *id = NULL;
1911 const char *device = NULL;
1912 remote_fencing_op_t *op = NULL;
1913 xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
1914
1915 CRM_CHECK(dev != NULL, return -EPROTO);
1916
1917 id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
1918 CRM_CHECK(id != NULL, return -EPROTO);
1919
1920 dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR);
1921 CRM_CHECK(dev != NULL, return -EPROTO);
1922
1923 crm_element_value_int(dev, F_STONITH_RC, &rc);
1924
1925 device = crm_element_value(dev, F_STONITH_DEVICE);
1926
1927 if (remote_op_list) {
1928 op = g_hash_table_lookup(remote_op_list, id);
1929 }
1930
1931 if (op == NULL && rc == pcmk_ok) {
1932
1933 const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);
1934
1935 op = create_remote_stonith_op(client_id, dev, TRUE);
1936 }
1937
1938 if (op == NULL) {
1939
1940
1941 crm_info("Received peer result of unknown or expired operation %s", id);
1942 return -EOPNOTSUPP;
1943 }
1944
1945 if (op->devices && device && safe_str_neq(op->devices->data, device)) {
1946 crm_err("Received outdated reply for device %s (instead of %s) to "
1947 "fence (%s) %s. Operation already timed out at peer level.",
1948 device, op->devices->data, op->action, op->target);
1949 return rc;
1950 }
1951
1952 if (safe_str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast")) {
1953 crm_debug("Marking call to %s for %s on behalf of %s@%s.%.8s: %s (%d)",
1954 op->action, op->target, op->client_name, op->id, op->originator,
1955 pcmk_strerror(rc), rc);
1956 if (rc == pcmk_ok) {
1957 op->state = st_done;
1958 } else {
1959 op->state = st_failed;
1960 }
1961 remote_op_done(op, msg, rc, FALSE);
1962 return pcmk_ok;
1963 } else if (safe_str_neq(op->originator, stonith_our_uname)) {
1964
1965
1966 crm_err
1967 ("%s received non-broadcast fencing result for operation it does not own (device %s targeting %s)",
1968 stonith_our_uname, device, op->target);
1969 return rc;
1970 }
1971
1972 if (is_set(op->call_options, st_opt_topology)) {
1973 const char *device = crm_element_value(msg, F_STONITH_DEVICE);
1974
1975 crm_notice("Call to %s for '%s %s' on behalf of %s@%s: %s (%d)",
1976 device, op->target, op->action, op->client_name, op->originator,
1977 pcmk_strerror(rc), rc);
1978
1979
1980
1981 if (op->state == st_done) {
1982 remote_op_done(op, msg, rc, FALSE);
1983 return rc;
1984 }
1985
1986 if ((op->phase == 2) && (rc != pcmk_ok)) {
1987
1988
1989
1990 crm_warn("Ignoring %s 'on' failure (exit code %d) for %s after successful 'off'",
1991 device, rc, op->target);
1992 rc = pcmk_ok;
1993 }
1994
1995 if (rc == pcmk_ok) {
1996
1997
1998 advance_op_topology(op, device, msg, rc);
1999 return rc;
2000 } else {
2001
2002
2003 if (stonith_topology_next(op) != pcmk_ok) {
2004 op->state = st_failed;
2005 remote_op_done(op, msg, rc, FALSE);
2006 return rc;
2007 }
2008 }
2009 } else if (rc == pcmk_ok && op->devices == NULL) {
2010 crm_trace("All done for %s", op->target);
2011
2012 op->state = st_done;
2013 remote_op_done(op, msg, rc, FALSE);
2014 return rc;
2015 } else if (rc == -ETIME && op->devices == NULL) {
2016
2017 op->state = st_failed;
2018 remote_op_done(op, msg, rc, FALSE);
2019 return rc;
2020 } else {
2021
2022 }
2023
2024
2025 crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator,
2026 op->client_name, rc);
2027 call_remote_stonith(op, NULL);
2028 return rc;
2029 }
2030
2031 int
2032 stonith_fence_history(xmlNode * msg, xmlNode ** output)
2033 {
2034 int rc = 0;
2035 const char *target = NULL;
2036 xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_TRACE);
2037 char *nodename = NULL;
2038
2039 if (dev) {
2040 int options = 0;
2041
2042 target = crm_element_value(dev, F_STONITH_TARGET);
2043 crm_element_value_int(msg, F_STONITH_CALLOPTS, &options);
2044 if (target && (options & st_opt_cs_nodeid)) {
2045 int nodeid = crm_atoi(target, NULL);
2046
2047 nodename = stonith_get_peer_name(nodeid);
2048 if (nodename) {
2049 target = nodename;
2050 }
2051 }
2052 }
2053
2054 crm_trace("Looking for operations on %s in %p", target, remote_op_list);
2055
2056 *output = create_xml_node(NULL, F_STONITH_HISTORY_LIST);
2057 if (remote_op_list) {
2058 GHashTableIter iter;
2059 remote_fencing_op_t *op = NULL;
2060
2061 g_hash_table_iter_init(&iter, remote_op_list);
2062 while (g_hash_table_iter_next(&iter, NULL, (void **)&op)) {
2063 xmlNode *entry = NULL;
2064
2065 if (target && strcmp(op->target, target) != 0) {
2066 continue;
2067 }
2068
2069 rc = 0;
2070 crm_trace("Attaching op %s", op->id);
2071 entry = create_xml_node(*output, STONITH_OP_EXEC);
2072 crm_xml_add(entry, F_STONITH_TARGET, op->target);
2073 crm_xml_add(entry, F_STONITH_ACTION, op->action);
2074 crm_xml_add(entry, F_STONITH_ORIGIN, op->originator);
2075 crm_xml_add(entry, F_STONITH_DELEGATE, op->delegate);
2076 crm_xml_add(entry, F_STONITH_CLIENTNAME, op->client_name);
2077 crm_xml_add_int(entry, F_STONITH_DATE, op->completed);
2078 crm_xml_add_int(entry, F_STONITH_STATE, op->state);
2079 }
2080 }
2081
2082 free(nodename);
2083 return rc;
2084 }
2085
2086 gboolean
2087 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
2088 {
2089 GHashTableIter iter;
2090 time_t now = time(NULL);
2091 remote_fencing_op_t *rop = NULL;
2092
2093 crm_trace("tolerance=%d, remote_op_list=%p", tolerance, remote_op_list);
2094
2095 if (tolerance <= 0 || !remote_op_list || target == NULL || action == NULL) {
2096 return FALSE;
2097 }
2098
2099 g_hash_table_iter_init(&iter, remote_op_list);
2100 while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2101 if (strcmp(rop->target, target) != 0) {
2102 continue;
2103 } else if (rop->state != st_done) {
2104 continue;
2105
2106
2107
2108 } else if (strcmp(rop->action, action) != 0) {
2109 continue;
2110 } else if ((rop->completed + tolerance) < now) {
2111 continue;
2112 }
2113
2114 crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2115 target, action, tolerance, rop->delegate, rop->originator);
2116 return TRUE;
2117 }
2118 return FALSE;
2119 }