This source file includes the following definitions.
- sort_strings
- free_remote_query
- free_stonith_remote_op_list
- count_peer_device
- count_peer_devices
- find_peer_device
- grab_peer_device
- clear_remote_op_timers
- free_remote_op
- init_stonith_remote_op_hash_table
- op_requested_action
- op_phase_off
- op_phase_on
- undo_op_remap
- create_op_done_notify
- stonith_bcast_result_to_peers
- handle_local_reply_and_notify
- handle_duplicates
- remote_op_done
- remote_op_watchdog_done
- remote_op_timeout_one
- remote_op_timeout
- remote_op_query_timeout
- topology_is_empty
- add_required_device
- remove_required_device
- set_op_device_list
- topology_matches
- find_topology_for_host
- advance_topology_level
- merge_duplicates
- fencing_active_peers
- stonith_manual_ack
- create_remote_stonith_op
- initiate_remote_stonith_op
- find_best_peer
- stonith_choose_peer
- get_device_timeout
- add_device_timeout
- get_peer_timeout
- get_op_total_timeout
- report_timeout_period
- advance_topology_device_in_level
- call_remote_stonith
- sort_peers
- all_topology_devices_found
- parse_action_specific
- add_device_properties
- add_result
- process_remote_stonith_query
- process_remote_stonith_exec
- stonith_check_fence_tolerance
1
2
3
4
5
6
7
8
9
10 #include <crm_internal.h>
11
12 #include <sys/param.h>
13 #include <stdio.h>
14 #include <sys/types.h>
15 #include <sys/wait.h>
16 #include <sys/stat.h>
17 #include <unistd.h>
18 #include <sys/utsname.h>
19
20 #include <stdlib.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <ctype.h>
24 #include <regex.h>
25
26 #include <crm/crm.h>
27 #include <crm/msg_xml.h>
28 #include <crm/common/ipc.h>
29 #include <crm/common/ipc_internal.h>
30 #include <crm/cluster/internal.h>
31
32 #include <crm/stonith-ng.h>
33 #include <crm/fencing/internal.h>
34 #include <crm/common/xml.h>
35 #include <crm/common/xml_internal.h>
36
37 #include <crm/common/util.h>
38 #include <pacemaker-fenced.h>
39
/* Multiplier applied in this file to the base timeout when arming the query
 * timer, and to the best-guess total timeout computed in merge_duplicates() */
#define TIMEOUT_MULTIPLY_FACTOR 1.2
41
42
43
44
45
46
47
/*!
 * \brief Properties tracked for one fence device as reported by one peer
 *
 * Array members hold st_phase_max entries and are indexed in this file by an
 * operation's current phase (op->phase).
 */
typedef struct device_properties_s {
    /* Whether the device counts as "verified"; used as a filter by
     * count_peer_device() and grab_peer_device() */
    gboolean verified;

    /* Per phase: set to TRUE by grab_peer_device() once the device has been
     * selected, so it is not selected or counted again in that phase */
    gboolean executed[st_phase_max];

    /* Per phase: when set, find_peer_device() and add_device_timeout() skip
     * the device */
    gboolean disallowed[st_phase_max];

    /* Per phase: device-specific timeout; 0 makes get_device_timeout() fall
     * back to the operation's base timeout */
    int custom_action_timeout[st_phase_max];

    /* Per phase: added on top of the timeout by get_device_timeout() */
    int delay_max[st_phase_max];

    /* NOTE(review): presumably the base (static) delay per phase; it is not
     * read in the visible part of this file -- confirm against its users */
    int delay_base[st_phase_max];
} device_properties_t;
65
/*!
 * \brief One peer's entry in an operation's query_results list
 */
typedef struct st_query_result_s {
    /* Peer name; compared against the operation's target in find_best_peer()
     * and freed by free_remote_query() */
    char *host;

    /* Consulted by find_best_peer() when topology is not in use: a peer with
     * tried set is not chosen (where it is set is outside the visible code) */
    gboolean tried;

    /* Device count, logged by find_best_peer() */
    int ndevices;

    /* Table whose keys are used as device IDs and whose values are used as
     * device_properties_t (see count_peer_device() and add_device_timeout()) */
    GHashTable *devices;
} st_query_result_t;
76
/* Table of remote fencing operations, keyed by operation ID; created on demand
 * by init_stonith_remote_op_hash_table() with free_remote_op() as the value
 * destructor */
GHashTable *stonith_remote_op_list = NULL;

/* Forward declarations for functions used before their definitions */
void call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc);
static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup);
extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
                                  int call_options);

static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
static int get_op_total_timeout(const remote_fencing_op_t *op,
                                const st_query_result_t *chosen_peer);
87
88 static gint
89 sort_strings(gconstpointer a, gconstpointer b)
90 {
91 return strcmp(a, b);
92 }
93
94 static void
95 free_remote_query(gpointer data)
96 {
97 if (data) {
98 st_query_result_t *query = data;
99
100 crm_trace("Free'ing query result from %s", query->host);
101 g_hash_table_destroy(query->devices);
102 free(query->host);
103 free(query);
104 }
105 }
106
107 void
108 free_stonith_remote_op_list()
109 {
110 if (stonith_remote_op_list != NULL) {
111 g_hash_table_destroy(stonith_remote_op_list);
112 stonith_remote_op_list = NULL;
113 }
114 }
115
/* User data handed to count_peer_device() by count_peer_devices() */
struct peer_count_data {
    const remote_fencing_op_t *op;  /* operation whose phase selects the executed[] slot */
    gboolean verified_only;         /* when set, only verified devices are counted */
    int count;                      /* running total of matching devices */
};
121
122
123
124
125
126
127
128
129
130 static void
131 count_peer_device(gpointer key, gpointer value, gpointer user_data)
132 {
133 device_properties_t *props = (device_properties_t*)value;
134 struct peer_count_data *data = user_data;
135
136 if (!props->executed[data->op->phase]
137 && (!data->verified_only || props->verified)) {
138 ++(data->count);
139 }
140 }
141
142
143
144
145
146
147
148
149
150
151
152 static int
153 count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer,
154 gboolean verified_only)
155 {
156 struct peer_count_data data;
157
158 data.op = op;
159 data.verified_only = verified_only;
160 data.count = 0;
161 if (peer) {
162 g_hash_table_foreach(peer->devices, count_peer_device, &data);
163 }
164 return data.count;
165 }
166
167
168
169
170
171
172
173
174
175
176
177 static device_properties_t *
178 find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer,
179 const char *device)
180 {
181 device_properties_t *props = g_hash_table_lookup(peer->devices, device);
182
183 return (props && !props->executed[op->phase]
184 && !props->disallowed[op->phase])? props : NULL;
185 }
186
187
188
189
190
191
192
193
194
195
196
197
198 static gboolean
199 grab_peer_device(const remote_fencing_op_t *op, st_query_result_t *peer,
200 const char *device, gboolean verified_devices_only)
201 {
202 device_properties_t *props = find_peer_device(op, peer, device);
203
204 if ((props == NULL) || (verified_devices_only && !props->verified)) {
205 return FALSE;
206 }
207
208 crm_trace("Removing %s from %s (%d remaining)",
209 device, peer->host, count_peer_devices(op, peer, FALSE));
210 props->executed[op->phase] = TRUE;
211 return TRUE;
212 }
213
214 static void
215 clear_remote_op_timers(remote_fencing_op_t * op)
216 {
217 if (op->query_timer) {
218 g_source_remove(op->query_timer);
219 op->query_timer = 0;
220 }
221 if (op->op_timer_total) {
222 g_source_remove(op->op_timer_total);
223 op->op_timer_total = 0;
224 }
225 if (op->op_timer_one) {
226 g_source_remove(op->op_timer_one);
227 op->op_timer_one = 0;
228 }
229 }
230
231 static void
232 free_remote_op(gpointer data)
233 {
234 remote_fencing_op_t *op = data;
235
236 crm_trace("Free'ing op %s for %s", op->id, op->target);
237 crm_log_xml_debug(op->request, "Destroying");
238
239 clear_remote_op_timers(op);
240
241 free(op->id);
242 free(op->action);
243 free(op->delegate);
244 free(op->target);
245 free(op->client_id);
246 free(op->client_name);
247 free(op->originator);
248
249 if (op->query_results) {
250 g_list_free_full(op->query_results, free_remote_query);
251 }
252 if (op->request) {
253 free_xml(op->request);
254 op->request = NULL;
255 }
256 if (op->devices_list) {
257 g_list_free_full(op->devices_list, free);
258 op->devices_list = NULL;
259 }
260 g_list_free_full(op->automatic_list, free);
261 g_list_free(op->duplicates);
262 free(op);
263 }
264
265 void
266 init_stonith_remote_op_hash_table(GHashTable **table)
267 {
268 if (*table == NULL) {
269 *table = g_hash_table_new_full(crm_str_hash, g_str_equal, NULL, free_remote_op);
270 }
271 }
272
273
274
275
276
277
278
279
280
281 static const char *
282 op_requested_action(const remote_fencing_op_t *op)
283 {
284 return ((op->phase > st_phase_requested)? "reboot" : op->action);
285 }
286
287
288
289
290
291
292
293 static void
294 op_phase_off(remote_fencing_op_t *op)
295 {
296 crm_info("Remapping multiple-device reboot targeting %s (%s) to 'off'",
297 op->target, op->id);
298 op->phase = st_phase_off;
299
300
301
302
303 strcpy(op->action, "off");
304 }
305
306
307
308
309
310
311
312 static void
313 op_phase_on(remote_fencing_op_t *op)
314 {
315 GListPtr iter = NULL;
316
317 crm_info("Remapped 'off' targeting %s complete, "
318 "remapping to 'on' for %s.%.8s",
319 op->target, op->client_name, op->id);
320 op->phase = st_phase_on;
321 strcpy(op->action, "on");
322
323
324
325
326 for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
327 GListPtr match = g_list_find_custom(op->devices_list, iter->data,
328 sort_strings);
329
330 if (match) {
331 op->devices_list = g_list_remove(op->devices_list, match->data);
332 }
333 }
334 g_list_free_full(op->automatic_list, free);
335 op->automatic_list = NULL;
336
337
338 op->devices = op->devices_list;
339 }
340
341
342
343
344
345
346
347 static void
348 undo_op_remap(remote_fencing_op_t *op)
349 {
350 if (op->phase > 0) {
351 crm_info("Undoing remap of reboot targeting %s for %s.%.8s",
352 op->target, op->client_name, op->id);
353 op->phase = st_phase_requested;
354 strcpy(op->action, "reboot");
355 }
356 }
357
358 static xmlNode *
359 create_op_done_notify(remote_fencing_op_t * op, int rc)
360 {
361 xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
362
363 crm_xml_add_int(notify_data, "state", op->state);
364 crm_xml_add_int(notify_data, F_STONITH_RC, rc);
365 crm_xml_add(notify_data, F_STONITH_TARGET, op->target);
366 crm_xml_add(notify_data, F_STONITH_ACTION, op->action);
367 crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate);
368 crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id);
369 crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator);
370 crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id);
371 crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name);
372
373 return notify_data;
374 }
375
376 void
377 stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc, gboolean op_merged)
378 {
379 static int count = 0;
380 xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY);
381 xmlNode *notify_data = create_op_done_notify(op, rc);
382
383 count++;
384 crm_trace("Broadcasting result to peers");
385 crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY);
386 crm_xml_add(bcast, F_SUBTYPE, "broadcast");
387 crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY);
388 crm_xml_add_int(bcast, "count", count);
389
390 if (op_merged) {
391 crm_xml_add(bcast, F_STONITH_MERGED, "true");
392 }
393
394 add_message_xml(bcast, F_STONITH_CALLDATA, notify_data);
395 send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE);
396 free_xml(notify_data);
397 free_xml(bcast);
398
399 return;
400 }
401
402 static void
403 handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc)
404 {
405 xmlNode *notify_data = NULL;
406 xmlNode *reply = NULL;
407
408 if (op->notify_sent == TRUE) {
409
410 return;
411 }
412
413
414 notify_data = create_op_done_notify(op, rc);
415 crm_xml_add_int(data, "state", op->state);
416 crm_xml_add(data, F_STONITH_TARGET, op->target);
417 crm_xml_add(data, F_STONITH_OPERATION, op->action);
418
419 reply = stonith_construct_reply(op->request, NULL, data, rc);
420 crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate);
421
422
423 do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE);
424
425
426 do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data);
427 do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL);
428
429
430 op->notify_sent = TRUE;
431 free_xml(reply);
432 free_xml(notify_data);
433 }
434
435 static void
436 handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc)
437 {
438 GListPtr iter = NULL;
439
440 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
441 remote_fencing_op_t *other = iter->data;
442
443 if (other->state == st_duplicate) {
444 other->state = op->state;
445 crm_debug("Performing duplicate notification for %s@%s.%.8s = %s",
446 other->client_name, other->originator, other->id,
447 pcmk_strerror(rc));
448 remote_op_done(other, data, rc, TRUE);
449
450 } else {
451
452 crm_err("Skipping duplicate notification for %s@%s - %d", other->client_name,
453 other->originator, other->state);
454 }
455 }
456 }
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
/*!
 * \internal
 * \brief Finish a remote fencing operation
 *
 * Records the completion time, cancels the operation's timers and undoes any
 * reboot remapping. What happens next depends on the arguments:
 * - If notifications were already sent, an error is logged and nothing else
 *   happens.
 * - If this is not a duplicate and \p data is not marked as a "broadcast",
 *   the result is broadcast to peers and the function returns without
 *   replying locally.
 * - Otherwise the result is logged, the local client is answered and
 *   notified, duplicates are completed (unless this is itself a duplicate),
 *   and the operation's query results and request XML are released.
 *
 * \param[in,out] op    Operation to finish
 * \param[in]     data  Result XML, or NULL to use a temporary empty node
 * \param[in]     rc    Result code of the operation
 * \param[in]     dup   Nonzero if \p op is being completed as a duplicate
 */
static void
remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup)
{
    int level = LOG_ERR;
    const char *subt = NULL;
    xmlNode *local_data = NULL;
    gboolean op_merged = FALSE;

    op->completed = time(NULL);
    clear_remote_op_timers(op);
    undo_op_remap(op);

    if (op->notify_sent == TRUE) {
        crm_err("Already sent notifications for '%s' targeting %s on %s for "
                "client %s@%s.%.8s: %s " CRM_XS " rc=%d state=%d",
                op->action, op->target,
                (op->delegate? op->delegate : "unknown node"),
                op->client_name, op->originator, op->id, pcmk_strerror(rc),
                rc, op->state);
        goto remote_op_done_cleanup;
    }

    /* Fill in a missing delegate from the result data: prefer an explicit
     * delegate attribute anywhere in the data, else fall back to F_ORIG.
     * Skipped for -ENODEV and -EHOSTUNREACH results. */
    if (!op->delegate && data && rc != -ENODEV && rc != -EHOSTUNREACH) {
        xmlNode *ndata = get_xpath_object("//@" F_STONITH_DELEGATE, data,
                                          LOG_NEVER);
        if(ndata) {
            op->delegate = crm_element_value_copy(ndata, F_STONITH_DELEGATE);
        } else {
            op->delegate = crm_element_value_copy(data, F_ORIG);
        }
    }

    /* Without caller-supplied data, work on a temporary node that is freed
     * at the cleanup label */
    if (data == NULL) {
        data = create_xml_node(NULL, "remote-op");
        local_data = data;
    }

    if(dup) {
        op_merged = TRUE;
    } else if (crm_element_value(data, F_STONITH_MERGED)) {
        op_merged = TRUE;
    }

    /* A non-duplicate result that did not arrive as a broadcast is only
     * broadcast here.
     * NOTE(review): presumably the local reply then happens when the
     * broadcast is received back -- confirm against the message handler. */
    subt = crm_element_value(data, F_SUBTYPE);
    if (dup == FALSE && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) {
        stonith_bcast_result_to_peers(op, rc, (op_merged? TRUE: FALSE));
        goto remote_op_done_cleanup;
    }

    /* Failures are logged at error level only when this node originated the
     * operation; everything else is a notice */
    if (rc == pcmk_ok || dup) {
        level = LOG_NOTICE;
    } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
        level = LOG_NOTICE;
    }

    do_crm_log(level, "Operation '%s'%s%s on %s for %s@%s.%.8s%s: %s",
               op->action, (op->target? " targeting " : ""),
               (op->target? op->target : ""),
               (op->delegate? op->delegate : "<no-one>"),
               op->client_name, op->originator, op->id,
               (op_merged? " (merged)" : ""), pcmk_strerror(rc));

    handle_local_reply_and_notify(op, data, rc);

    if (dup == FALSE) {
        handle_duplicates(op, data, rc);
    }

    /* Release the query results and request XML now; the operation object
     * itself is not freed here */
    if (op->query_results) {
        g_list_free_full(op->query_results, free_remote_query);
        op->query_results = NULL;
    }

    if (op->request) {
        free_xml(op->request);
        op->request = NULL;
    }

  remote_op_done_cleanup:
    free_xml(local_data);
}
571
572 static gboolean
573 remote_op_watchdog_done(gpointer userdata)
574 {
575 remote_fencing_op_t *op = userdata;
576
577 op->op_timer_one = 0;
578
579 crm_notice("Self-fencing (%s) by %s for %s.%8s assumed complete",
580 op->action, op->target, op->client_name, op->id);
581 op->state = st_done;
582 remote_op_done(op, NULL, pcmk_ok, FALSE);
583 return FALSE;
584 }
585
586 static gboolean
587 remote_op_timeout_one(gpointer userdata)
588 {
589 remote_fencing_op_t *op = userdata;
590
591 op->op_timer_one = 0;
592
593 crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS
594 " id=%s", op->action, op->target, op->client_name, op->id);
595 call_remote_stonith(op, NULL, pcmk_ok);
596 return FALSE;
597 }
598
599 static gboolean
600 remote_op_timeout(gpointer userdata)
601 {
602 remote_fencing_op_t *op = userdata;
603
604 op->op_timer_total = 0;
605
606 if (op->state == st_done) {
607 crm_debug("Action '%s' targeting %s for client %s already completed "
608 CRM_XS " id=%s",
609 op->action, op->target, op->client_name, op->id);
610 return FALSE;
611 }
612
613 crm_debug("Action '%s' targeting %s for client %s timed out "
614 CRM_XS " id=%s",
615 op->action, op->target, op->client_name, op->id);
616
617 if (op->phase == st_phase_on) {
618
619
620
621
622 remote_op_done(op, NULL, pcmk_ok, FALSE);
623 return FALSE;
624 }
625
626 op->state = st_failed;
627
628 remote_op_done(op, NULL, -ETIME, FALSE);
629
630 return FALSE;
631 }
632
633 static gboolean
634 remote_op_query_timeout(gpointer data)
635 {
636 remote_fencing_op_t *op = data;
637
638 op->query_timer = 0;
639 if (op->state == st_done) {
640 crm_debug("Operation %s targeting %s already completed",
641 op->id, op->target);
642 } else if (op->state == st_exec) {
643 crm_debug("Operation %s targeting %s already in progress",
644 op->id, op->target);
645 } else if (op->query_results) {
646 crm_debug("Query %s targeting %s complete (state=%d)",
647 op->id, op->target, op->state);
648 call_remote_stonith(op, NULL, pcmk_ok);
649 } else {
650 crm_debug("Query %s targeting %s timed out (state=%d)",
651 op->id, op->target, op->state);
652 if (op->op_timer_total) {
653 g_source_remove(op->op_timer_total);
654 op->op_timer_total = 0;
655 }
656 remote_op_timeout(op);
657 }
658
659 return FALSE;
660 }
661
662 static gboolean
663 topology_is_empty(stonith_topology_t *tp)
664 {
665 int i;
666
667 if (tp == NULL) {
668 return TRUE;
669 }
670
671 for (i = 0; i < ST_LEVEL_MAX; i++) {
672 if (tp->levels[i] != NULL) {
673 return FALSE;
674 }
675 }
676 return TRUE;
677 }
678
679
680
681
682
683
684
685
686 static void
687 add_required_device(remote_fencing_op_t *op, const char *device)
688 {
689 GListPtr match = g_list_find_custom(op->automatic_list, device,
690 sort_strings);
691
692 if (!match) {
693 op->automatic_list = g_list_prepend(op->automatic_list, strdup(device));
694 }
695 }
696
697
698
699
700
701
702
703
704 static void
705 remove_required_device(remote_fencing_op_t *op, const char *device)
706 {
707 GListPtr match = g_list_find_custom(op->automatic_list, device,
708 sort_strings);
709
710 if (match) {
711 op->automatic_list = g_list_remove(op->automatic_list, match->data);
712 }
713 }
714
715
716 static void
717 set_op_device_list(remote_fencing_op_t * op, GListPtr devices)
718 {
719 GListPtr lpc = NULL;
720
721 if (op->devices_list) {
722 g_list_free_full(op->devices_list, free);
723 op->devices_list = NULL;
724 }
725 for (lpc = devices; lpc != NULL; lpc = lpc->next) {
726 op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
727 }
728 op->devices = op->devices_list;
729 }
730
731
732
733
734
735
736
737
738
739
/*!
 * \internal
 * \brief Check whether a topology entry applies to a node
 *
 * How the match is done depends on the entry's kind (see the switch below).
 *
 * \param[in] tp    Topology entry to test (must have a target)
 * \param[in] node  Node name to test against
 *
 * \return TRUE if the entry matches \p node, otherwise FALSE (including when
 *         an argument is missing, the pattern does not compile, or the kind
 *         is not one of the handled values)
 */
static gboolean
topology_matches(const stonith_topology_t *tp, const char *node)
{
    regex_t r_patt;

    CRM_CHECK(node && tp && tp->target, return FALSE);
    switch(tp->kind) {
        case 2:
            /* Match by node attribute: the node must have the entry's
             * attribute set to the entry's value.
             * NOTE(review): the literal 2 presumably corresponds to a
             * "by attribute" kind constant -- confirm. */
            if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
                crm_notice("Matched %s with %s by attribute", node, tp->target);
                return TRUE;
            }
            break;
        case 1:
            /* Match the node name against the entry's extended regular
             * expression; only match/no-match is needed (REG_NOSUB) */
            if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
                crm_info("Bad regex '%s' for fencing level", tp->target);
            } else {
                int status = regexec(&r_patt, node, 0, NULL, 0);

                regfree(&r_patt);
                if (status == 0) {
                    crm_notice("Matched %s with %s by name", node, tp->target);
                    return TRUE;
                }
            }
            break;
        case 0:
            /* Compare the node name with the entry's target directly
             * (pcmk__str_casei is passed, presumably for a case-insensitive
             * comparison) */
            crm_trace("Testing %s against %s", node, tp->target);
            return pcmk__str_eq(tp->target, node, pcmk__str_casei);
    }
    crm_trace("No match for %s with %s", node, tp->target);
    return FALSE;
}
783
784 stonith_topology_t *
785 find_topology_for_host(const char *host)
786 {
787 GHashTableIter tIter;
788 stonith_topology_t *tp = g_hash_table_lookup(topology, host);
789
790 if(tp != NULL) {
791 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
792 return tp;
793 }
794
795 g_hash_table_iter_init(&tIter, topology);
796 while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
797 if (topology_matches(tp, host)) {
798 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
799 return tp;
800 }
801 }
802
803 crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
804 return NULL;
805 }
806
807
808
809
810
811
812
813
814
815
816
817
/*!
 * \internal
 * \brief Move an operation to the next populated fencing-topology level
 *
 * \param[in,out] op        Operation to advance
 * \param[in]     empty_ok  Whether a missing or empty topology for the
 *                          target counts as success
 *
 * \return pcmk_rc_ok if the operation now has a level to try (or the
 *         topology is empty and \p empty_ok is true), otherwise ENODEV
 */
static int
advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
{
    stonith_topology_t *tp = NULL;

    if (op->target) {
        tp = find_topology_for_host(op->target);
    }
    if (topology_is_empty(tp)) {
        return empty_ok? pcmk_rc_ok : ENODEV;
    }

    CRM_ASSERT(tp->levels != NULL);

    /* From here on the operation is flagged as using topology */
    stonith__set_call_options(op->call_options, op->id, st_opt_topology);

    /* A remap done for the previous level does not carry over */
    undo_op_remap(op);

    /* Step to the next level that has any devices configured */
    do {
        op->level++;

    } while (op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL);

    if (op->level < ST_LEVEL_MAX) {
        crm_trace("Attempting fencing level %d targeting %s (%d devices) "
                  "for client %s@%s.%.8s",
                  op->level, op->target, g_list_length(tp->levels[op->level]),
                  op->client_name, op->originator, op->id);
        set_op_device_list(op, tp->levels[op->level]);

        /* A positive requested delay is dropped for any level after the
         * first */
        if (op->level > 1 && op->delay > 0) {
            op->delay = 0;
        }

        /* A reboot with more than one device in the level is remapped to
         * "off" (op_phase_off()) */
        if (g_list_next(op->devices_list) && pcmk__str_eq(op->action, "reboot", pcmk__str_casei)) {
            op_phase_off(op);
        }
        return pcmk_rc_ok;
    }

    crm_notice("All fencing options targeting %s for client %s@%s.%.8s failed",
               op->target, op->client_name, op->originator, op->id);
    return ENODEV;
}
869
870
871
872
873
874
875 static void
876 merge_duplicates(remote_fencing_op_t * op)
877 {
878 GHashTableIter iter;
879 remote_fencing_op_t *other = NULL;
880
881 time_t now = time(NULL);
882
883 g_hash_table_iter_init(&iter, stonith_remote_op_list);
884 while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
885 crm_node_t *peer = NULL;
886 const char *other_action = op_requested_action(other);
887
888 if (other->state > st_exec) {
889
890 continue;
891 } else if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
892
893 continue;
894 } else if (!pcmk__str_eq(op->action, other_action, pcmk__str_casei)) {
895 crm_trace("Must be for the same action: %s vs. %s",
896 op->action, other_action);
897 continue;
898 } else if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
899 crm_trace("Must be for different clients: %s", op->client_name);
900 continue;
901 } else if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
902 crm_trace("Can't be a suicide operation: %s", other->target);
903 continue;
904 }
905
906 peer = crm_get_peer(0, other->originator);
907 if(fencing_peer_active(peer) == FALSE) {
908 crm_notice("Failing action '%s' targeting %s originating from "
909 "client %s@%s.%.8s: Originator is dead",
910 other->action, other->target, other->client_name, other->originator, other->id);
911 other->state = st_failed;
912 continue;
913
914 } else if(other->total_timeout > 0 && now > (other->total_timeout + other->created)) {
915 crm_info("Action '%s' targeting %s originating from client "
916 "%s@%s.%.8s is too old: %ld vs. %ld + %d",
917 other->action, other->target, other->client_name, other->originator, other->id,
918 now, other->created, other->total_timeout);
919 continue;
920 }
921
922
923
924
925 other->duplicates = g_list_append(other->duplicates, op);
926 if (other->total_timeout == 0) {
927 crm_trace("Making a best-guess as to the timeout used");
928 other->total_timeout = op->total_timeout =
929 TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
930 }
931 crm_notice("Merging stonith action '%s' targeting %s originating from "
932 "client %s.%.8s with identical request from %s@%s.%.8s (%ds)",
933 op->action, op->target, op->client_name, op->id,
934 other->client_name, other->originator, other->id,
935 other->total_timeout);
936 report_timeout_period(op, other->total_timeout);
937 op->state = st_duplicate;
938 }
939 }
940
941 static uint32_t fencing_active_peers(void)
942 {
943 uint32_t count = 0;
944 crm_node_t *entry;
945 GHashTableIter gIter;
946
947 g_hash_table_iter_init(&gIter, crm_peer_cache);
948 while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
949 if(fencing_peer_active(entry)) {
950 count++;
951 }
952 }
953 return count;
954 }
955
956 int
957 stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op)
958 {
959 xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
960
961 op->state = st_done;
962 op->completed = time(NULL);
963 op->delegate = strdup("a human");
964
965 crm_notice("Injecting manual confirmation that %s is safely off/down",
966 crm_element_value(dev, F_STONITH_TARGET));
967
968 remote_op_done(op, msg, pcmk_ok, FALSE);
969
970
971 return -EINPROGRESS;
972 }
973
974
975
976
977
978
979
980
981
982
983
984
/*!
 * \brief Create (or find) the remote fencing operation for a request
 *
 * For a peer request that carries a target, an existing operation with the
 * same remote operation ID is returned if there is one. Otherwise a new
 * operation is allocated, filled in from the request, registered in the
 * global operation table and checked for merging with an equivalent
 * in-flight operation.
 *
 * \param[in] client   ID of the requesting client
 * \param[in] request  Request XML
 * \param[in] peer     Whether the request came from a peer
 *
 * \return The operation (remote_fencing_op_t *), or NULL if a peer request
 *         has a target but no operation ID
 */
void *
create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer)
{
    remote_fencing_op_t *op = NULL;
    xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_NEVER);
    int call_options = 0;
    const char *operation = NULL;

    init_stonith_remote_op_hash_table(&stonith_remote_op_list);

    /* A peer may refer to an operation that is already known here */
    if (peer && dev) {
        const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);

        CRM_CHECK(op_id != NULL, return NULL);

        op = g_hash_table_lookup(stonith_remote_op_list, op_id);
        if (op) {
            crm_debug("%s already exists", op_id);
            return op;
        }
    }

    op = calloc(1, sizeof(remote_fencing_op_t));

    crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout));

    crm_element_value_int(request, F_STONITH_DELAY, &(op->delay));

    /* Peer requests keep the ID chosen by the originator; local requests get
     * a freshly generated one */
    if (peer && dev) {
        op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID);
    } else {
        op->id = crm_generate_uuid();
    }

    /* The table uses op->id as its key without copying it */
    g_hash_table_replace(stonith_remote_op_list, op->id, op);
    CRM_LOG_ASSERT(g_hash_table_lookup(stonith_remote_op_list, op->id) != NULL);
    crm_trace("Created %s", op->id);

    op->state = st_query;
    op->replies_expected = fencing_active_peers();
    op->action = crm_element_value_copy(dev, F_STONITH_ACTION);
    op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN);
    op->delegate = crm_element_value_copy(dev, F_STONITH_DELEGATE);
    op->created = time(NULL);

    if (op->originator == NULL) {
        /* No originator in the request: this node is the originator */
        op->originator = strdup(stonith_our_uname);
    }

    CRM_LOG_ASSERT(client != NULL);
    if (client) {
        op->client_id = strdup(client);
    }

    /* For a relayed operation the client name is built from this daemon's
     * system name and PID; otherwise it is taken from the request */
    operation = crm_element_value(request, F_STONITH_OPERATION);

    if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
        op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
                                         (unsigned long) getpid());
    } else {
        op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME);
    }

    op->target = crm_element_value_copy(dev, F_STONITH_TARGET);
    op->request = copy_xml(request);
    crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
    op->call_options = call_options;

    crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid));

    crm_trace("%s new stonith op %s ('%s' targeting %s for client %s)",
              (peer && dev)? "Recorded" : "Generated", op->id, op->action,
              op->target, op->client_name);

    /* With st_opt_cs_nodeid set, the target is a node ID to be translated
     * into a node name */
    if (op->call_options & st_opt_cs_nodeid) {
        int nodeid = crm_atoi(op->target, NULL);
        crm_node_t *node = crm_find_known_peer_full(nodeid, NULL, CRM_GET_PEER_ANY);

        /* The option is cleared whether or not the translation succeeds */
        stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);

        if (node && node->uname) {
            free(op->target);
            op->target = strdup(node->uname);

        } else {
            crm_warn("Could not expand nodeid '%s' into a host name", op->target);
        }
    }

    /* See whether this operation can piggyback on one already in flight */
    merge_duplicates(op);

    if (op->state != st_duplicate) {
        /* Only non-merged operations trigger a history notification here */
        do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL);
    }

    stonith_fence_history_trim();

    return op;
}
1093
/*!
 * \brief Start a fencing operation owned by this node
 *
 * Creates the operation and, unless it is a manual confirmation, a failure
 * or a duplicate, broadcasts a query to the cluster and arms the query timer.
 *
 * \param[in] client      Requesting client, or NULL to take the client ID
 *                        from the request
 * \param[in] request     Request XML
 * \param[in] manual_ack  Whether this is a manual confirmation, in which case
 *                        the operation is returned right after creation
 *
 * \return The operation, or NULL if it has no action
 */
remote_fencing_op_t *
initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request,
                           gboolean manual_ack)
{
    int query_timeout = 0;
    xmlNode *query = NULL;
    const char *client_id = NULL;
    remote_fencing_op_t *op = NULL;
    const char *relay_op_id = NULL;
    const char *operation = NULL;

    if (client) {
        client_id = client->id;
    } else {
        client_id = crm_element_value(request, F_STONITH_CLIENTID);
    }

    CRM_LOG_ASSERT(client_id != NULL);
    op = create_remote_stonith_op(client_id, request, FALSE);
    op->owner = TRUE;
    if (manual_ack) {
        crm_notice("Initiating manual confirmation for %s: %s",
                   op->target, op->id);
        return op;
    }

    CRM_CHECK(op->action, return NULL);

    /* Select the first topology level; having no topology at all is fine */
    if (advance_topology_level(op, true) != pcmk_rc_ok) {
        op->state = st_failed;
    }

    switch (op->state) {
        case st_failed:
            crm_warn("Could not request peer fencing (%s) targeting %s "
                     CRM_XS " id=%s", op->action, op->target, op->id);
            remote_op_done(op, NULL, -EINVAL, FALSE);
            return op;

        case st_duplicate:
            /* Merged into another operation; no query of its own is sent */
            crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
                     CRM_XS " id=%s", op->action, op->target, op->id);
            return op;

        default:
            crm_notice("Requesting peer fencing (%s) targeting %s "
                       CRM_XS " id=%s state=%d",
                       op->action, op->target, op->id, op->state);
    }

    query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
                              NULL, op->call_options);

    crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
    crm_xml_add(query, F_STONITH_TARGET, op->target);
    /* The query carries the originally requested action, not a remapped one */
    crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
    crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
    crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
    crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
    crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);

    /* For a relayed request, pass the relay's operation ID along as well */
    operation = crm_element_value(request, F_STONITH_OPERATION);
    if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
        relay_op_id = crm_element_value(request, F_STONITH_REMOTE_OP_ID);
        if (relay_op_id) {
            crm_xml_add(query, F_STONITH_REMOTE_OP_ID_RELAY, relay_op_id);
        }
    }

    send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
    free_xml(query);

    /* base_timeout is multiplied by 1000 for g_timeout_add(), which takes
     * milliseconds */
    query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
    op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);

    return op;
}
1172
/* Bit flags accepted by find_best_peer() */
enum find_best_peer_options {
    /* Do not consider the peer whose host name equals the target */
    FIND_PEER_SKIP_TARGET = 0x0001,
    /* Consider only the peer whose host name equals the target */
    FIND_PEER_TARGET_ONLY = 0x0002,
    /* Consider only verified devices */
    FIND_PEER_VERIFIED_ONLY = 0x0004,
};
1181
1182 static st_query_result_t *
1183 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
1184 {
1185 GListPtr iter = NULL;
1186 gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1187
1188 if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
1189 return NULL;
1190 }
1191
1192 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1193 st_query_result_t *peer = iter->data;
1194
1195 crm_trace("Testing result from %s targeting %s with %d devices: %d %x",
1196 peer->host, op->target, peer->ndevices, peer->tried, options);
1197 if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1198 continue;
1199 }
1200 if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1201 continue;
1202 }
1203
1204 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1205
1206 if (grab_peer_device(op, peer, device, verified_devices_only)) {
1207 return peer;
1208 }
1209
1210 } else if ((peer->tried == FALSE)
1211 && count_peer_devices(op, peer, verified_devices_only)) {
1212
1213
1214 crm_trace("Simple fencing");
1215 return peer;
1216 }
1217 }
1218
1219 return NULL;
1220 }
1221
/*!
 * \internal
 * \brief Choose the peer that should execute the next fencing action
 *
 * Peers are tried in order of preference: a peer other than the target with
 * a verified device, then a peer other than the target with any device, then
 * (except in the "on" phase) the target itself. With topology in use, the
 * search is repeated for each further topology level until one succeeds or
 * the levels are exhausted.
 *
 * \param[in,out] op  Operation being executed
 *
 * \return Chosen peer, or NULL if none was found or more replies should be
 *         awaited first
 */
static st_query_result_t *
stonith_choose_peer(remote_fencing_op_t * op)
{
    const char *device = NULL;
    st_query_result_t *peer = NULL;
    uint32_t active = fencing_active_peers();

    do {
        if (op->devices) {
            device = op->devices->data;
            crm_trace("Checking for someone to fence (%s) %s with %s",
                      op->action, op->target, device);
        } else {
            crm_trace("Checking for someone to fence (%s) %s",
                      op->action, op->target);
        }

        /* Best choice: another node with a verified device */
        peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
        if (peer) {
            crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
            return peer;
        }

        /* While the query timer is still running and replies are
         * outstanding, wait rather than settle for an unverified device */
        if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
            crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
            return NULL;
        }

        /* Next best: another node with an unverified device */
        peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
        if (peer) {
            crm_trace("Found best unverified peer %s", peer->host);
            return peer;
        }

        /* Last resort: the target fences itself, but never in the "on"
         * phase */
        if (op->phase != st_phase_on) {
            peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
            if (peer) {
                crm_trace("%s will fence itself", peer->host);
                return peer;
            }
        }

        /* Nothing at this level: the loop condition moves on to the next
         * topology level (a side effect of advance_topology_level()), unless
         * in the "on" phase or topology is not in use */
    } while ((op->phase != st_phase_on)
             && pcmk_is_set(op->call_options, st_opt_topology)
             && (advance_topology_level(op, false) == pcmk_rc_ok));

    crm_notice("Couldn't find anyone to fence (%s) %s with %s",
               op->action, op->target, (device? device : "any device"));
    return NULL;
}
1280
1281 static int
1282 get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer,
1283 const char *device)
1284 {
1285 device_properties_t *props;
1286
1287 if (!peer || !device) {
1288 return op->base_timeout;
1289 }
1290
1291 props = g_hash_table_lookup(peer->devices, device);
1292 if (!props) {
1293 return op->base_timeout;
1294 }
1295
1296 return (props->custom_action_timeout[op->phase]?
1297 props->custom_action_timeout[op->phase] : op->base_timeout)
1298 + props->delay_max[op->phase];
1299 }
1300
/* Accumulator handed to add_device_timeout() as g_hash_table_foreach() user
 * data; filled in and read back by get_peer_timeout()
 */
struct timeout_data {
    const remote_fencing_op_t *op;  /* operation whose phase selects the per-device values */
    const st_query_result_t *peer;  /* peer whose device table is being walked */
    int total_timeout;              /* running sum of the counted devices' timeouts */
};
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315 static void
1316 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
1317 {
1318 const char *device_id = key;
1319 device_properties_t *props = value;
1320 struct timeout_data *timeout = user_data;
1321
1322 if (!props->executed[timeout->op->phase]
1323 && !props->disallowed[timeout->op->phase]) {
1324 timeout->total_timeout += get_device_timeout(timeout->op,
1325 timeout->peer, device_id);
1326 }
1327 }
1328
1329 static int
1330 get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer)
1331 {
1332 struct timeout_data timeout;
1333
1334 timeout.op = op;
1335 timeout.peer = peer;
1336 timeout.total_timeout = 0;
1337
1338 g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1339
1340 return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1341 }
1342
1343 static int
1344 get_op_total_timeout(const remote_fencing_op_t *op,
1345 const st_query_result_t *chosen_peer)
1346 {
1347 int total_timeout = 0;
1348 stonith_topology_t *tp = find_topology_for_host(op->target);
1349
1350 if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
1351 int i;
1352 GListPtr device_list = NULL;
1353 GListPtr iter = NULL;
1354
1355
1356
1357
1358
1359
1360
1361
1362 for (i = 0; i < ST_LEVEL_MAX; i++) {
1363 if (!tp->levels[i]) {
1364 continue;
1365 }
1366 for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1367 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1368 const st_query_result_t *peer = iter->data;
1369
1370 if (find_peer_device(op, peer, device_list->data)) {
1371 total_timeout += get_device_timeout(op, peer,
1372 device_list->data);
1373 break;
1374 }
1375 }
1376 }
1377 }
1378
1379 } else if (chosen_peer) {
1380 total_timeout = get_peer_timeout(op, chosen_peer);
1381 } else {
1382 total_timeout = op->base_timeout;
1383 }
1384
1385 return total_timeout ? total_timeout : op->base_timeout;
1386 }
1387
1388 static void
1389 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
1390 {
1391 GListPtr iter = NULL;
1392 xmlNode *update = NULL;
1393 const char *client_node = NULL;
1394 const char *client_id = NULL;
1395 const char *call_id = NULL;
1396
1397 if (op->call_options & st_opt_sync_call) {
1398
1399
1400
1401
1402 return;
1403 } else if (!op->request) {
1404 return;
1405 }
1406
1407 crm_trace("Reporting timeout for %s.%.8s", op->client_name, op->id);
1408 client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE);
1409 call_id = crm_element_value(op->request, F_STONITH_CALLID);
1410 client_id = crm_element_value(op->request, F_STONITH_CLIENTID);
1411 if (!client_node || !call_id || !client_id) {
1412 return;
1413 }
1414
1415 if (pcmk__str_eq(client_node, stonith_our_uname, pcmk__str_casei)) {
1416
1417 do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1418 return;
1419 }
1420
1421
1422 update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1423 crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id);
1424 crm_xml_add(update, F_STONITH_CLIENTID, client_id);
1425 crm_xml_add(update, F_STONITH_CALLID, call_id);
1426 crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout);
1427
1428 send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE);
1429
1430 free_xml(update);
1431
1432 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1433 remote_fencing_op_t *dup = iter->data;
1434
1435 crm_trace("Reporting timeout for duplicate %s.%.8s", dup->client_name, dup->id);
1436 report_timeout_period(iter->data, op_timeout);
1437 }
1438 }
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449 static void
1450 advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
1451 xmlNode *msg, int rc)
1452 {
1453
1454 if (op->devices) {
1455 op->devices = op->devices->next;
1456 }
1457
1458
1459 if ((op->phase == st_phase_requested) && pcmk__str_eq(op->action, "on", pcmk__str_casei)) {
1460
1461 remove_required_device(op, device);
1462
1463
1464
1465
1466 if (op->devices == NULL) {
1467 op->devices = op->automatic_list;
1468 }
1469 }
1470
1471 if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1472
1473
1474
1475
1476 op_phase_on(op);
1477 }
1478
1479 if (op->devices) {
1480
1481 crm_trace("Next targeting %s on behalf of %s@%s (rc was %d)",
1482 op->target, op->originator, op->client_name, rc);
1483
1484
1485 if (op->delay > 0) {
1486 op->delay = 0;
1487 }
1488
1489 call_remote_stonith(op, NULL, pcmk_ok);
1490 } else {
1491
1492 crm_trace("Marking complex fencing op targeting %s as complete",
1493 op->target);
1494 op->state = st_done;
1495 remote_op_done(op, msg, rc, FALSE);
1496 }
1497 }
1498
1499 void
1500 call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc)
1501 {
1502 const char *device = NULL;
1503 int timeout = op->base_timeout;
1504
1505 crm_trace("State for %s.%.8s: %s %d", op->target, op->client_name, op->id, op->state);
1506 if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
1507 peer = stonith_choose_peer(op);
1508 }
1509
1510 if (!op->op_timer_total) {
1511 int total_timeout = get_op_total_timeout(op, peer);
1512
1513 op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * total_timeout;
1514 op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
1515 report_timeout_period(op, op->total_timeout);
1516 crm_info("Total timeout set to %d for peer's fencing targeting %s for %s"
1517 CRM_XS "id=%s",
1518 total_timeout, op->target, op->client_name, op->id);
1519 }
1520
1521 if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
1522
1523
1524
1525 peer = stonith_choose_peer(op);
1526
1527 device = op->devices->data;
1528 timeout = get_device_timeout(op, peer, device);
1529 }
1530
1531 if (peer) {
1532 int timeout_one = 0;
1533 xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
1534
1535 crm_xml_add(remote_op, F_STONITH_REMOTE_OP_ID, op->id);
1536 crm_xml_add(remote_op, F_STONITH_TARGET, op->target);
1537 crm_xml_add(remote_op, F_STONITH_ACTION, op->action);
1538 crm_xml_add(remote_op, F_STONITH_ORIGIN, op->originator);
1539 crm_xml_add(remote_op, F_STONITH_CLIENTID, op->client_id);
1540 crm_xml_add(remote_op, F_STONITH_CLIENTNAME, op->client_name);
1541 crm_xml_add_int(remote_op, F_STONITH_TIMEOUT, timeout);
1542 crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
1543 crm_xml_add_int(remote_op, F_STONITH_DELAY, op->delay);
1544
1545 if (device) {
1546 timeout_one = TIMEOUT_MULTIPLY_FACTOR *
1547 get_device_timeout(op, peer, device);
1548 crm_notice("Requesting that %s perform '%s' action targeting %s "
1549 "using '%s' " CRM_XS " for client %s (%ds)",
1550 peer->host, op->action, op->target, device,
1551 op->client_name, timeout_one);
1552 crm_xml_add(remote_op, F_STONITH_DEVICE, device);
1553 crm_xml_add(remote_op, F_STONITH_MODE, "slave");
1554
1555 } else {
1556 timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
1557 crm_notice("Requesting that %s perform '%s' action targeting %s "
1558 CRM_XS " for client %s (%ds, %lds)",
1559 peer->host, op->action, op->target, op->client_name,
1560 timeout_one, stonith_watchdog_timeout_ms);
1561 crm_xml_add(remote_op, F_STONITH_MODE, "smart");
1562 }
1563
1564 op->state = st_exec;
1565 if (op->op_timer_one) {
1566 g_source_remove(op->op_timer_one);
1567 }
1568
1569 if(stonith_watchdog_timeout_ms > 0 && device && pcmk__str_eq(device, "watchdog", pcmk__str_casei)) {
1570 crm_notice("Waiting %lds for %s to self-fence (%s) for client %s.%.8s",
1571 stonith_watchdog_timeout_ms/1000, op->target, op->action,
1572 op->client_name, op->id);
1573 op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
1574
1575
1576 } else if(stonith_watchdog_timeout_ms > 0
1577 && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)
1578 && !pcmk__str_eq(op->action, "on", pcmk__str_casei)) {
1579 crm_notice("Waiting %lds for %s to self-fence (%s) for client %s.%.8s",
1580 stonith_watchdog_timeout_ms/1000, op->target, op->action,
1581 op->client_name, op->id);
1582 op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
1583
1584 } else {
1585 op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
1586 }
1587
1588
1589 send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE);
1590 peer->tried = TRUE;
1591 free_xml(remote_op);
1592 return;
1593
1594 } else if (op->phase == st_phase_on) {
1595
1596
1597
1598 crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
1599 "after successful 'off'", device, op->target);
1600 advance_topology_device_in_level(op, device, NULL, pcmk_ok);
1601 return;
1602
1603 } else if (op->owner == FALSE) {
1604 crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
1605 op->action, op->target, op->client_name);
1606
1607 } else if (op->query_timer == 0) {
1608
1609 crm_info("No remaining peers capable of fencing (%s) %s for client %s "
1610 CRM_XS " state=%d",
1611 op->action, op->target, op->client_name, op->state);
1612 CRM_LOG_ASSERT(op->state < st_done);
1613 remote_op_timeout(op);
1614
1615 } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
1616
1617
1618
1619
1620
1621
1622 if(stonith_watchdog_timeout_ms && pcmk__str_eq(device, "watchdog", pcmk__str_null_matches | pcmk__str_casei)) {
1623 crm_notice("Waiting %lds for %s to self-fence (%s) for client %s.%.8s",
1624 stonith_watchdog_timeout_ms/1000, op->target,
1625 op->action, op->client_name, op->id);
1626
1627 op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
1628 return;
1629 }
1630
1631 if (op->state == st_query) {
1632 crm_info("No peers (out of %d) have devices capable of fencing "
1633 "(%s) %s for client %s " CRM_XS " state=%d",
1634 op->replies, op->action, op->target, op->client_name,
1635 op->state);
1636
1637 rc = -ENODEV;
1638 } else {
1639 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1640 rc = -EHOSTUNREACH;
1641 }
1642
1643 crm_info("No peers (out of %d) are capable of fencing (%s) %s "
1644 "for client %s " CRM_XS " state=%d",
1645 op->replies, op->action, op->target, op->client_name,
1646 op->state);
1647 }
1648
1649 op->state = st_failed;
1650 remote_op_done(op, NULL, rc, FALSE);
1651
1652 } else {
1653 crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
1654 "for client %s%.8s",
1655 op->action, op->target, (device? " with " : ""),
1656 (device? device : ""), op->client_name, op->id);
1657 }
1658 }
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671 static gint
1672 sort_peers(gconstpointer a, gconstpointer b)
1673 {
1674 const st_query_result_t *peer_a = a;
1675 const st_query_result_t *peer_b = b;
1676
1677 return (peer_b->ndevices - peer_a->ndevices);
1678 }
1679
1680
1681
1682
1683
1684 static gboolean
1685 all_topology_devices_found(remote_fencing_op_t * op)
1686 {
1687 GListPtr device = NULL;
1688 GListPtr iter = NULL;
1689 device_properties_t *match = NULL;
1690 stonith_topology_t *tp = NULL;
1691 gboolean skip_target = FALSE;
1692 int i;
1693
1694 tp = find_topology_for_host(op->target);
1695 if (!tp) {
1696 return FALSE;
1697 }
1698 if (pcmk__strcase_any_of(op->action, "off", "reboot", NULL)) {
1699
1700
1701 skip_target = TRUE;
1702 }
1703
1704 for (i = 0; i < ST_LEVEL_MAX; i++) {
1705 for (device = tp->levels[i]; device; device = device->next) {
1706 match = NULL;
1707 for (iter = op->query_results; iter && !match; iter = iter->next) {
1708 st_query_result_t *peer = iter->data;
1709
1710 if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1711 continue;
1712 }
1713 match = find_peer_device(op, peer, device->data);
1714 }
1715 if (!match) {
1716 return FALSE;
1717 }
1718 }
1719 }
1720
1721 return TRUE;
1722 }
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735 static void
1736 parse_action_specific(xmlNode *xml, const char *peer, const char *device,
1737 const char *action, remote_fencing_op_t *op,
1738 enum st_remap_phase phase, device_properties_t *props)
1739 {
1740 props->custom_action_timeout[phase] = 0;
1741 crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
1742 &props->custom_action_timeout[phase]);
1743 if (props->custom_action_timeout[phase]) {
1744 crm_trace("Peer %s with device %s returned %s action timeout %d",
1745 peer, device, action, props->custom_action_timeout[phase]);
1746 }
1747
1748 props->delay_max[phase] = 0;
1749 crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
1750 if (props->delay_max[phase]) {
1751 crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
1752 peer, device, props->delay_max[phase], action);
1753 }
1754
1755 props->delay_base[phase] = 0;
1756 crm_element_value_int(xml, F_STONITH_DELAY_BASE, &props->delay_base[phase]);
1757 if (props->delay_base[phase]) {
1758 crm_trace("Peer %s with device %s returned base delay %d for %s",
1759 peer, device, props->delay_base[phase], action);
1760 }
1761
1762
1763 if (pcmk__str_eq(action, "on", pcmk__str_casei)) {
1764 int required = 0;
1765
1766 crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
1767 if (required) {
1768 crm_trace("Peer %s requires device %s to execute for action %s",
1769 peer, device, action);
1770 add_required_device(op, device);
1771 }
1772 }
1773
1774
1775
1776
1777 if (crm_is_true(crm_element_value(xml, F_STONITH_ACTION_DISALLOWED))) {
1778 props->disallowed[phase] = TRUE;
1779 crm_trace("Peer %s is disallowed from executing %s for device %s",
1780 peer, action, device);
1781 }
1782 }
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793 static void
1794 add_device_properties(xmlNode *xml, remote_fencing_op_t *op,
1795 st_query_result_t *result, const char *device)
1796 {
1797 xmlNode *child;
1798 int verified = 0;
1799 device_properties_t *props = calloc(1, sizeof(device_properties_t));
1800
1801
1802 CRM_ASSERT(props != NULL);
1803 g_hash_table_insert(result->devices, strdup(device), props);
1804
1805
1806 crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
1807 if (verified) {
1808 crm_trace("Peer %s has confirmed a verified device %s",
1809 result->host, device);
1810 props->verified = TRUE;
1811 }
1812
1813
1814 parse_action_specific(xml, result->host, device, op_requested_action(op),
1815 op, st_phase_requested, props);
1816 for (child = pcmk__xml_first_child(xml); child != NULL;
1817 child = pcmk__xml_next(child)) {
1818
1819
1820
1821
1822 if (pcmk__str_eq(ID(child), "off", pcmk__str_casei)) {
1823 parse_action_specific(child, result->host, device, "off",
1824 op, st_phase_off, props);
1825 } else if (pcmk__str_eq(ID(child), "on", pcmk__str_casei)) {
1826 parse_action_specific(child, result->host, device, "on",
1827 op, st_phase_on, props);
1828 }
1829 }
1830 }
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843 static st_query_result_t *
1844 add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml)
1845 {
1846 st_query_result_t *result = calloc(1, sizeof(st_query_result_t));
1847 xmlNode *child;
1848
1849
1850
1851 CRM_CHECK(result != NULL, return NULL);
1852 result->host = strdup(host);
1853 result->devices = crm_str_table_new();
1854
1855
1856 for (child = pcmk__xml_first_child(xml); child != NULL;
1857 child = pcmk__xml_next(child)) {
1858 const char *device = ID(child);
1859
1860 if (device) {
1861 add_device_properties(child, op, result, device);
1862 }
1863 }
1864
1865 result->ndevices = g_hash_table_size(result->devices);
1866 CRM_CHECK(ndevices == result->ndevices,
1867 crm_err("Query claimed to have %d devices but %d found",
1868 ndevices, result->ndevices));
1869
1870 op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers);
1871 return result;
1872 }
1873
1874
1875
1876
1877
1878
1879
1880
1881
1882
1883
1884
1885
1886
1887
1888 int
1889 process_remote_stonith_query(xmlNode * msg)
1890 {
1891 int ndevices = 0;
1892 gboolean host_is_target = FALSE;
1893 gboolean have_all_replies = FALSE;
1894 const char *id = NULL;
1895 const char *host = NULL;
1896 remote_fencing_op_t *op = NULL;
1897 st_query_result_t *result = NULL;
1898 uint32_t replies_expected;
1899 xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
1900
1901 CRM_CHECK(dev != NULL, return -EPROTO);
1902
1903 id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
1904 CRM_CHECK(id != NULL, return -EPROTO);
1905
1906 dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
1907 CRM_CHECK(dev != NULL, return -EPROTO);
1908 crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);
1909
1910 op = g_hash_table_lookup(stonith_remote_op_list, id);
1911 if (op == NULL) {
1912 crm_debug("Received query reply for unknown or expired operation %s",
1913 id);
1914 return -EOPNOTSUPP;
1915 }
1916
1917 replies_expected = fencing_active_peers();
1918 if (op->replies_expected < replies_expected) {
1919 replies_expected = op->replies_expected;
1920 }
1921 if ((++op->replies >= replies_expected) && (op->state == st_query)) {
1922 have_all_replies = TRUE;
1923 }
1924 host = crm_element_value(msg, F_ORIG);
1925 host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);
1926
1927 crm_info("Query result %d of %d from %s for %s/%s (%d devices) %s",
1928 op->replies, replies_expected, host,
1929 op->target, op->action, ndevices, id);
1930 if (ndevices > 0) {
1931 result = add_result(op, host, ndevices, dev);
1932 }
1933
1934 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1935
1936
1937
1938 if (op->state == st_query && all_topology_devices_found(op)) {
1939
1940 crm_trace("All topology devices found");
1941 call_remote_stonith(op, result, pcmk_ok);
1942
1943 } else if (have_all_replies) {
1944 crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
1945 replies_expected, op->replies);
1946 call_remote_stonith(op, NULL, pcmk_ok);
1947 }
1948
1949 } else if (op->state == st_query) {
1950 int nverified = count_peer_devices(op, result, TRUE);
1951
1952
1953
1954 if (result && (host_is_target == FALSE) && nverified) {
1955
1956 crm_trace("Found %d verified devices", nverified);
1957 call_remote_stonith(op, result, pcmk_ok);
1958
1959 } else if (have_all_replies) {
1960 crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
1961 replies_expected, op->replies);
1962 call_remote_stonith(op, NULL, pcmk_ok);
1963
1964 } else {
1965 crm_trace("Waiting for more peer results before launching fencing operation");
1966 }
1967
1968 } else if (result && (op->state == st_done)) {
1969 crm_info("Discarding query result from %s (%d devices): Operation is in state %d",
1970 result->host, result->ndevices, op->state);
1971 }
1972
1973 return pcmk_ok;
1974 }
1975
1976
1977
1978
1979
1980
1981
1982
1983
1984
1985
1986
1987 int
1988 process_remote_stonith_exec(xmlNode * msg)
1989 {
1990 int rc = 0;
1991 const char *id = NULL;
1992 const char *device = NULL;
1993 remote_fencing_op_t *op = NULL;
1994 xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
1995
1996 CRM_CHECK(dev != NULL, return -EPROTO);
1997
1998 id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
1999 CRM_CHECK(id != NULL, return -EPROTO);
2000
2001 dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR);
2002 CRM_CHECK(dev != NULL, return -EPROTO);
2003
2004 crm_element_value_int(dev, F_STONITH_RC, &rc);
2005
2006 device = crm_element_value(dev, F_STONITH_DEVICE);
2007
2008 if (stonith_remote_op_list) {
2009 op = g_hash_table_lookup(stonith_remote_op_list, id);
2010 }
2011
2012 if (op == NULL && rc == pcmk_ok) {
2013
2014 const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);
2015
2016 op = create_remote_stonith_op(client_id, dev, TRUE);
2017 }
2018
2019 if (op == NULL) {
2020
2021
2022 crm_info("Received peer result of unknown or expired operation %s", id);
2023 return -EOPNOTSUPP;
2024 }
2025
2026 if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
2027 crm_err("Received outdated reply for device %s (instead of %s) to "
2028 "fence (%s) %s. Operation already timed out at peer level.",
2029 device, (const char *) op->devices->data, op->action, op->target);
2030 return rc;
2031 }
2032
2033 if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) {
2034 crm_debug("Marking call to %s for %s on behalf of %s@%s.%.8s: %s (%d)",
2035 op->action, op->target, op->client_name, op->id, op->originator,
2036 pcmk_strerror(rc), rc);
2037 if (rc == pcmk_ok) {
2038 op->state = st_done;
2039 } else {
2040 op->state = st_failed;
2041 }
2042 remote_op_done(op, msg, rc, FALSE);
2043 return pcmk_ok;
2044 } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
2045
2046
2047 crm_err
2048 ("%s received non-broadcast fencing result for operation it does not own (device %s targeting %s)",
2049 stonith_our_uname, device, op->target);
2050 return rc;
2051 }
2052
2053 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2054 const char *device = crm_element_value(msg, F_STONITH_DEVICE);
2055
2056 crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s "
2057 CRM_XS " rc=%d",
2058 op->action, op->target, device, op->client_name,
2059 op->originator, pcmk_strerror(rc), rc);
2060
2061
2062
2063 if (op->state == st_done) {
2064 remote_op_done(op, msg, rc, FALSE);
2065 return rc;
2066 }
2067
2068 if ((op->phase == 2) && (rc != pcmk_ok)) {
2069
2070
2071
2072 crm_warn("Ignoring %s 'on' failure (exit code %d) targeting %s "
2073 "after successful 'off'", device, rc, op->target);
2074 rc = pcmk_ok;
2075 }
2076
2077 if (rc == pcmk_ok) {
2078
2079
2080 advance_topology_device_in_level(op, device, msg, rc);
2081 return rc;
2082 } else {
2083
2084
2085 if (advance_topology_level(op, false) != pcmk_rc_ok) {
2086 op->state = st_failed;
2087 remote_op_done(op, msg, rc, FALSE);
2088 return rc;
2089 }
2090 }
2091 } else if (rc == pcmk_ok && op->devices == NULL) {
2092 crm_trace("All done for %s", op->target);
2093
2094 op->state = st_done;
2095 remote_op_done(op, msg, rc, FALSE);
2096 return rc;
2097 } else if (rc == -ETIME && op->devices == NULL) {
2098
2099 op->state = st_failed;
2100 remote_op_done(op, msg, rc, FALSE);
2101 return rc;
2102 } else {
2103
2104 }
2105
2106
2107 crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator,
2108 op->client_name, rc);
2109 call_remote_stonith(op, NULL, rc);
2110 return rc;
2111 }
2112
2113 gboolean
2114 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
2115 {
2116 GHashTableIter iter;
2117 time_t now = time(NULL);
2118 remote_fencing_op_t *rop = NULL;
2119
2120 crm_trace("tolerance=%d, stonith_remote_op_list=%p", tolerance,
2121 stonith_remote_op_list);
2122
2123 if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
2124 action == NULL) {
2125 return FALSE;
2126 }
2127
2128 g_hash_table_iter_init(&iter, stonith_remote_op_list);
2129 while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2130 if (strcmp(rop->target, target) != 0) {
2131 continue;
2132 } else if (rop->state != st_done) {
2133 continue;
2134
2135
2136
2137 } else if (strcmp(rop->action, action) != 0) {
2138 continue;
2139 } else if ((rop->completed + tolerance) < now) {
2140 continue;
2141 }
2142
2143 crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2144 target, action, tolerance, rop->delegate, rop->originator);
2145 return TRUE;
2146 }
2147 return FALSE;
2148 }