This source file includes the following definitions.
- sort_strings
- free_remote_query
- free_stonith_remote_op_list
- count_peer_device
- count_peer_devices
- find_peer_device
- grab_peer_device
- clear_remote_op_timers
- free_remote_op
- init_stonith_remote_op_hash_table
- op_requested_action
- op_phase_off
- op_phase_on
- undo_op_remap
- create_op_done_notify
- stonith_bcast_result_to_peers
- handle_local_reply_and_notify
- handle_duplicates
- delegate_from_xml
- remote_op_done
- remote_op_watchdog_done
- remote_op_timeout_one
- remote_op_timeout
- remote_op_query_timeout
- topology_is_empty
- add_required_device
- remove_required_device
- set_op_device_list
- topology_matches
- find_topology_for_host
- advance_topology_level
- merge_duplicates
- fencing_active_peers
- stonith_manual_ack
- create_remote_stonith_op
- initiate_remote_stonith_op
- find_best_peer
- stonith_choose_peer
- get_device_timeout
- add_device_timeout
- get_peer_timeout
- get_op_total_timeout
- report_timeout_period
- advance_topology_device_in_level
- check_watchdog_fencing_and_wait
- call_remote_stonith
- sort_peers
- all_topology_devices_found
- parse_action_specific
- add_device_properties
- add_result
- process_remote_stonith_query
- process_remote_stonith_exec
- stonith_check_fence_tolerance
1
2
3
4
5
6
7
8
9
10 #include <crm_internal.h>
11
12 #include <sys/param.h>
13 #include <stdio.h>
14 #include <sys/types.h>
15 #include <sys/wait.h>
16 #include <sys/stat.h>
17 #include <unistd.h>
18 #include <sys/utsname.h>
19
20 #include <stdlib.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <ctype.h>
24 #include <regex.h>
25
26 #include <crm/crm.h>
27 #include <crm/msg_xml.h>
28 #include <crm/common/ipc.h>
29 #include <crm/common/ipc_internal.h>
30 #include <crm/cluster/internal.h>
31
32 #include <crm/stonith-ng.h>
33 #include <crm/fencing/internal.h>
34 #include <crm/common/xml.h>
35 #include <crm/common/xml_internal.h>
36
37 #include <crm/common/util.h>
38 #include <pacemaker-fenced.h>
39
40 #define TIMEOUT_MULTIPLY_FACTOR 1.2
41
42
43
44
45
46
47
48 typedef struct device_properties_s {
49
50 gboolean verified;
51
52
53
54
55 gboolean executed[st_phase_max];
56
57 gboolean disallowed[st_phase_max];
58
59 int custom_action_timeout[st_phase_max];
60
61 int delay_max[st_phase_max];
62
63 int delay_base[st_phase_max];
64 } device_properties_t;
65
66 typedef struct st_query_result_s {
67
68 char *host;
69
70 gboolean tried;
71
72 int ndevices;
73
74 GHashTable *devices;
75 } st_query_result_t;
76
77 GHashTable *stonith_remote_op_list = NULL;
78
79 void call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc);
80 static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup);
81 extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
82 int call_options);
83
84 static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
85 static int get_op_total_timeout(const remote_fencing_op_t *op,
86 const st_query_result_t *chosen_peer);
87
88 static gint
89 sort_strings(gconstpointer a, gconstpointer b)
90 {
91 return strcmp(a, b);
92 }
93
94 static void
95 free_remote_query(gpointer data)
96 {
97 if (data) {
98 st_query_result_t *query = data;
99
100 crm_trace("Free'ing query result from %s", query->host);
101 g_hash_table_destroy(query->devices);
102 free(query->host);
103 free(query);
104 }
105 }
106
107 void
108 free_stonith_remote_op_list()
109 {
110 if (stonith_remote_op_list != NULL) {
111 g_hash_table_destroy(stonith_remote_op_list);
112 stonith_remote_op_list = NULL;
113 }
114 }
115
116 struct peer_count_data {
117 const remote_fencing_op_t *op;
118 gboolean verified_only;
119 int count;
120 };
121
122
123
124
125
126
127
128
129
130 static void
131 count_peer_device(gpointer key, gpointer value, gpointer user_data)
132 {
133 device_properties_t *props = (device_properties_t*)value;
134 struct peer_count_data *data = user_data;
135
136 if (!props->executed[data->op->phase]
137 && (!data->verified_only || props->verified)) {
138 ++(data->count);
139 }
140 }
141
142
143
144
145
146
147
148
149
150
151
152 static int
153 count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer,
154 gboolean verified_only)
155 {
156 struct peer_count_data data;
157
158 data.op = op;
159 data.verified_only = verified_only;
160 data.count = 0;
161 if (peer) {
162 g_hash_table_foreach(peer->devices, count_peer_device, &data);
163 }
164 return data.count;
165 }
166
167
168
169
170
171
172
173
174
175
176
177 static device_properties_t *
178 find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer,
179 const char *device)
180 {
181 device_properties_t *props = g_hash_table_lookup(peer->devices, device);
182
183 return (props && !props->executed[op->phase]
184 && !props->disallowed[op->phase])? props : NULL;
185 }
186
187
188
189
190
191
192
193
194
195
196
197
198 static gboolean
199 grab_peer_device(const remote_fencing_op_t *op, st_query_result_t *peer,
200 const char *device, gboolean verified_devices_only)
201 {
202 device_properties_t *props = find_peer_device(op, peer, device);
203
204 if ((props == NULL) || (verified_devices_only && !props->verified)) {
205 return FALSE;
206 }
207
208 crm_trace("Removing %s from %s (%d remaining)",
209 device, peer->host, count_peer_devices(op, peer, FALSE));
210 props->executed[op->phase] = TRUE;
211 return TRUE;
212 }
213
214 static void
215 clear_remote_op_timers(remote_fencing_op_t * op)
216 {
217 if (op->query_timer) {
218 g_source_remove(op->query_timer);
219 op->query_timer = 0;
220 }
221 if (op->op_timer_total) {
222 g_source_remove(op->op_timer_total);
223 op->op_timer_total = 0;
224 }
225 if (op->op_timer_one) {
226 g_source_remove(op->op_timer_one);
227 op->op_timer_one = 0;
228 }
229 }
230
231 static void
232 free_remote_op(gpointer data)
233 {
234 remote_fencing_op_t *op = data;
235
236 crm_log_xml_debug(op->request, "Destroying");
237
238 clear_remote_op_timers(op);
239
240 free(op->id);
241 free(op->action);
242 free(op->delegate);
243 free(op->target);
244 free(op->client_id);
245 free(op->client_name);
246 free(op->originator);
247
248 if (op->query_results) {
249 g_list_free_full(op->query_results, free_remote_query);
250 }
251 if (op->request) {
252 free_xml(op->request);
253 op->request = NULL;
254 }
255 if (op->devices_list) {
256 g_list_free_full(op->devices_list, free);
257 op->devices_list = NULL;
258 }
259 g_list_free_full(op->automatic_list, free);
260 g_list_free(op->duplicates);
261 free(op);
262 }
263
264 void
265 init_stonith_remote_op_hash_table(GHashTable **table)
266 {
267 if (*table == NULL) {
268 *table = pcmk__strkey_table(NULL, free_remote_op);
269 }
270 }
271
272
273
274
275
276
277
278
279
280 static const char *
281 op_requested_action(const remote_fencing_op_t *op)
282 {
283 return ((op->phase > st_phase_requested)? "reboot" : op->action);
284 }
285
286
287
288
289
290
291
292 static void
293 op_phase_off(remote_fencing_op_t *op)
294 {
295 crm_info("Remapping multiple-device reboot targeting %s to 'off' "
296 CRM_XS " id=%.8s", op->target, op->id);
297 op->phase = st_phase_off;
298
299
300
301
302 strcpy(op->action, "off");
303 }
304
305
306
307
308
309
310
311 static void
312 op_phase_on(remote_fencing_op_t *op)
313 {
314 GList *iter = NULL;
315
316 crm_info("Remapped 'off' targeting %s complete, "
317 "remapping to 'on' for %s " CRM_XS " id=%.8s",
318 op->target, op->client_name, op->id);
319 op->phase = st_phase_on;
320 strcpy(op->action, "on");
321
322
323
324
325 for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
326 GList *match = g_list_find_custom(op->devices_list, iter->data,
327 sort_strings);
328
329 if (match) {
330 op->devices_list = g_list_remove(op->devices_list, match->data);
331 }
332 }
333 g_list_free_full(op->automatic_list, free);
334 op->automatic_list = NULL;
335
336
337 op->devices = op->devices_list;
338 }
339
340
341
342
343
344
345
346 static void
347 undo_op_remap(remote_fencing_op_t *op)
348 {
349 if (op->phase > 0) {
350 crm_info("Undoing remap of reboot targeting %s for %s "
351 CRM_XS " id=%.8s", op->target, op->client_name, op->id);
352 op->phase = st_phase_requested;
353 strcpy(op->action, "reboot");
354 }
355 }
356
357 static xmlNode *
358 create_op_done_notify(remote_fencing_op_t * op, int rc)
359 {
360 xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
361
362 crm_xml_add_int(notify_data, "state", op->state);
363 crm_xml_add_int(notify_data, F_STONITH_RC, rc);
364 crm_xml_add(notify_data, F_STONITH_TARGET, op->target);
365 crm_xml_add(notify_data, F_STONITH_ACTION, op->action);
366 crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate);
367 crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id);
368 crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator);
369 crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id);
370 crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name);
371
372 return notify_data;
373 }
374
375 void
376 stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc, gboolean op_merged)
377 {
378 static int count = 0;
379 xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY);
380 xmlNode *notify_data = create_op_done_notify(op, rc);
381
382 count++;
383 crm_trace("Broadcasting result to peers");
384 crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY);
385 crm_xml_add(bcast, F_SUBTYPE, "broadcast");
386 crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY);
387 crm_xml_add_int(bcast, "count", count);
388
389 if (op_merged) {
390 crm_xml_add(bcast, F_STONITH_MERGED, "true");
391 }
392
393 add_message_xml(bcast, F_STONITH_CALLDATA, notify_data);
394 send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE);
395 free_xml(notify_data);
396 free_xml(bcast);
397
398 return;
399 }
400
401 static void
402 handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc)
403 {
404 xmlNode *notify_data = NULL;
405 xmlNode *reply = NULL;
406
407 if (op->notify_sent == TRUE) {
408
409 return;
410 }
411
412
413 notify_data = create_op_done_notify(op, rc);
414 crm_xml_add_int(data, "state", op->state);
415 crm_xml_add(data, F_STONITH_TARGET, op->target);
416 crm_xml_add(data, F_STONITH_OPERATION, op->action);
417
418 reply = stonith_construct_reply(op->request, NULL, data, rc);
419 crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate);
420
421
422 do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE);
423
424
425 do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data);
426 do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL);
427
428
429 op->notify_sent = TRUE;
430 free_xml(reply);
431 free_xml(notify_data);
432 }
433
434 static void
435 handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc)
436 {
437 GList *iter = NULL;
438
439 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
440 remote_fencing_op_t *other = iter->data;
441
442 if (other->state == st_duplicate) {
443 other->state = op->state;
444 crm_debug("Performing duplicate notification for %s@%s: %s "
445 CRM_XS " id=%.8s",
446 other->client_name, other->originator,
447 pcmk_strerror(rc), other->id);
448 remote_op_done(other, data, rc, TRUE);
449
450 } else {
451
452 crm_err("Skipping duplicate notification for %s@%s "
453 CRM_XS " state=%s id=%.8s",
454 other->client_name, other->originator,
455 stonith_op_state_str(other->state), other->id);
456 }
457 }
458 }
459
460 static char *
461 delegate_from_xml(xmlNode *xml)
462 {
463 xmlNode *match = get_xpath_object("//@" F_STONITH_DELEGATE, xml, LOG_NEVER);
464
465 if (match == NULL) {
466 return crm_element_value_copy(xml, F_ORIG);
467 } else {
468 return crm_element_value_copy(match, F_STONITH_DELEGATE);
469 }
470 }
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497 static void
498 remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup)
499 {
500 int level = LOG_ERR;
501 const char *subt = NULL;
502 xmlNode *local_data = NULL;
503 gboolean op_merged = FALSE;
504
505 set_fencing_completed(op);
506 clear_remote_op_timers(op);
507 undo_op_remap(op);
508
509 if (op->notify_sent == TRUE) {
510 crm_err("Already sent notifications for '%s' targeting %s by %s for "
511 "client %s@%s: %s " CRM_XS " rc=%d state=%s id=%.8s",
512 op->action, op->target,
513 (op->delegate? op->delegate : "unknown node"),
514 op->client_name, op->originator, pcmk_strerror(rc),
515 rc, stonith_op_state_str(op->state), op->id);
516 goto remote_op_done_cleanup;
517 }
518
519 if (data == NULL) {
520 data = create_xml_node(NULL, "remote-op");
521 local_data = data;
522
523 } else if (op->delegate == NULL) {
524 switch (rc) {
525 case -ENODEV:
526 case -EHOSTUNREACH:
527 break;
528 default:
529 op->delegate = delegate_from_xml(data);
530 break;
531 }
532 }
533
534 if(dup) {
535 op_merged = TRUE;
536 } else if (crm_element_value(data, F_STONITH_MERGED)) {
537 op_merged = TRUE;
538 }
539
540
541
542
543 subt = crm_element_value(data, F_SUBTYPE);
544 if (dup == FALSE && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) {
545
546 stonith_bcast_result_to_peers(op, rc, (op_merged? TRUE: FALSE));
547 goto remote_op_done_cleanup;
548 }
549
550 if (rc == pcmk_ok || dup) {
551 level = LOG_NOTICE;
552 } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
553 level = LOG_NOTICE;
554 }
555
556 do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s "
557 CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""),
558 (op->target? op->target : ""),
559 (op->delegate? op->delegate : "unknown node"),
560 op->client_name, op->originator,
561 (op_merged? " (merged)" : ""), pcmk_strerror(rc), op->id);
562
563 handle_local_reply_and_notify(op, data, rc);
564
565 if (dup == FALSE) {
566 handle_duplicates(op, data, rc);
567 }
568
569
570
571
572 if (op->query_results) {
573 g_list_free_full(op->query_results, free_remote_query);
574 op->query_results = NULL;
575 }
576
577 if (op->request) {
578 free_xml(op->request);
579 op->request = NULL;
580 }
581
582 remote_op_done_cleanup:
583 free_xml(local_data);
584 }
585
586 static gboolean
587 remote_op_watchdog_done(gpointer userdata)
588 {
589 remote_fencing_op_t *op = userdata;
590
591 op->op_timer_one = 0;
592
593 crm_notice("Self-fencing (%s) by %s for %s assumed complete "
594 CRM_XS " id=%.8s",
595 op->action, op->target, op->client_name, op->id);
596 op->state = st_done;
597 remote_op_done(op, NULL, pcmk_ok, FALSE);
598 return FALSE;
599 }
600
601 static gboolean
602 remote_op_timeout_one(gpointer userdata)
603 {
604 remote_fencing_op_t *op = userdata;
605
606 op->op_timer_one = 0;
607
608 crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS
609 " id=%.8s", op->action, op->target, op->client_name, op->id);
610 call_remote_stonith(op, NULL, pcmk_ok);
611 return FALSE;
612 }
613
614 static gboolean
615 remote_op_timeout(gpointer userdata)
616 {
617 remote_fencing_op_t *op = userdata;
618
619 op->op_timer_total = 0;
620
621 if (op->state == st_done) {
622 crm_debug("Action '%s' targeting %s for client %s already completed "
623 CRM_XS " id=%.8s",
624 op->action, op->target, op->client_name, op->id);
625 return FALSE;
626 }
627
628 crm_debug("Action '%s' targeting %s for client %s timed out "
629 CRM_XS " id=%.8s",
630 op->action, op->target, op->client_name, op->id);
631
632 if (op->phase == st_phase_on) {
633
634
635
636
637 op->state = st_done;
638 remote_op_done(op, NULL, pcmk_ok, FALSE);
639 return FALSE;
640 }
641
642 op->state = st_failed;
643
644 remote_op_done(op, NULL, -ETIME, FALSE);
645
646 return FALSE;
647 }
648
649 static gboolean
650 remote_op_query_timeout(gpointer data)
651 {
652 remote_fencing_op_t *op = data;
653
654 op->query_timer = 0;
655 if (op->state == st_done) {
656 crm_debug("Operation %.8s targeting %s already completed",
657 op->id, op->target);
658 } else if (op->state == st_exec) {
659 crm_debug("Operation %.8s targeting %s already in progress",
660 op->id, op->target);
661 } else if (op->query_results) {
662 crm_debug("Query %.8s targeting %s complete (state=%s)",
663 op->id, op->target, stonith_op_state_str(op->state));
664 call_remote_stonith(op, NULL, pcmk_ok);
665 } else {
666 crm_debug("Query %.8s targeting %s timed out (state=%s)",
667 op->id, op->target, stonith_op_state_str(op->state));
668 if (op->op_timer_total) {
669 g_source_remove(op->op_timer_total);
670 op->op_timer_total = 0;
671 }
672 remote_op_timeout(op);
673 }
674
675 return FALSE;
676 }
677
678 static gboolean
679 topology_is_empty(stonith_topology_t *tp)
680 {
681 int i;
682
683 if (tp == NULL) {
684 return TRUE;
685 }
686
687 for (i = 0; i < ST_LEVEL_MAX; i++) {
688 if (tp->levels[i] != NULL) {
689 return FALSE;
690 }
691 }
692 return TRUE;
693 }
694
695
696
697
698
699
700
701
702 static void
703 add_required_device(remote_fencing_op_t *op, const char *device)
704 {
705 GList *match = g_list_find_custom(op->automatic_list, device,
706 sort_strings);
707
708 if (!match) {
709 op->automatic_list = g_list_prepend(op->automatic_list, strdup(device));
710 }
711 }
712
713
714
715
716
717
718
719
720 static void
721 remove_required_device(remote_fencing_op_t *op, const char *device)
722 {
723 GList *match = g_list_find_custom(op->automatic_list, device,
724 sort_strings);
725
726 if (match) {
727 op->automatic_list = g_list_remove(op->automatic_list, match->data);
728 }
729 }
730
731
732 static void
733 set_op_device_list(remote_fencing_op_t * op, GList *devices)
734 {
735 GList *lpc = NULL;
736
737 if (op->devices_list) {
738 g_list_free_full(op->devices_list, free);
739 op->devices_list = NULL;
740 }
741 for (lpc = devices; lpc != NULL; lpc = lpc->next) {
742 op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
743 }
744 op->devices = op->devices_list;
745 }
746
747
748
749
750
751
752
753
754
755
756 static gboolean
757 topology_matches(const stonith_topology_t *tp, const char *node)
758 {
759 regex_t r_patt;
760
761 CRM_CHECK(node && tp && tp->target, return FALSE);
762 switch(tp->kind) {
763 case 2:
764
765
766
767
768
769
770 if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
771 crm_notice("Matched %s with %s by attribute", node, tp->target);
772 return TRUE;
773 }
774 break;
775 case 1:
776
777
778
779
780 if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
781 crm_info("Bad regex '%s' for fencing level", tp->target);
782 } else {
783 int status = regexec(&r_patt, node, 0, NULL, 0);
784
785 regfree(&r_patt);
786 if (status == 0) {
787 crm_notice("Matched %s with %s by name", node, tp->target);
788 return TRUE;
789 }
790 }
791 break;
792 case 0:
793 crm_trace("Testing %s against %s", node, tp->target);
794 return pcmk__str_eq(tp->target, node, pcmk__str_casei);
795 }
796 crm_trace("No match for %s with %s", node, tp->target);
797 return FALSE;
798 }
799
800 stonith_topology_t *
801 find_topology_for_host(const char *host)
802 {
803 GHashTableIter tIter;
804 stonith_topology_t *tp = g_hash_table_lookup(topology, host);
805
806 if(tp != NULL) {
807 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
808 return tp;
809 }
810
811 g_hash_table_iter_init(&tIter, topology);
812 while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
813 if (topology_matches(tp, host)) {
814 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
815 return tp;
816 }
817 }
818
819 crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
820 return NULL;
821 }
822
823
824
825
826
827
828
829
830
831
832
833
834 static int
835 advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
836 {
837 stonith_topology_t *tp = NULL;
838
839 if (op->target) {
840 tp = find_topology_for_host(op->target);
841 }
842 if (topology_is_empty(tp)) {
843 return empty_ok? pcmk_rc_ok : ENODEV;
844 }
845
846 CRM_ASSERT(tp->levels != NULL);
847
848 stonith__set_call_options(op->call_options, op->id, st_opt_topology);
849
850
851 undo_op_remap(op);
852
853 do {
854 op->level++;
855
856 } while (op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL);
857
858 if (op->level < ST_LEVEL_MAX) {
859 crm_trace("Attempting fencing level %d targeting %s (%d devices) "
860 "for client %s@%s (id=%.8s)",
861 op->level, op->target, g_list_length(tp->levels[op->level]),
862 op->client_name, op->originator, op->id);
863 set_op_device_list(op, tp->levels[op->level]);
864
865
866 if (op->level > 1 && op->delay > 0) {
867 op->delay = 0;
868 }
869
870 if (g_list_next(op->devices_list) && pcmk__str_eq(op->action, "reboot", pcmk__str_casei)) {
871
872
873
874
875
876 op_phase_off(op);
877 }
878 return pcmk_rc_ok;
879 }
880
881 crm_notice("All fencing options targeting %s for client %s@%s failed "
882 CRM_XS " id=%.8s",
883 op->target, op->client_name, op->originator, op->id);
884 return ENODEV;
885 }
886
887
888
889
890
891
892 static void
893 merge_duplicates(remote_fencing_op_t * op)
894 {
895 GHashTableIter iter;
896 remote_fencing_op_t *other = NULL;
897
898 time_t now = time(NULL);
899
900 g_hash_table_iter_init(&iter, stonith_remote_op_list);
901 while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
902 const char *other_action = op_requested_action(other);
903
904 if (!strcmp(op->id, other->id)) {
905 continue;
906 }
907 if (other->state > st_exec) {
908 crm_trace("%.8s not duplicate of %.8s: not in progress",
909 op->id, other->id);
910 continue;
911 }
912 if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
913 crm_trace("%.8s not duplicate of %.8s: node %s vs. %s",
914 op->id, other->id, op->target, other->target);
915 continue;
916 }
917 if (!pcmk__str_eq(op->action, other_action, pcmk__str_casei)) {
918 crm_trace("%.8s not duplicate of %.8s: action %s vs. %s",
919 op->id, other->id, op->action, other_action);
920 continue;
921 }
922 if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
923 crm_trace("%.8s not duplicate of %.8s: same client %s",
924 op->id, other->id, op->client_name);
925 continue;
926 }
927 if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
928 crm_trace("%.8s not duplicate of %.8s: suicide for %s",
929 op->id, other->id, other->target);
930 continue;
931 }
932 if (!fencing_peer_active(crm_get_peer(0, other->originator))) {
933 crm_notice("Failing action '%s' targeting %s originating from "
934 "client %s@%s: Originator is dead " CRM_XS " id=%.8s",
935 other->action, other->target, other->client_name,
936 other->originator, other->id);
937 crm_trace("%.8s not duplicate of %.8s: originator dead",
938 op->id, other->id);
939 other->state = st_failed;
940 continue;
941 }
942 if ((other->total_timeout > 0)
943 && (now > (other->total_timeout + other->created))) {
944 crm_trace("%.8s not duplicate of %.8s: old (%ld vs. %ld + %d)",
945 op->id, other->id, now, other->created,
946 other->total_timeout);
947 continue;
948 }
949
950
951
952
953 other->duplicates = g_list_append(other->duplicates, op);
954 if (other->total_timeout == 0) {
955 other->total_timeout = op->total_timeout =
956 TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
957 crm_trace("Best guess as to timeout used for %.8s: %d",
958 other->id, other->total_timeout);
959 }
960 crm_notice("Merging fencing action '%s' targeting %s originating from "
961 "client %s with identical request from %s@%s "
962 CRM_XS " original=%.8s duplicate=%.8s total_timeout=%ds",
963 op->action, op->target, op->client_name,
964 other->client_name, other->originator,
965 op->id, other->id, other->total_timeout);
966 report_timeout_period(op, other->total_timeout);
967 op->state = st_duplicate;
968 }
969 }
970
971 static uint32_t fencing_active_peers(void)
972 {
973 uint32_t count = 0;
974 crm_node_t *entry;
975 GHashTableIter gIter;
976
977 g_hash_table_iter_init(&gIter, crm_peer_cache);
978 while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
979 if(fencing_peer_active(entry)) {
980 count++;
981 }
982 }
983 return count;
984 }
985
986 int
987 stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op)
988 {
989 xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
990
991 op->state = st_done;
992 set_fencing_completed(op);
993 op->delegate = strdup("a human");
994
995 crm_notice("Injecting manual confirmation that %s is safely off/down",
996 crm_element_value(dev, F_STONITH_TARGET));
997
998 remote_op_done(op, msg, pcmk_ok, FALSE);
999
1000
1001 return -EINPROGRESS;
1002 }
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015 void *
1016 create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer)
1017 {
1018 remote_fencing_op_t *op = NULL;
1019 xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_NEVER);
1020 int call_options = 0;
1021 const char *operation = NULL;
1022
1023 init_stonith_remote_op_hash_table(&stonith_remote_op_list);
1024
1025
1026
1027 if (peer && dev) {
1028 const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
1029
1030 CRM_CHECK(op_id != NULL, return NULL);
1031
1032 op = g_hash_table_lookup(stonith_remote_op_list, op_id);
1033 if (op) {
1034 crm_debug("Reusing existing remote fencing op %.8s for %s",
1035 op_id, ((client == NULL)? "unknown client" : client));
1036 return op;
1037 }
1038 }
1039
1040 op = calloc(1, sizeof(remote_fencing_op_t));
1041 CRM_ASSERT(op != NULL);
1042
1043 crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout));
1044
1045 crm_element_value_int(request, F_STONITH_DELAY, &(op->delay));
1046
1047 if (peer && dev) {
1048 op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID);
1049 } else {
1050 op->id = crm_generate_uuid();
1051 }
1052
1053 g_hash_table_replace(stonith_remote_op_list, op->id, op);
1054
1055 op->state = st_query;
1056 op->replies_expected = fencing_active_peers();
1057 op->action = crm_element_value_copy(dev, F_STONITH_ACTION);
1058 op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN);
1059 op->delegate = crm_element_value_copy(dev, F_STONITH_DELEGATE);
1060 op->created = time(NULL);
1061
1062 if (op->originator == NULL) {
1063
1064 op->originator = strdup(stonith_our_uname);
1065 }
1066
1067 CRM_LOG_ASSERT(client != NULL);
1068 if (client) {
1069 op->client_id = strdup(client);
1070 }
1071
1072
1073
1074 operation = crm_element_value(request, F_STONITH_OPERATION);
1075
1076 if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1077 op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
1078 (unsigned long) getpid());
1079 } else {
1080 op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME);
1081 }
1082
1083 op->target = crm_element_value_copy(dev, F_STONITH_TARGET);
1084 op->request = copy_xml(request);
1085 crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
1086 op->call_options = call_options;
1087
1088 crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid));
1089
1090 crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
1091 "base timeout %d, %u %s expected)",
1092 (peer && dev)? "Recorded" : "Generated", op->id, op->action,
1093 op->target, op->client_name, op->base_timeout,
1094 op->replies_expected,
1095 pcmk__plural_alt(op->replies_expected, "reply", "replies"));
1096
1097 if (op->call_options & st_opt_cs_nodeid) {
1098 int nodeid;
1099 crm_node_t *node;
1100
1101 pcmk__scan_min_int(op->target, &nodeid, 0);
1102 node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY);
1103
1104
1105 stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);
1106
1107 if (node && node->uname) {
1108 free(op->target);
1109 op->target = strdup(node->uname);
1110
1111 } else {
1112 crm_warn("Could not expand nodeid '%s' into a host name", op->target);
1113 }
1114 }
1115
1116
1117 merge_duplicates(op);
1118
1119 if (op->state != st_duplicate) {
1120
1121 do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL);
1122 }
1123
1124
1125 stonith_fence_history_trim();
1126
1127 return op;
1128 }
1129
1130 remote_fencing_op_t *
1131 initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request,
1132 gboolean manual_ack)
1133 {
1134 int query_timeout = 0;
1135 xmlNode *query = NULL;
1136 const char *client_id = NULL;
1137 remote_fencing_op_t *op = NULL;
1138 const char *relay_op_id = NULL;
1139 const char *operation = NULL;
1140
1141 if (client) {
1142 client_id = client->id;
1143 } else {
1144 client_id = crm_element_value(request, F_STONITH_CLIENTID);
1145 }
1146
1147 CRM_LOG_ASSERT(client_id != NULL);
1148 op = create_remote_stonith_op(client_id, request, FALSE);
1149 op->owner = TRUE;
1150 if (manual_ack) {
1151 crm_notice("Processing manual confirmation of fencing targeting %s "
1152 CRM_XS " id=%.8s", op->target, op->id);
1153 return op;
1154 }
1155
1156 CRM_CHECK(op->action, return NULL);
1157
1158 if (advance_topology_level(op, true) != pcmk_rc_ok) {
1159 op->state = st_failed;
1160 }
1161
1162 switch (op->state) {
1163 case st_failed:
1164 crm_warn("Could not request peer fencing (%s) targeting %s "
1165 CRM_XS " id=%.8s", op->action, op->target, op->id);
1166 remote_op_done(op, NULL, -EINVAL, FALSE);
1167 return op;
1168
1169 case st_duplicate:
1170 crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
1171 CRM_XS " id=%.8s", op->action, op->target, op->id);
1172 return op;
1173
1174 default:
1175 crm_notice("Requesting peer fencing (%s) targeting %s "
1176 CRM_XS " id=%.8s state=%s base_timeout=%d",
1177 op->action, op->target, op->id,
1178 stonith_op_state_str(op->state), op->base_timeout);
1179 }
1180
1181 query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
1182 NULL, op->call_options);
1183
1184 crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
1185 crm_xml_add(query, F_STONITH_TARGET, op->target);
1186 crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
1187 crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
1188 crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
1189 crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
1190 crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);
1191
1192
1193 operation = crm_element_value(request, F_STONITH_OPERATION);
1194 if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
1195 relay_op_id = crm_element_value(request, F_STONITH_REMOTE_OP_ID);
1196 if (relay_op_id) {
1197 crm_xml_add(query, F_STONITH_REMOTE_OP_ID_RELAY, relay_op_id);
1198 }
1199 }
1200
1201 send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
1202 free_xml(query);
1203
1204 query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
1205 op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);
1206
1207 return op;
1208 }
1209
/* Bit flags that narrow the search performed by find_best_peer() */
enum find_best_peer_options {
    /* Pass over the peer whose host name equals the fencing target */
    FIND_PEER_SKIP_TARGET   = (1 << 0),

    /* Consider nothing but the peer whose host name equals the target */
    FIND_PEER_TARGET_ONLY   = (1 << 1),

    /* Count or claim only devices that are flagged as verified */
    FIND_PEER_VERIFIED_ONLY = (1 << 2),
};
1218
1219 static st_query_result_t *
1220 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
1221 {
1222 GList *iter = NULL;
1223 gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1224
1225 if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
1226 return NULL;
1227 }
1228
1229 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1230 st_query_result_t *peer = iter->data;
1231
1232 crm_trace("Testing result from %s targeting %s with %d device%s: %d %x",
1233 peer->host, op->target, peer->ndevices,
1234 pcmk__plural_s(peer->ndevices), peer->tried, options);
1235 if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1236 continue;
1237 }
1238 if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1239 continue;
1240 }
1241
1242 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1243
1244 if (grab_peer_device(op, peer, device, verified_devices_only)) {
1245 return peer;
1246 }
1247
1248 } else if ((peer->tried == FALSE)
1249 && count_peer_devices(op, peer, verified_devices_only)) {
1250
1251
1252 crm_trace("Simple fencing");
1253 return peer;
1254 }
1255 }
1256
1257 return NULL;
1258 }
1259
1260 static st_query_result_t *
1261 stonith_choose_peer(remote_fencing_op_t * op)
1262 {
1263 const char *device = NULL;
1264 st_query_result_t *peer = NULL;
1265 uint32_t active = fencing_active_peers();
1266
1267 do {
1268 if (op->devices) {
1269 device = op->devices->data;
1270 crm_trace("Checking for someone to fence (%s) %s using %s",
1271 op->action, op->target, device);
1272 } else {
1273 crm_trace("Checking for someone to fence (%s) %s",
1274 op->action, op->target);
1275 }
1276
1277
1278 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
1279 if (peer) {
1280 crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
1281 return peer;
1282 }
1283
1284 if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
1285 crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
1286 return NULL;
1287 }
1288
1289
1290 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
1291 if (peer) {
1292 crm_trace("Found best unverified peer %s", peer->host);
1293 return peer;
1294 }
1295
1296
1297
1298
1299 if (op->phase != st_phase_on) {
1300 peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
1301 if (peer) {
1302 crm_trace("%s will fence itself", peer->host);
1303 return peer;
1304 }
1305 }
1306
1307
1308
1309
1310 } while ((op->phase != st_phase_on)
1311 && pcmk_is_set(op->call_options, st_opt_topology)
1312 && (advance_topology_level(op, false) == pcmk_rc_ok));
1313
1314 crm_notice("Couldn't find anyone to fence (%s) %s using %s",
1315 op->action, op->target, (device? device : "any device"));
1316 return NULL;
1317 }
1318
1319 static int
1320 get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer,
1321 const char *device)
1322 {
1323 device_properties_t *props;
1324
1325 if (!peer || !device) {
1326 return op->base_timeout;
1327 }
1328
1329 props = g_hash_table_lookup(peer->devices, device);
1330 if (!props) {
1331 return op->base_timeout;
1332 }
1333
1334 return (props->custom_action_timeout[op->phase]?
1335 props->custom_action_timeout[op->phase] : op->base_timeout)
1336 + props->delay_max[op->phase];
1337 }
1338
1339 struct timeout_data {
1340 const remote_fencing_op_t *op;
1341 const st_query_result_t *peer;
1342 int total_timeout;
1343 };
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353 static void
1354 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
1355 {
1356 const char *device_id = key;
1357 device_properties_t *props = value;
1358 struct timeout_data *timeout = user_data;
1359
1360 if (!props->executed[timeout->op->phase]
1361 && !props->disallowed[timeout->op->phase]) {
1362 timeout->total_timeout += get_device_timeout(timeout->op,
1363 timeout->peer, device_id);
1364 }
1365 }
1366
1367 static int
1368 get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer)
1369 {
1370 struct timeout_data timeout;
1371
1372 timeout.op = op;
1373 timeout.peer = peer;
1374 timeout.total_timeout = 0;
1375
1376 g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1377
1378 return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1379 }
1380
1381 static int
1382 get_op_total_timeout(const remote_fencing_op_t *op,
1383 const st_query_result_t *chosen_peer)
1384 {
1385 int total_timeout = 0;
1386 stonith_topology_t *tp = find_topology_for_host(op->target);
1387
1388 if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
1389 int i;
1390 GList *device_list = NULL;
1391 GList *iter = NULL;
1392
1393
1394
1395
1396
1397
1398
1399
1400 for (i = 0; i < ST_LEVEL_MAX; i++) {
1401 if (!tp->levels[i]) {
1402 continue;
1403 }
1404 for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1405 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1406 const st_query_result_t *peer = iter->data;
1407
1408 if (find_peer_device(op, peer, device_list->data)) {
1409 total_timeout += get_device_timeout(op, peer,
1410 device_list->data);
1411 break;
1412 }
1413 }
1414 }
1415 }
1416
1417 } else if (chosen_peer) {
1418 total_timeout = get_peer_timeout(op, chosen_peer);
1419 } else {
1420 total_timeout = op->base_timeout;
1421 }
1422
1423 return total_timeout ? total_timeout : op->base_timeout;
1424 }
1425
1426 static void
1427 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
1428 {
1429 GList *iter = NULL;
1430 xmlNode *update = NULL;
1431 const char *client_node = NULL;
1432 const char *client_id = NULL;
1433 const char *call_id = NULL;
1434
1435 if (op->call_options & st_opt_sync_call) {
1436
1437
1438
1439
1440 return;
1441 } else if (!op->request) {
1442 return;
1443 }
1444
1445 crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
1446 client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE);
1447 call_id = crm_element_value(op->request, F_STONITH_CALLID);
1448 client_id = crm_element_value(op->request, F_STONITH_CLIENTID);
1449 if (!client_node || !call_id || !client_id) {
1450 return;
1451 }
1452
1453 if (pcmk__str_eq(client_node, stonith_our_uname, pcmk__str_casei)) {
1454
1455 do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1456 return;
1457 }
1458
1459
1460 update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1461 crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id);
1462 crm_xml_add(update, F_STONITH_CLIENTID, client_id);
1463 crm_xml_add(update, F_STONITH_CALLID, call_id);
1464 crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout);
1465
1466 send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE);
1467
1468 free_xml(update);
1469
1470 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1471 remote_fencing_op_t *dup = iter->data;
1472
1473 crm_trace("Reporting timeout for duplicate %.8s to client %s",
1474 dup->id, dup->client_name);
1475 report_timeout_period(iter->data, op_timeout);
1476 }
1477 }
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488 static void
1489 advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
1490 xmlNode *msg, int rc)
1491 {
1492
1493 if (op->devices) {
1494 op->devices = op->devices->next;
1495 }
1496
1497
1498 if ((op->phase == st_phase_requested) && pcmk__str_eq(op->action, "on", pcmk__str_casei)) {
1499
1500 remove_required_device(op, device);
1501
1502
1503
1504
1505 if (op->devices == NULL) {
1506 op->devices = op->automatic_list;
1507 }
1508 }
1509
1510 if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1511
1512
1513
1514
1515 op_phase_on(op);
1516 }
1517
1518 if (op->devices) {
1519
1520 crm_trace("Next targeting %s on behalf of %s@%s (rc was %d)",
1521 op->target, op->client_name, op->originator, rc);
1522
1523
1524 if (op->delay > 0) {
1525 op->delay = 0;
1526 }
1527
1528 call_remote_stonith(op, NULL, pcmk_ok);
1529 } else {
1530
1531 crm_trace("Marking complex fencing op targeting %s as complete",
1532 op->target);
1533 op->state = st_done;
1534 remote_op_done(op, msg, rc, FALSE);
1535 }
1536 }
1537
1538 static gboolean
1539 check_watchdog_fencing_and_wait(remote_fencing_op_t * op)
1540 {
1541 if (node_does_watchdog_fencing(op->target)) {
1542
1543 crm_notice("Waiting %lds for %s to self-fence (%s) for "
1544 "client %s " CRM_XS " id=%.8s",
1545 (stonith_watchdog_timeout_ms / 1000),
1546 op->target, op->action, op->client_name, op->id);
1547 op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms,
1548 remote_op_watchdog_done, op);
1549 return TRUE;
1550 } else {
1551 crm_debug("Skipping fallback to watchdog-fencing as %s is "
1552 "not in host-list", op->target);
1553 }
1554 return FALSE;
1555 }
1556
1557 void
1558 call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc)
1559 {
1560 const char *device = NULL;
1561 int timeout = op->base_timeout;
1562
1563 crm_trace("Action %.8s targeting %s for %s is %s",
1564 op->id, op->target, op->client_name,
1565 stonith_op_state_str(op->state));
1566 if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
1567 peer = stonith_choose_peer(op);
1568 }
1569
1570 if (!op->op_timer_total) {
1571 int total_timeout = get_op_total_timeout(op, peer);
1572
1573 op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * total_timeout;
1574 op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
1575 report_timeout_period(op, op->total_timeout);
1576 crm_info("Total timeout set to %d for peer's fencing targeting %s for %s"
1577 CRM_XS "id=%.8s",
1578 total_timeout, op->target, op->client_name, op->id);
1579 }
1580
1581 if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
1582
1583
1584
1585 peer = stonith_choose_peer(op);
1586
1587 device = op->devices->data;
1588 timeout = get_device_timeout(op, peer, device);
1589 }
1590
1591 if (peer) {
1592 int timeout_one = 0;
1593 xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
1594
1595 crm_xml_add(remote_op, F_STONITH_REMOTE_OP_ID, op->id);
1596 crm_xml_add(remote_op, F_STONITH_TARGET, op->target);
1597 crm_xml_add(remote_op, F_STONITH_ACTION, op->action);
1598 crm_xml_add(remote_op, F_STONITH_ORIGIN, op->originator);
1599 crm_xml_add(remote_op, F_STONITH_CLIENTID, op->client_id);
1600 crm_xml_add(remote_op, F_STONITH_CLIENTNAME, op->client_name);
1601 crm_xml_add_int(remote_op, F_STONITH_TIMEOUT, timeout);
1602 crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
1603 crm_xml_add_int(remote_op, F_STONITH_DELAY, op->delay);
1604
1605 if (device) {
1606 timeout_one = TIMEOUT_MULTIPLY_FACTOR *
1607 get_device_timeout(op, peer, device);
1608 crm_notice("Requesting that %s perform '%s' action targeting %s "
1609 "using %s " CRM_XS " for client %s (%ds)",
1610 peer->host, op->action, op->target, device,
1611 op->client_name, timeout_one);
1612 crm_xml_add(remote_op, F_STONITH_DEVICE, device);
1613
1614 } else {
1615 timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
1616 crm_notice("Requesting that %s perform '%s' action targeting %s "
1617 CRM_XS " for client %s (%ds, %lds)",
1618 peer->host, op->action, op->target, op->client_name,
1619 timeout_one, stonith_watchdog_timeout_ms);
1620 }
1621
1622 op->state = st_exec;
1623 if (op->op_timer_one) {
1624 g_source_remove(op->op_timer_one);
1625 }
1626
1627 if (!(stonith_watchdog_timeout_ms > 0 && (
1628 (pcmk__str_eq(device, STONITH_WATCHDOG_ID,
1629 pcmk__str_none)) ||
1630 (pcmk__str_eq(peer->host, op->target, pcmk__str_casei)
1631 && !pcmk__str_eq(op->action, "on", pcmk__str_casei))) &&
1632 check_watchdog_fencing_and_wait(op))) {
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651 op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
1652 }
1653
1654 send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE);
1655 peer->tried = TRUE;
1656 free_xml(remote_op);
1657 return;
1658
1659 } else if (op->phase == st_phase_on) {
1660
1661
1662
1663 crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
1664 "after successful 'off'", device, op->target);
1665 advance_topology_device_in_level(op, device, NULL, pcmk_ok);
1666 return;
1667
1668 } else if (op->owner == FALSE) {
1669 crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
1670 op->action, op->target, op->client_name);
1671
1672 } else if (op->query_timer == 0) {
1673
1674 crm_info("No remaining peers capable of fencing (%s) %s for client %s "
1675 CRM_XS " state=%s", op->action, op->target, op->client_name,
1676 stonith_op_state_str(op->state));
1677 CRM_LOG_ASSERT(op->state < st_done);
1678 remote_op_timeout(op);
1679
1680 } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
1681
1682
1683
1684
1685
1686
1687 if(stonith_watchdog_timeout_ms > 0 && pcmk__str_eq(device,
1688 STONITH_WATCHDOG_ID, pcmk__str_null_matches)) {
1689 if (check_watchdog_fencing_and_wait(op)) {
1690 return;
1691 }
1692 }
1693
1694 if (op->state == st_query) {
1695 crm_info("No peers (out of %d) have devices capable of fencing "
1696 "(%s) %s for client %s " CRM_XS " state=%s",
1697 op->replies, op->action, op->target, op->client_name,
1698 stonith_op_state_str(op->state));
1699
1700 rc = -ENODEV;
1701 } else {
1702 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1703 rc = -EHOSTUNREACH;
1704 }
1705
1706 crm_info("No peers (out of %d) are capable of fencing (%s) %s "
1707 "for client %s " CRM_XS " state=%s",
1708 op->replies, op->action, op->target, op->client_name,
1709 stonith_op_state_str(op->state));
1710 }
1711
1712 op->state = st_failed;
1713 remote_op_done(op, NULL, rc, FALSE);
1714
1715 } else {
1716 crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
1717 "for client %s " CRM_XS " id=%.8s",
1718 op->action, op->target, (device? " using " : ""),
1719 (device? device : ""), op->client_name, op->id);
1720 }
1721 }
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734 static gint
1735 sort_peers(gconstpointer a, gconstpointer b)
1736 {
1737 const st_query_result_t *peer_a = a;
1738 const st_query_result_t *peer_b = b;
1739
1740 return (peer_b->ndevices - peer_a->ndevices);
1741 }
1742
1743
1744
1745
1746
1747 static gboolean
1748 all_topology_devices_found(remote_fencing_op_t * op)
1749 {
1750 GList *device = NULL;
1751 GList *iter = NULL;
1752 device_properties_t *match = NULL;
1753 stonith_topology_t *tp = NULL;
1754 gboolean skip_target = FALSE;
1755 int i;
1756
1757 tp = find_topology_for_host(op->target);
1758 if (!tp) {
1759 return FALSE;
1760 }
1761 if (pcmk__strcase_any_of(op->action, "off", "reboot", NULL)) {
1762
1763
1764 skip_target = TRUE;
1765 }
1766
1767 for (i = 0; i < ST_LEVEL_MAX; i++) {
1768 for (device = tp->levels[i]; device; device = device->next) {
1769 match = NULL;
1770 for (iter = op->query_results; iter && !match; iter = iter->next) {
1771 st_query_result_t *peer = iter->data;
1772
1773 if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1774 continue;
1775 }
1776 match = find_peer_device(op, peer, device->data);
1777 }
1778 if (!match) {
1779 return FALSE;
1780 }
1781 }
1782 }
1783
1784 return TRUE;
1785 }
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798 static void
1799 parse_action_specific(xmlNode *xml, const char *peer, const char *device,
1800 const char *action, remote_fencing_op_t *op,
1801 enum st_remap_phase phase, device_properties_t *props)
1802 {
1803 props->custom_action_timeout[phase] = 0;
1804 crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
1805 &props->custom_action_timeout[phase]);
1806 if (props->custom_action_timeout[phase]) {
1807 crm_trace("Peer %s with device %s returned %s action timeout %d",
1808 peer, device, action, props->custom_action_timeout[phase]);
1809 }
1810
1811 props->delay_max[phase] = 0;
1812 crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
1813 if (props->delay_max[phase]) {
1814 crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
1815 peer, device, props->delay_max[phase], action);
1816 }
1817
1818 props->delay_base[phase] = 0;
1819 crm_element_value_int(xml, F_STONITH_DELAY_BASE, &props->delay_base[phase]);
1820 if (props->delay_base[phase]) {
1821 crm_trace("Peer %s with device %s returned base delay %d for %s",
1822 peer, device, props->delay_base[phase], action);
1823 }
1824
1825
1826 if (pcmk__str_eq(action, "on", pcmk__str_casei)) {
1827 int required = 0;
1828
1829 crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
1830 if (required) {
1831 crm_trace("Peer %s requires device %s to execute for action %s",
1832 peer, device, action);
1833 add_required_device(op, device);
1834 }
1835 }
1836
1837
1838
1839
1840 if (crm_is_true(crm_element_value(xml, F_STONITH_ACTION_DISALLOWED))) {
1841 props->disallowed[phase] = TRUE;
1842 crm_trace("Peer %s is disallowed from executing %s for device %s",
1843 peer, action, device);
1844 }
1845 }
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856 static void
1857 add_device_properties(xmlNode *xml, remote_fencing_op_t *op,
1858 st_query_result_t *result, const char *device)
1859 {
1860 xmlNode *child;
1861 int verified = 0;
1862 device_properties_t *props = calloc(1, sizeof(device_properties_t));
1863
1864
1865 CRM_ASSERT(props != NULL);
1866 g_hash_table_insert(result->devices, strdup(device), props);
1867
1868
1869 crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
1870 if (verified) {
1871 crm_trace("Peer %s has confirmed a verified device %s",
1872 result->host, device);
1873 props->verified = TRUE;
1874 }
1875
1876
1877 parse_action_specific(xml, result->host, device, op_requested_action(op),
1878 op, st_phase_requested, props);
1879 for (child = pcmk__xml_first_child(xml); child != NULL;
1880 child = pcmk__xml_next(child)) {
1881
1882
1883
1884
1885 if (pcmk__str_eq(ID(child), "off", pcmk__str_casei)) {
1886 parse_action_specific(child, result->host, device, "off",
1887 op, st_phase_off, props);
1888 } else if (pcmk__str_eq(ID(child), "on", pcmk__str_casei)) {
1889 parse_action_specific(child, result->host, device, "on",
1890 op, st_phase_on, props);
1891 }
1892 }
1893 }
1894
1895
1896
1897
1898
1899
1900
1901
1902
1903
1904
1905
1906 static st_query_result_t *
1907 add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml)
1908 {
1909 st_query_result_t *result = calloc(1, sizeof(st_query_result_t));
1910 xmlNode *child;
1911
1912
1913
1914 CRM_CHECK(result != NULL, return NULL);
1915 result->host = strdup(host);
1916 result->devices = pcmk__strkey_table(free, free);
1917
1918
1919 for (child = pcmk__xml_first_child(xml); child != NULL;
1920 child = pcmk__xml_next(child)) {
1921 const char *device = ID(child);
1922
1923 if (device) {
1924 add_device_properties(child, op, result, device);
1925 }
1926 }
1927
1928 result->ndevices = g_hash_table_size(result->devices);
1929 CRM_CHECK(ndevices == result->ndevices,
1930 crm_err("Query claimed to have %d device%s but %d found",
1931 ndevices, pcmk__plural_s(ndevices), result->ndevices));
1932
1933 op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers);
1934 return result;
1935 }
1936
1937
1938
1939
1940
1941
1942
1943
1944
1945
1946
1947
1948
1949
1950
1951 int
1952 process_remote_stonith_query(xmlNode * msg)
1953 {
1954 int ndevices = 0;
1955 gboolean host_is_target = FALSE;
1956 gboolean have_all_replies = FALSE;
1957 const char *id = NULL;
1958 const char *host = NULL;
1959 remote_fencing_op_t *op = NULL;
1960 st_query_result_t *result = NULL;
1961 uint32_t replies_expected;
1962 xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
1963
1964 CRM_CHECK(dev != NULL, return -EPROTO);
1965
1966 id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
1967 CRM_CHECK(id != NULL, return -EPROTO);
1968
1969 dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
1970 CRM_CHECK(dev != NULL, return -EPROTO);
1971 crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);
1972
1973 op = g_hash_table_lookup(stonith_remote_op_list, id);
1974 if (op == NULL) {
1975 crm_debug("Received query reply for unknown or expired operation %s",
1976 id);
1977 return -EOPNOTSUPP;
1978 }
1979
1980 replies_expected = fencing_active_peers();
1981 if (op->replies_expected < replies_expected) {
1982 replies_expected = op->replies_expected;
1983 }
1984 if ((++op->replies >= replies_expected) && (op->state == st_query)) {
1985 have_all_replies = TRUE;
1986 }
1987 host = crm_element_value(msg, F_ORIG);
1988 host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);
1989
1990 crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
1991 op->replies, replies_expected, host,
1992 op->target, op->action, ndevices, pcmk__plural_s(ndevices), id);
1993 if (ndevices > 0) {
1994 result = add_result(op, host, ndevices, dev);
1995 }
1996
1997 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1998
1999
2000
2001 if (op->state == st_query && all_topology_devices_found(op)) {
2002
2003 crm_trace("All topology devices found");
2004 call_remote_stonith(op, result, pcmk_ok);
2005
2006 } else if (have_all_replies) {
2007 crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
2008 replies_expected, op->replies);
2009 call_remote_stonith(op, NULL, pcmk_ok);
2010 }
2011
2012 } else if (op->state == st_query) {
2013 int nverified = count_peer_devices(op, result, TRUE);
2014
2015
2016
2017 if (result && (host_is_target == FALSE) && nverified) {
2018
2019 crm_trace("Found %d verified device%s",
2020 nverified, pcmk__plural_s(nverified));
2021 call_remote_stonith(op, result, pcmk_ok);
2022
2023 } else if (have_all_replies) {
2024 crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
2025 replies_expected, op->replies);
2026 call_remote_stonith(op, NULL, pcmk_ok);
2027
2028 } else {
2029 crm_trace("Waiting for more peer results before launching fencing operation");
2030 }
2031
2032 } else if (result && (op->state == st_done)) {
2033 crm_info("Discarding query result from %s (%d device%s): "
2034 "Operation is %s", result->host,
2035 result->ndevices, pcmk__plural_s(result->ndevices),
2036 stonith_op_state_str(op->state));
2037 }
2038
2039 return pcmk_ok;
2040 }
2041
2042
2043
2044
2045
2046
2047
2048
2049
2050
2051
2052
2053 int
2054 process_remote_stonith_exec(xmlNode * msg)
2055 {
2056 int rc = 0;
2057 const char *id = NULL;
2058 const char *device = NULL;
2059 remote_fencing_op_t *op = NULL;
2060 xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
2061
2062 CRM_CHECK(dev != NULL, return -EPROTO);
2063
2064 id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
2065 CRM_CHECK(id != NULL, return -EPROTO);
2066
2067 dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR);
2068 CRM_CHECK(dev != NULL, return -EPROTO);
2069
2070 crm_element_value_int(dev, F_STONITH_RC, &rc);
2071
2072 device = crm_element_value(dev, F_STONITH_DEVICE);
2073
2074 if (stonith_remote_op_list) {
2075 op = g_hash_table_lookup(stonith_remote_op_list, id);
2076 }
2077
2078 if (op == NULL && rc == pcmk_ok) {
2079
2080 const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);
2081
2082 op = create_remote_stonith_op(client_id, dev, TRUE);
2083 }
2084
2085 if (op == NULL) {
2086
2087
2088 crm_info("Received peer result of unknown or expired operation %s", id);
2089 return -EOPNOTSUPP;
2090 }
2091
2092 if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
2093 crm_err("Received outdated reply for device %s (instead of %s) to "
2094 "fence (%s) %s. Operation already timed out at peer level.",
2095 device, (const char *) op->devices->data, op->action, op->target);
2096 return rc;
2097 }
2098
2099 if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) {
2100 crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s "
2101 CRM_XS " rc=%d id=%.8s",
2102 op->action, op->target, op->client_name, op->originator,
2103 pcmk_strerror(rc), rc, op->id);
2104 if (rc == pcmk_ok) {
2105 op->state = st_done;
2106 } else {
2107 op->state = st_failed;
2108 }
2109 remote_op_done(op, msg, rc, FALSE);
2110 return pcmk_ok;
2111 } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
2112
2113
2114 crm_err("Received non-broadcast fencing result for operation %.8s "
2115 "we do not own (device %s targeting %s)",
2116 op->id, device, op->target);
2117 return rc;
2118 }
2119
2120 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2121 const char *device = crm_element_value(msg, F_STONITH_DEVICE);
2122
2123 crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s "
2124 CRM_XS " rc=%d",
2125 op->action, op->target, device, op->client_name,
2126 op->originator, pcmk_strerror(rc), rc);
2127
2128
2129
2130 if (op->state == st_done) {
2131 remote_op_done(op, msg, rc, FALSE);
2132 return rc;
2133 }
2134
2135 if ((op->phase == 2) && (rc != pcmk_ok)) {
2136
2137
2138
2139 crm_warn("Ignoring %s 'on' failure (exit code %d) targeting %s "
2140 "after successful 'off'", device, rc, op->target);
2141 rc = pcmk_ok;
2142 }
2143
2144 if (rc == pcmk_ok) {
2145
2146
2147 advance_topology_device_in_level(op, device, msg, rc);
2148 return rc;
2149 } else {
2150
2151
2152 if (advance_topology_level(op, false) != pcmk_rc_ok) {
2153 op->state = st_failed;
2154 remote_op_done(op, msg, rc, FALSE);
2155 return rc;
2156 }
2157 }
2158 } else if (rc == pcmk_ok && op->devices == NULL) {
2159 crm_trace("All done for %s", op->target);
2160
2161 op->state = st_done;
2162 remote_op_done(op, msg, rc, FALSE);
2163 return rc;
2164 } else if (rc == -ETIME && op->devices == NULL) {
2165
2166 op->state = st_failed;
2167 remote_op_done(op, msg, rc, FALSE);
2168 return rc;
2169 } else {
2170
2171 }
2172
2173
2174 crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator,
2175 op->client_name, rc);
2176 call_remote_stonith(op, NULL, rc);
2177 return rc;
2178 }
2179
2180 gboolean
2181 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
2182 {
2183 GHashTableIter iter;
2184 time_t now = time(NULL);
2185 remote_fencing_op_t *rop = NULL;
2186
2187 if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
2188 action == NULL) {
2189 return FALSE;
2190 }
2191
2192 g_hash_table_iter_init(&iter, stonith_remote_op_list);
2193 while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2194 if (strcmp(rop->target, target) != 0) {
2195 continue;
2196 } else if (rop->state != st_done) {
2197 continue;
2198
2199
2200
2201 } else if (strcmp(rop->action, action) != 0) {
2202 continue;
2203 } else if ((rop->completed + tolerance) < now) {
2204 continue;
2205 }
2206
2207 crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2208 target, action, tolerance, rop->delegate, rop->originator);
2209 return TRUE;
2210 }
2211 return FALSE;
2212 }