This source file includes the following definitions.
- sort_strings
- free_remote_query
- free_stonith_remote_op_list
- count_peer_device
- count_peer_devices
- find_peer_device
- grab_peer_device
- clear_remote_op_timers
- free_remote_op
- init_stonith_remote_op_hash_table
- op_requested_action
- op_phase_off
- op_phase_on
- undo_op_remap
- create_op_done_notify
- stonith_bcast_result_to_peers
- handle_local_reply_and_notify
- handle_duplicates
- remote_op_done
- remote_op_watchdog_done
- remote_op_timeout_one
- remote_op_timeout
- remote_op_query_timeout
- topology_is_empty
- add_required_device
- remove_required_device
- set_op_device_list
- topology_matches
- find_topology_for_host
- advance_topology_level
- merge_duplicates
- fencing_active_peers
- stonith_manual_ack
- create_remote_stonith_op
- initiate_remote_stonith_op
- find_best_peer
- stonith_choose_peer
- get_device_timeout
- add_device_timeout
- get_peer_timeout
- get_op_total_timeout
- report_timeout_period
- advance_topology_device_in_level
- call_remote_stonith
- sort_peers
- all_topology_devices_found
- parse_action_specific
- add_device_properties
- add_result
- process_remote_stonith_query
- process_remote_stonith_exec
- stonith_check_fence_tolerance
1
2
3
4
5
6
7
8
9
10 #include <crm_internal.h>
11
12 #include <sys/param.h>
13 #include <stdio.h>
14 #include <sys/types.h>
15 #include <sys/wait.h>
16 #include <sys/stat.h>
17 #include <unistd.h>
18 #include <sys/utsname.h>
19
20 #include <stdlib.h>
21 #include <errno.h>
22 #include <fcntl.h>
23 #include <ctype.h>
24 #include <regex.h>
25
26 #include <crm/crm.h>
27 #include <crm/msg_xml.h>
28 #include <crm/common/ipc.h>
29 #include <crm/common/ipc_internal.h>
30 #include <crm/cluster/internal.h>
31
32 #include <crm/stonith-ng.h>
33 #include <crm/fencing/internal.h>
34 #include <crm/common/xml.h>
35 #include <crm/common/xml_internal.h>
36
37 #include <crm/common/util.h>
38 #include <pacemaker-fenced.h>
39
/* Scale factor applied in this file to the query timer and to the
 * best-guess total timeout of merged operations
 */
#define TIMEOUT_MULTIPLY_FACTOR 1.2
41
42
43
44
45
46
47
/*
 * Per-device bookkeeping stored as the values of a peer's device table
 * (st_query_result_t.devices). Every array has st_phase_max entries and is
 * indexed by the operation's phase.
 */
typedef struct device_properties_s {
    /* Consulted when a caller accepts only verified devices */
    gboolean verified;

    /* Set to TRUE once the device has been claimed for the given phase */
    gboolean executed[st_phase_max];

    /* When set, the device is not offered for the given phase */
    gboolean disallowed[st_phase_max];

    /* NOTE(review): presumably a per-phase action timeout override -- its
     * use is not visible in this part of the file; confirm
     */
    int custom_action_timeout[st_phase_max];

    /* NOTE(review): presumably a per-phase maximum delay -- confirm */
    int delay_max[st_phase_max];

    /* NOTE(review): presumably a per-phase base delay -- confirm */
    int delay_base[st_phase_max];
} device_properties_t;
65
/* One peer's entry on an operation's query_results list */
typedef struct st_query_result_s {
    /* Peer name; heap-allocated and released by free_remote_query() */
    char *host;

    /* When TRUE, find_best_peer() skips this peer for non-topology fencing */
    gboolean tried;

    /* Device count reported in trace output */
    int ndevices;

    /* Table looked up by device name, yielding device_properties_t entries */
    GHashTable *devices;
} st_query_result_t;
76
/* Table of remote fencing operations, keyed by operation ID; created on
 * demand by init_stonith_remote_op_hash_table()
 */
GHashTable *stonith_remote_op_list = NULL;

/* Forward declarations for functions used before their definitions */
void call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc);
static void remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup);
extern xmlNode *stonith_create_op(int call_id, const char *token, const char *op, xmlNode * data,
                                  int call_options);

static void report_timeout_period(remote_fencing_op_t * op, int op_timeout);
static int get_op_total_timeout(const remote_fencing_op_t *op,
                                const st_query_result_t *chosen_peer);
87
88 static gint
89 sort_strings(gconstpointer a, gconstpointer b)
90 {
91 return strcmp(a, b);
92 }
93
94 static void
95 free_remote_query(gpointer data)
96 {
97 if (data) {
98 st_query_result_t *query = data;
99
100 crm_trace("Free'ing query result from %s", query->host);
101 g_hash_table_destroy(query->devices);
102 free(query->host);
103 free(query);
104 }
105 }
106
107 void
108 free_stonith_remote_op_list()
109 {
110 if (stonith_remote_op_list != NULL) {
111 g_hash_table_destroy(stonith_remote_op_list);
112 stonith_remote_op_list = NULL;
113 }
114 }
115
/* State passed through g_hash_table_foreach() to count_peer_device() */
struct peer_count_data {
    /* Operation whose current phase selects which "executed" flag is tested */
    const remote_fencing_op_t *op;
    /* When TRUE, only devices marked verified are counted */
    gboolean verified_only;
    /* Running total of matching devices */
    int count;
};
121
122
123
124
125
126
127
128
129
130 static void
131 count_peer_device(gpointer key, gpointer value, gpointer user_data)
132 {
133 device_properties_t *props = (device_properties_t*)value;
134 struct peer_count_data *data = user_data;
135
136 if (!props->executed[data->op->phase]
137 && (!data->verified_only || props->verified)) {
138 ++(data->count);
139 }
140 }
141
142
143
144
145
146
147
148
149
150
151
152 static int
153 count_peer_devices(const remote_fencing_op_t *op, const st_query_result_t *peer,
154 gboolean verified_only)
155 {
156 struct peer_count_data data;
157
158 data.op = op;
159 data.verified_only = verified_only;
160 data.count = 0;
161 if (peer) {
162 g_hash_table_foreach(peer->devices, count_peer_device, &data);
163 }
164 return data.count;
165 }
166
167
168
169
170
171
172
173
174
175
176
177 static device_properties_t *
178 find_peer_device(const remote_fencing_op_t *op, const st_query_result_t *peer,
179 const char *device)
180 {
181 device_properties_t *props = g_hash_table_lookup(peer->devices, device);
182
183 return (props && !props->executed[op->phase]
184 && !props->disallowed[op->phase])? props : NULL;
185 }
186
187
188
189
190
191
192
193
194
195
196
197
198 static gboolean
199 grab_peer_device(const remote_fencing_op_t *op, st_query_result_t *peer,
200 const char *device, gboolean verified_devices_only)
201 {
202 device_properties_t *props = find_peer_device(op, peer, device);
203
204 if ((props == NULL) || (verified_devices_only && !props->verified)) {
205 return FALSE;
206 }
207
208 crm_trace("Removing %s from %s (%d remaining)",
209 device, peer->host, count_peer_devices(op, peer, FALSE));
210 props->executed[op->phase] = TRUE;
211 return TRUE;
212 }
213
214 static void
215 clear_remote_op_timers(remote_fencing_op_t * op)
216 {
217 if (op->query_timer) {
218 g_source_remove(op->query_timer);
219 op->query_timer = 0;
220 }
221 if (op->op_timer_total) {
222 g_source_remove(op->op_timer_total);
223 op->op_timer_total = 0;
224 }
225 if (op->op_timer_one) {
226 g_source_remove(op->op_timer_one);
227 op->op_timer_one = 0;
228 }
229 }
230
231 static void
232 free_remote_op(gpointer data)
233 {
234 remote_fencing_op_t *op = data;
235
236 crm_log_xml_debug(op->request, "Destroying");
237
238 clear_remote_op_timers(op);
239
240 free(op->id);
241 free(op->action);
242 free(op->delegate);
243 free(op->target);
244 free(op->client_id);
245 free(op->client_name);
246 free(op->originator);
247
248 if (op->query_results) {
249 g_list_free_full(op->query_results, free_remote_query);
250 }
251 if (op->request) {
252 free_xml(op->request);
253 op->request = NULL;
254 }
255 if (op->devices_list) {
256 g_list_free_full(op->devices_list, free);
257 op->devices_list = NULL;
258 }
259 g_list_free_full(op->automatic_list, free);
260 g_list_free(op->duplicates);
261 free(op);
262 }
263
264 void
265 init_stonith_remote_op_hash_table(GHashTable **table)
266 {
267 if (*table == NULL) {
268 *table = pcmk__strkey_table(NULL, free_remote_op);
269 }
270 }
271
272
273
274
275
276
277
278
279
280 static const char *
281 op_requested_action(const remote_fencing_op_t *op)
282 {
283 return ((op->phase > st_phase_requested)? "reboot" : op->action);
284 }
285
286
287
288
289
290
291
292 static void
293 op_phase_off(remote_fencing_op_t *op)
294 {
295 crm_info("Remapping multiple-device reboot targeting %s to 'off' "
296 CRM_XS " id=%.8s", op->target, op->id);
297 op->phase = st_phase_off;
298
299
300
301
302 strcpy(op->action, "off");
303 }
304
305
306
307
308
309
310
311 static void
312 op_phase_on(remote_fencing_op_t *op)
313 {
314 GList *iter = NULL;
315
316 crm_info("Remapped 'off' targeting %s complete, "
317 "remapping to 'on' for %s " CRM_XS " id=%.8s",
318 op->target, op->client_name, op->id);
319 op->phase = st_phase_on;
320 strcpy(op->action, "on");
321
322
323
324
325 for (iter = op->automatic_list; iter != NULL; iter = iter->next) {
326 GList *match = g_list_find_custom(op->devices_list, iter->data,
327 sort_strings);
328
329 if (match) {
330 op->devices_list = g_list_remove(op->devices_list, match->data);
331 }
332 }
333 g_list_free_full(op->automatic_list, free);
334 op->automatic_list = NULL;
335
336
337 op->devices = op->devices_list;
338 }
339
340
341
342
343
344
345
346 static void
347 undo_op_remap(remote_fencing_op_t *op)
348 {
349 if (op->phase > 0) {
350 crm_info("Undoing remap of reboot targeting %s for %s "
351 CRM_XS " id=%.8s", op->target, op->client_name, op->id);
352 op->phase = st_phase_requested;
353 strcpy(op->action, "reboot");
354 }
355 }
356
357 static xmlNode *
358 create_op_done_notify(remote_fencing_op_t * op, int rc)
359 {
360 xmlNode *notify_data = create_xml_node(NULL, T_STONITH_NOTIFY_FENCE);
361
362 crm_xml_add_int(notify_data, "state", op->state);
363 crm_xml_add_int(notify_data, F_STONITH_RC, rc);
364 crm_xml_add(notify_data, F_STONITH_TARGET, op->target);
365 crm_xml_add(notify_data, F_STONITH_ACTION, op->action);
366 crm_xml_add(notify_data, F_STONITH_DELEGATE, op->delegate);
367 crm_xml_add(notify_data, F_STONITH_REMOTE_OP_ID, op->id);
368 crm_xml_add(notify_data, F_STONITH_ORIGIN, op->originator);
369 crm_xml_add(notify_data, F_STONITH_CLIENTID, op->client_id);
370 crm_xml_add(notify_data, F_STONITH_CLIENTNAME, op->client_name);
371
372 return notify_data;
373 }
374
375 void
376 stonith_bcast_result_to_peers(remote_fencing_op_t * op, int rc, gboolean op_merged)
377 {
378 static int count = 0;
379 xmlNode *bcast = create_xml_node(NULL, T_STONITH_REPLY);
380 xmlNode *notify_data = create_op_done_notify(op, rc);
381
382 count++;
383 crm_trace("Broadcasting result to peers");
384 crm_xml_add(bcast, F_TYPE, T_STONITH_NOTIFY);
385 crm_xml_add(bcast, F_SUBTYPE, "broadcast");
386 crm_xml_add(bcast, F_STONITH_OPERATION, T_STONITH_NOTIFY);
387 crm_xml_add_int(bcast, "count", count);
388
389 if (op_merged) {
390 crm_xml_add(bcast, F_STONITH_MERGED, "true");
391 }
392
393 add_message_xml(bcast, F_STONITH_CALLDATA, notify_data);
394 send_cluster_message(NULL, crm_msg_stonith_ng, bcast, FALSE);
395 free_xml(notify_data);
396 free_xml(bcast);
397
398 return;
399 }
400
401 static void
402 handle_local_reply_and_notify(remote_fencing_op_t * op, xmlNode * data, int rc)
403 {
404 xmlNode *notify_data = NULL;
405 xmlNode *reply = NULL;
406
407 if (op->notify_sent == TRUE) {
408
409 return;
410 }
411
412
413 notify_data = create_op_done_notify(op, rc);
414 crm_xml_add_int(data, "state", op->state);
415 crm_xml_add(data, F_STONITH_TARGET, op->target);
416 crm_xml_add(data, F_STONITH_OPERATION, op->action);
417
418 reply = stonith_construct_reply(op->request, NULL, data, rc);
419 crm_xml_add(reply, F_STONITH_DELEGATE, op->delegate);
420
421
422 do_local_reply(reply, op->client_id, op->call_options & st_opt_sync_call, FALSE);
423
424
425 do_stonith_notify(0, T_STONITH_NOTIFY_FENCE, rc, notify_data);
426 do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL);
427
428
429 op->notify_sent = TRUE;
430 free_xml(reply);
431 free_xml(notify_data);
432 }
433
434 static void
435 handle_duplicates(remote_fencing_op_t * op, xmlNode * data, int rc)
436 {
437 GList *iter = NULL;
438
439 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
440 remote_fencing_op_t *other = iter->data;
441
442 if (other->state == st_duplicate) {
443 other->state = op->state;
444 crm_debug("Performing duplicate notification for %s@%s: %s "
445 CRM_XS " id=%.8s",
446 other->client_name, other->originator,
447 pcmk_strerror(rc), other->id);
448 remote_op_done(other, data, rc, TRUE);
449
450 } else {
451
452 crm_err("Skipping duplicate notification for %s@%s "
453 CRM_XS " state=%s id=%.8s",
454 other->client_name, other->originator,
455 stonith_op_state_str(other->state), other->id);
456 }
457 }
458 }
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
/*
 * Finalize a remote fencing operation.
 *
 * Records the completion time, stops the operation's timers and undoes any
 * action remap. Then either broadcasts the result to the cluster (when this
 * is not a merged duplicate and the data is not itself a broadcast), or logs
 * the result, replies to and notifies local clients, finishes merged
 * duplicates, and releases the operation's query results and request.
 *
 * op    Operation that finished
 * data  Result XML; may be NULL, in which case a temporary node is used
 * rc    Result code used in logs, replies and notifications
 * dup   Nonzero when called for an operation merged into another one
 */
static void
remote_op_done(remote_fencing_op_t * op, xmlNode * data, int rc, int dup)
{
    int level = LOG_ERR;
    const char *subt = NULL;
    xmlNode *local_data = NULL;
    gboolean op_merged = FALSE;

    op->completed = time(NULL);
    clear_remote_op_timers(op);
    undo_op_remap(op);

    /* Notifications go out only once per operation */
    if (op->notify_sent == TRUE) {
        crm_err("Already sent notifications for '%s' targeting %s by %s for "
                "client %s@%s: %s " CRM_XS " rc=%d state=%s id=%.8s",
                op->action, op->target,
                (op->delegate? op->delegate : "unknown node"),
                op->client_name, op->originator, pcmk_strerror(rc),
                rc, stonith_op_state_str(op->state), op->id);
        goto remote_op_done_cleanup;
    }

    /* If no delegate is recorded yet, take it from the result data: prefer
     * an explicit delegate attribute anywhere in the data, else F_ORIG
     */
    if (!op->delegate && data && rc != -ENODEV && rc != -EHOSTUNREACH) {
        xmlNode *ndata = get_xpath_object("//@" F_STONITH_DELEGATE, data,
                                          LOG_NEVER);
        if(ndata) {
            op->delegate = crm_element_value_copy(ndata, F_STONITH_DELEGATE);
        } else {
            op->delegate = crm_element_value_copy(data, F_ORIG);
        }
    }

    /* Guarantee non-NULL data below; local_data is freed at cleanup */
    if (data == NULL) {
        data = create_xml_node(NULL, "remote-op");
        local_data = data;
    }

    if(dup) {
        op_merged = TRUE;
    } else if (crm_element_value(data, F_STONITH_MERGED)) {
        op_merged = TRUE;
    }

    /* A non-duplicate result that is not already a broadcast is broadcast
     * to the cluster, and nothing further is done in this call.
     * NOTE(review): presumably the remaining steps run when that broadcast
     * is received back -- confirm against the message handler
     */
    subt = crm_element_value(data, F_SUBTYPE);
    if (dup == FALSE && !pcmk__str_eq(subt, "broadcast", pcmk__str_casei)) {
        /* Defer notification until the bcast message arrives */
        stonith_bcast_result_to_peers(op, rc, (op_merged? TRUE: FALSE));
        goto remote_op_done_cleanup;
    }

    /* Log at error level only for failures of operations we originated */
    if (rc == pcmk_ok || dup) {
        level = LOG_NOTICE;
    } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
        level = LOG_NOTICE;
    }

    do_crm_log(level, "Operation '%s'%s%s by %s for %s@%s%s: %s "
               CRM_XS " id=%.8s", op->action, (op->target? " targeting " : ""),
               (op->target? op->target : ""),
               (op->delegate? op->delegate : "unknown node"),
               op->client_name, op->originator,
               (op_merged? " (merged)" : ""), pcmk_strerror(rc), op->id);

    handle_local_reply_and_notify(op, data, rc);

    /* Duplicates are finished only from the operation they were merged into */
    if (dup == FALSE) {
        handle_duplicates(op, data, rc);
    }

    /* Release data that is not used once the operation has completed; the
     * operation itself is not freed here
     */
    if (op->query_results) {
        g_list_free_full(op->query_results, free_remote_query);
        op->query_results = NULL;
    }

    if (op->request) {
        free_xml(op->request);
        op->request = NULL;
    }

  remote_op_done_cleanup:
    free_xml(local_data);
}
573
574 static gboolean
575 remote_op_watchdog_done(gpointer userdata)
576 {
577 remote_fencing_op_t *op = userdata;
578
579 op->op_timer_one = 0;
580
581 crm_notice("Self-fencing (%s) by %s for %s assumed complete "
582 CRM_XS " id=%.8s",
583 op->action, op->target, op->client_name, op->id);
584 op->state = st_done;
585 remote_op_done(op, NULL, pcmk_ok, FALSE);
586 return FALSE;
587 }
588
589 static gboolean
590 remote_op_timeout_one(gpointer userdata)
591 {
592 remote_fencing_op_t *op = userdata;
593
594 op->op_timer_one = 0;
595
596 crm_notice("Peer's '%s' action targeting %s for client %s timed out " CRM_XS
597 " id=%.8s", op->action, op->target, op->client_name, op->id);
598 call_remote_stonith(op, NULL, pcmk_ok);
599 return FALSE;
600 }
601
602 static gboolean
603 remote_op_timeout(gpointer userdata)
604 {
605 remote_fencing_op_t *op = userdata;
606
607 op->op_timer_total = 0;
608
609 if (op->state == st_done) {
610 crm_debug("Action '%s' targeting %s for client %s already completed "
611 CRM_XS " id=%.8s",
612 op->action, op->target, op->client_name, op->id);
613 return FALSE;
614 }
615
616 crm_debug("Action '%s' targeting %s for client %s timed out "
617 CRM_XS " id=%.8s",
618 op->action, op->target, op->client_name, op->id);
619
620 if (op->phase == st_phase_on) {
621
622
623
624
625 remote_op_done(op, NULL, pcmk_ok, FALSE);
626 return FALSE;
627 }
628
629 op->state = st_failed;
630
631 remote_op_done(op, NULL, -ETIME, FALSE);
632
633 return FALSE;
634 }
635
636 static gboolean
637 remote_op_query_timeout(gpointer data)
638 {
639 remote_fencing_op_t *op = data;
640
641 op->query_timer = 0;
642 if (op->state == st_done) {
643 crm_debug("Operation %.8s targeting %s already completed",
644 op->id, op->target);
645 } else if (op->state == st_exec) {
646 crm_debug("Operation %.8s targeting %s already in progress",
647 op->id, op->target);
648 } else if (op->query_results) {
649 crm_debug("Query %.8s targeting %s complete (state=%s)",
650 op->id, op->target, stonith_op_state_str(op->state));
651 call_remote_stonith(op, NULL, pcmk_ok);
652 } else {
653 crm_debug("Query %.8s targeting %s timed out (state=%s)",
654 op->id, op->target, stonith_op_state_str(op->state));
655 if (op->op_timer_total) {
656 g_source_remove(op->op_timer_total);
657 op->op_timer_total = 0;
658 }
659 remote_op_timeout(op);
660 }
661
662 return FALSE;
663 }
664
665 static gboolean
666 topology_is_empty(stonith_topology_t *tp)
667 {
668 int i;
669
670 if (tp == NULL) {
671 return TRUE;
672 }
673
674 for (i = 0; i < ST_LEVEL_MAX; i++) {
675 if (tp->levels[i] != NULL) {
676 return FALSE;
677 }
678 }
679 return TRUE;
680 }
681
682
683
684
685
686
687
688
689 static void
690 add_required_device(remote_fencing_op_t *op, const char *device)
691 {
692 GList *match = g_list_find_custom(op->automatic_list, device,
693 sort_strings);
694
695 if (!match) {
696 op->automatic_list = g_list_prepend(op->automatic_list, strdup(device));
697 }
698 }
699
700
701
702
703
704
705
706
707 static void
708 remove_required_device(remote_fencing_op_t *op, const char *device)
709 {
710 GList *match = g_list_find_custom(op->automatic_list, device,
711 sort_strings);
712
713 if (match) {
714 op->automatic_list = g_list_remove(op->automatic_list, match->data);
715 }
716 }
717
718
719 static void
720 set_op_device_list(remote_fencing_op_t * op, GList *devices)
721 {
722 GList *lpc = NULL;
723
724 if (op->devices_list) {
725 g_list_free_full(op->devices_list, free);
726 op->devices_list = NULL;
727 }
728 for (lpc = devices; lpc != NULL; lpc = lpc->next) {
729 op->devices_list = g_list_append(op->devices_list, strdup(lpc->data));
730 }
731 op->devices = op->devices_list;
732 }
733
734
735
736
737
738
739
740
741
742
743 static gboolean
744 topology_matches(const stonith_topology_t *tp, const char *node)
745 {
746 regex_t r_patt;
747
748 CRM_CHECK(node && tp && tp->target, return FALSE);
749 switch(tp->kind) {
750 case 2:
751
752
753
754
755
756
757 if (node_has_attr(node, tp->target_attribute, tp->target_value)) {
758 crm_notice("Matched %s with %s by attribute", node, tp->target);
759 return TRUE;
760 }
761 break;
762 case 1:
763
764
765
766
767 if (regcomp(&r_patt, tp->target_pattern, REG_EXTENDED|REG_NOSUB)) {
768 crm_info("Bad regex '%s' for fencing level", tp->target);
769 } else {
770 int status = regexec(&r_patt, node, 0, NULL, 0);
771
772 regfree(&r_patt);
773 if (status == 0) {
774 crm_notice("Matched %s with %s by name", node, tp->target);
775 return TRUE;
776 }
777 }
778 break;
779 case 0:
780 crm_trace("Testing %s against %s", node, tp->target);
781 return pcmk__str_eq(tp->target, node, pcmk__str_casei);
782 }
783 crm_trace("No match for %s with %s", node, tp->target);
784 return FALSE;
785 }
786
787 stonith_topology_t *
788 find_topology_for_host(const char *host)
789 {
790 GHashTableIter tIter;
791 stonith_topology_t *tp = g_hash_table_lookup(topology, host);
792
793 if(tp != NULL) {
794 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
795 return tp;
796 }
797
798 g_hash_table_iter_init(&tIter, topology);
799 while (g_hash_table_iter_next(&tIter, NULL, (gpointer *) & tp)) {
800 if (topology_matches(tp, host)) {
801 crm_trace("Found %s for %s in %d entries", tp->target, host, g_hash_table_size(topology));
802 return tp;
803 }
804 }
805
806 crm_trace("No matches for %s in %d topology entries", host, g_hash_table_size(topology));
807 return NULL;
808 }
809
810
811
812
813
814
815
816
817
818
819
820
821 static int
822 advance_topology_level(remote_fencing_op_t *op, bool empty_ok)
823 {
824 stonith_topology_t *tp = NULL;
825
826 if (op->target) {
827 tp = find_topology_for_host(op->target);
828 }
829 if (topology_is_empty(tp)) {
830 return empty_ok? pcmk_rc_ok : ENODEV;
831 }
832
833 CRM_ASSERT(tp->levels != NULL);
834
835 stonith__set_call_options(op->call_options, op->id, st_opt_topology);
836
837
838 undo_op_remap(op);
839
840 do {
841 op->level++;
842
843 } while (op->level < ST_LEVEL_MAX && tp->levels[op->level] == NULL);
844
845 if (op->level < ST_LEVEL_MAX) {
846 crm_trace("Attempting fencing level %d targeting %s (%d devices) "
847 "for client %s@%s (id=%.8s)",
848 op->level, op->target, g_list_length(tp->levels[op->level]),
849 op->client_name, op->originator, op->id);
850 set_op_device_list(op, tp->levels[op->level]);
851
852
853 if (op->level > 1 && op->delay > 0) {
854 op->delay = 0;
855 }
856
857 if (g_list_next(op->devices_list) && pcmk__str_eq(op->action, "reboot", pcmk__str_casei)) {
858
859
860
861
862
863 op_phase_off(op);
864 }
865 return pcmk_rc_ok;
866 }
867
868 crm_notice("All fencing options targeting %s for client %s@%s failed "
869 CRM_XS " id=%.8s",
870 op->target, op->client_name, op->originator, op->id);
871 return ENODEV;
872 }
873
874
875
876
877
878
/*
 * Check whether a new operation duplicates one already in progress, and if
 * so, attach it to that operation instead of running it separately.
 *
 * Every other operation in stonith_remote_op_list is compared against op.
 * A candidate is rejected if it is past st_exec, targets a different node,
 * requests a different action, comes from the same client name, is a node
 * fencing itself, has an inactive originator, or is older than its total
 * timeout. For an accepted candidate, op is appended to its duplicates list
 * and op's state becomes st_duplicate.
 *
 * NOTE(review): the loop does not stop after the first accepted candidate,
 * so op could be attached to more than one operation -- confirm intended.
 */
static void
merge_duplicates(remote_fencing_op_t * op)
{
    GHashTableIter iter;
    remote_fencing_op_t *other = NULL;

    time_t now = time(NULL);

    g_hash_table_iter_init(&iter, stonith_remote_op_list);
    while (g_hash_table_iter_next(&iter, NULL, (void **)&other)) {
        /* Compare against what the other op originally asked for, in case
         * it is currently remapped
         */
        const char *other_action = op_requested_action(other);

        /* Skip op itself */
        if (!strcmp(op->id, other->id)) {
            continue;
        }
        if (other->state > st_exec) {
            crm_trace("%.8s not duplicate of %.8s: not in progress",
                      op->id, other->id);
            continue;
        }
        if (!pcmk__str_eq(op->target, other->target, pcmk__str_casei)) {
            crm_trace("%.8s not duplicate of %.8s: node %s vs. %s",
                      op->id, other->id, op->target, other->target);
            continue;
        }
        if (!pcmk__str_eq(op->action, other_action, pcmk__str_casei)) {
            crm_trace("%.8s not duplicate of %.8s: action %s vs. %s",
                      op->id, other->id, op->action, other_action);
            continue;
        }
        if (pcmk__str_eq(op->client_name, other->client_name, pcmk__str_casei)) {
            crm_trace("%.8s not duplicate of %.8s: same client %s",
                      op->id, other->id, op->client_name);
            continue;
        }
        if (pcmk__str_eq(other->target, other->originator, pcmk__str_casei)) {
            crm_trace("%.8s not duplicate of %.8s: suicide for %s",
                      op->id, other->id, other->target);
            continue;
        }
        /* An op whose originator is no longer active is marked failed here
         * rather than being used as a merge target
         */
        if (!fencing_peer_active(crm_get_peer(0, other->originator))) {
            crm_notice("Failing action '%s' targeting %s originating from "
                       "client %s@%s: Originator is dead " CRM_XS " id=%.8s",
                       other->action, other->target, other->client_name,
                       other->originator, other->id);
            crm_trace("%.8s not duplicate of %.8s: originator dead",
                      op->id, other->id);
            other->state = st_failed;
            continue;
        }
        if ((other->total_timeout > 0)
            && (now > (other->total_timeout + other->created))) {
            crm_trace("%.8s not duplicate of %.8s: old (%ld vs. %ld + %d)",
                      op->id, other->id, now, other->created,
                      other->total_timeout);
            continue;
        }

        /* The other op is an in-progress request for the same target and
         * action: attach op to it so both complete together
         */
        other->duplicates = g_list_append(other->duplicates, op);
        if (other->total_timeout == 0) {
            /* No timeout recorded yet: estimate one from op and use it for both */
            other->total_timeout = op->total_timeout =
                TIMEOUT_MULTIPLY_FACTOR * get_op_total_timeout(op, NULL);
            crm_trace("Best guess as to timeout used for %.8s: %d",
                      other->id, other->total_timeout);
        }
        crm_notice("Merging fencing action '%s' targeting %s originating from "
                   "client %s with identical request from %s@%s "
                   CRM_XS " original=%.8s duplicate=%.8s total_timeout=%ds",
                   op->action, op->target, op->client_name,
                   other->client_name, other->originator,
                   op->id, other->id, other->total_timeout);
        report_timeout_period(op, other->total_timeout);
        op->state = st_duplicate;
    }
}
957
958 static uint32_t fencing_active_peers(void)
959 {
960 uint32_t count = 0;
961 crm_node_t *entry;
962 GHashTableIter gIter;
963
964 g_hash_table_iter_init(&gIter, crm_peer_cache);
965 while (g_hash_table_iter_next(&gIter, NULL, (void **)&entry)) {
966 if(fencing_peer_active(entry)) {
967 count++;
968 }
969 }
970 return count;
971 }
972
973 int
974 stonith_manual_ack(xmlNode * msg, remote_fencing_op_t * op)
975 {
976 xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, msg, LOG_ERR);
977
978 op->state = st_done;
979 op->completed = time(NULL);
980 op->delegate = strdup("a human");
981
982 crm_notice("Injecting manual confirmation that %s is safely off/down",
983 crm_element_value(dev, F_STONITH_TARGET));
984
985 remote_op_done(op, msg, pcmk_ok, FALSE);
986
987
988 return -EINPROGRESS;
989 }
990
991
992
993
994
995
996
997
998
999
1000
1001
/*
 * Create (or, for peer requests, look up) the record of a remote fencing
 * operation and register it in stonith_remote_op_list.
 *
 * client   ID of the requesting client; a NULL value is logged as an
 *          assertion failure and leaves op->client_id unset
 * request  Request XML describing the operation
 * peer     TRUE if the request came from a peer; then an existing operation
 *          with the request's remote op ID is reused if there is one
 *
 * Returns the operation (a remote_fencing_op_t), or NULL if a peer request
 * carries a target but no remote op ID. The returned operation is owned by
 * stonith_remote_op_list.
 */
void *
create_remote_stonith_op(const char *client, xmlNode * request, gboolean peer)
{
    remote_fencing_op_t *op = NULL;
    xmlNode *dev = get_xpath_object("//@" F_STONITH_TARGET, request, LOG_NEVER);
    int call_options = 0;
    const char *operation = NULL;

    init_stonith_remote_op_hash_table(&stonith_remote_op_list);

    /* A peer request names the operation ID to use; if that operation is
     * already known here, return it instead of creating a second record
     */
    if (peer && dev) {
        const char *op_id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);

        CRM_CHECK(op_id != NULL, return NULL);

        op = g_hash_table_lookup(stonith_remote_op_list, op_id);
        if (op) {
            crm_debug("Reusing existing remote fencing op %.8s for %s",
                      op_id, ((client == NULL)? "unknown client" : client));
            return op;
        }
    }

    op = calloc(1, sizeof(remote_fencing_op_t));
    CRM_ASSERT(op != NULL);

    crm_element_value_int(request, F_STONITH_TIMEOUT, &(op->base_timeout));

    crm_element_value_int(request, F_STONITH_DELAY, &(op->delay));

    /* Peer requests keep the ID chosen by the originator; local requests
     * get a freshly generated one
     */
    if (peer && dev) {
        op->id = crm_element_value_copy(dev, F_STONITH_REMOTE_OP_ID);
    } else {
        op->id = crm_generate_uuid();
    }

    /* The table takes ownership of op; op->id doubles as the key */
    g_hash_table_replace(stonith_remote_op_list, op->id, op);

    op->state = st_query;
    op->replies_expected = fencing_active_peers();
    op->action = crm_element_value_copy(dev, F_STONITH_ACTION);
    op->originator = crm_element_value_copy(dev, F_STONITH_ORIGIN);
    op->delegate = crm_element_value_copy(dev, F_STONITH_DELEGATE);
    op->created = time(NULL);

    if (op->originator == NULL) {
        /* No origin in the request, so this node is recorded as originator */
        op->originator = strdup(stonith_our_uname);
    }

    CRM_LOG_ASSERT(client != NULL);
    if (client) {
        op->client_id = strdup(client);
    }

    /* For a relayed operation the client name is built from this process's
     * name and PID; otherwise it is taken from the request
     */
    operation = crm_element_value(request, F_STONITH_OPERATION);

    if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
        op->client_name = crm_strdup_printf("%s.%lu", crm_system_name,
                                         (unsigned long) getpid());
    } else {
        op->client_name = crm_element_value_copy(request, F_STONITH_CLIENTNAME);
    }

    op->target = crm_element_value_copy(dev, F_STONITH_TARGET);
    op->request = copy_xml(request);
    crm_element_value_int(request, F_STONITH_CALLOPTS, &call_options);
    op->call_options = call_options;

    crm_element_value_int(request, F_STONITH_CALLID, &(op->client_callid));

    crm_trace("%s new fencing op %s ('%s' targeting %s for client %s, "
              "base timeout %d, %u %s expected)",
              (peer && dev)? "Recorded" : "Generated", op->id, op->action,
              op->target, op->client_name, op->base_timeout,
              op->replies_expected,
              pcmk__plural_alt(op->replies_expected, "reply", "replies"));

    /* With st_opt_cs_nodeid the target is a numeric node ID: translate it
     * to a node name when the node is known
     */
    if (op->call_options & st_opt_cs_nodeid) {
        int nodeid;
        crm_node_t *node;

        pcmk__scan_min_int(op->target, &nodeid, 0);
        node = pcmk__search_known_node_cache(nodeid, NULL, CRM_GET_PEER_ANY);

        /* The option is cleared whether or not the lookup succeeds */
        stonith__clear_call_options(op->call_options, op->id, st_opt_cs_nodeid);

        if (node && node->uname) {
            free(op->target);
            op->target = strdup(node->uname);

        } else {
            crm_warn("Could not expand nodeid '%s' into a host name", op->target);
        }
    }

    /* May change op->state to st_duplicate */
    merge_duplicates(op);

    if (op->state != st_duplicate) {
        /* Announce the history change only for non-merged operations */
        do_stonith_notify(0, T_STONITH_NOTIFY_HISTORY, 0, NULL);
    }

    /* A new entry was added, so give the history a chance to be trimmed */
    stonith_fence_history_trim();

    return op;
}
1116
/*
 * Start a fencing operation owned by this node: create its record, select
 * the first topology level (if any), and broadcast a device query to peers.
 *
 * client      Requesting client, or NULL to take the client ID from request
 * request     Request XML
 * manual_ack  When TRUE, only create and return the operation; no query is
 *             sent
 *
 * Returns the operation, or NULL if it has no action. If the operation
 * failed or was merged into another one, it is returned without a query
 * being sent.
 */
remote_fencing_op_t *
initiate_remote_stonith_op(pcmk__client_t *client, xmlNode *request,
                           gboolean manual_ack)
{
    int query_timeout = 0;
    xmlNode *query = NULL;
    const char *client_id = NULL;
    remote_fencing_op_t *op = NULL;
    const char *relay_op_id = NULL;
    const char *operation = NULL;

    if (client) {
        client_id = client->id;
    } else {
        client_id = crm_element_value(request, F_STONITH_CLIENTID);
    }

    CRM_LOG_ASSERT(client_id != NULL);
    op = create_remote_stonith_op(client_id, request, FALSE);
    op->owner = TRUE;
    if (manual_ack) {
        crm_notice("Processing manual confirmation of fencing targeting %s "
                   CRM_XS " id=%.8s", op->target, op->id);
        return op;
    }

    CRM_CHECK(op->action, return NULL);

    /* A target without any topology is acceptable here (empty_ok = true) */
    if (advance_topology_level(op, true) != pcmk_rc_ok) {
        op->state = st_failed;
    }

    switch (op->state) {
        case st_failed:
            crm_warn("Could not request peer fencing (%s) targeting %s "
                     CRM_XS " id=%.8s", op->action, op->target, op->id);
            remote_op_done(op, NULL, -EINVAL, FALSE);
            return op;

        case st_duplicate:
            /* Merged into another operation, so no query of our own */
            crm_info("Requesting peer fencing (%s) targeting %s (duplicate) "
                     CRM_XS " id=%.8s", op->action, op->target, op->id);
            return op;

        default:
            crm_notice("Requesting peer fencing (%s) targeting %s "
                       CRM_XS " id=%.8s state=%s base_timeout=%d",
                       op->action, op->target, op->id,
                       stonith_op_state_str(op->state), op->base_timeout);
    }

    query = stonith_create_op(op->client_callid, op->id, STONITH_OP_QUERY,
                              NULL, op->call_options);

    crm_xml_add(query, F_STONITH_REMOTE_OP_ID, op->id);
    crm_xml_add(query, F_STONITH_TARGET, op->target);
    /* Query with the originally requested action even if currently remapped */
    crm_xml_add(query, F_STONITH_ACTION, op_requested_action(op));
    crm_xml_add(query, F_STONITH_ORIGIN, op->originator);
    crm_xml_add(query, F_STONITH_CLIENTID, op->client_id);
    crm_xml_add(query, F_STONITH_CLIENTNAME, op->client_name);
    crm_xml_add_int(query, F_STONITH_TIMEOUT, op->base_timeout);

    /* For a relayed operation, pass along the relay request's op ID */
    operation = crm_element_value(request, F_STONITH_OPERATION);
    if (pcmk__str_eq(operation, STONITH_OP_RELAY, pcmk__str_none)) {
        relay_op_id = crm_element_value(request, F_STONITH_REMOTE_OP_ID);
        if (relay_op_id) {
            crm_xml_add(query, F_STONITH_REMOTE_OP_ID_RELAY, relay_op_id);
        }
    }

    send_cluster_message(NULL, crm_msg_stonith_ng, query, FALSE);
    free_xml(query);

    /* g_timeout_add() takes milliseconds, hence the factor of 1000 */
    query_timeout = op->base_timeout * TIMEOUT_MULTIPLY_FACTOR;
    op->query_timer = g_timeout_add((1000 * query_timeout), remote_op_query_timeout, op);

    return op;
}
1196
/* Bit flags that restrict which peers find_best_peer() will consider */
enum find_best_peer_options {
    /* Skip the peer whose host name equals the fencing target */
    FIND_PEER_SKIP_TARGET = 0x0001,
    /* Consider only the peer whose host name equals the fencing target */
    FIND_PEER_TARGET_ONLY = 0x0002,
    /* Consider only devices marked verified */
    FIND_PEER_VERIFIED_ONLY = 0x0004,
};
1205
1206 static st_query_result_t *
1207 find_best_peer(const char *device, remote_fencing_op_t * op, enum find_best_peer_options options)
1208 {
1209 GList *iter = NULL;
1210 gboolean verified_devices_only = (options & FIND_PEER_VERIFIED_ONLY) ? TRUE : FALSE;
1211
1212 if (!device && pcmk_is_set(op->call_options, st_opt_topology)) {
1213 return NULL;
1214 }
1215
1216 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1217 st_query_result_t *peer = iter->data;
1218
1219 crm_trace("Testing result from %s targeting %s with %d device%s: %d %x",
1220 peer->host, op->target, peer->ndevices,
1221 pcmk__plural_s(peer->ndevices), peer->tried, options);
1222 if ((options & FIND_PEER_SKIP_TARGET) && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1223 continue;
1224 }
1225 if ((options & FIND_PEER_TARGET_ONLY) && !pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1226 continue;
1227 }
1228
1229 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1230
1231 if (grab_peer_device(op, peer, device, verified_devices_only)) {
1232 return peer;
1233 }
1234
1235 } else if ((peer->tried == FALSE)
1236 && count_peer_devices(op, peer, verified_devices_only)) {
1237
1238
1239 crm_trace("Simple fencing");
1240 return peer;
1241 }
1242 }
1243
1244 return NULL;
1245 }
1246
1247 static st_query_result_t *
1248 stonith_choose_peer(remote_fencing_op_t * op)
1249 {
1250 const char *device = NULL;
1251 st_query_result_t *peer = NULL;
1252 uint32_t active = fencing_active_peers();
1253
1254 do {
1255 if (op->devices) {
1256 device = op->devices->data;
1257 crm_trace("Checking for someone to fence (%s) %s using %s",
1258 op->action, op->target, device);
1259 } else {
1260 crm_trace("Checking for someone to fence (%s) %s",
1261 op->action, op->target);
1262 }
1263
1264
1265 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET|FIND_PEER_VERIFIED_ONLY);
1266 if (peer) {
1267 crm_trace("Found verified peer %s for %s", peer->host, device?device:"<any>");
1268 return peer;
1269 }
1270
1271 if(op->query_timer != 0 && op->replies < QB_MIN(op->replies_expected, active)) {
1272 crm_trace("Waiting before looking for unverified devices to fence %s", op->target);
1273 return NULL;
1274 }
1275
1276
1277 peer = find_best_peer(device, op, FIND_PEER_SKIP_TARGET);
1278 if (peer) {
1279 crm_trace("Found best unverified peer %s", peer->host);
1280 return peer;
1281 }
1282
1283
1284
1285
1286 if (op->phase != st_phase_on) {
1287 peer = find_best_peer(device, op, FIND_PEER_TARGET_ONLY);
1288 if (peer) {
1289 crm_trace("%s will fence itself", peer->host);
1290 return peer;
1291 }
1292 }
1293
1294
1295
1296
1297 } while ((op->phase != st_phase_on)
1298 && pcmk_is_set(op->call_options, st_opt_topology)
1299 && (advance_topology_level(op, false) == pcmk_rc_ok));
1300
1301 crm_notice("Couldn't find anyone to fence (%s) %s using %s",
1302 op->action, op->target, (device? device : "any device"));
1303 return NULL;
1304 }
1305
1306 static int
1307 get_device_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer,
1308 const char *device)
1309 {
1310 device_properties_t *props;
1311
1312 if (!peer || !device) {
1313 return op->base_timeout;
1314 }
1315
1316 props = g_hash_table_lookup(peer->devices, device);
1317 if (!props) {
1318 return op->base_timeout;
1319 }
1320
1321 return (props->custom_action_timeout[op->phase]?
1322 props->custom_action_timeout[op->phase] : op->base_timeout)
1323 + props->delay_max[op->phase];
1324 }
1325
/* State shared with add_device_timeout() while it is run over a peer's
 * device table by g_hash_table_foreach()
 */
struct timeout_data {
    const remote_fencing_op_t *op;  /* operation whose phase selects the values read */
    const st_query_result_t *peer;  /* peer whose devices are being visited */
    int total_timeout;              /* running sum of the counted device timeouts */
};
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340 static void
1341 add_device_timeout(gpointer key, gpointer value, gpointer user_data)
1342 {
1343 const char *device_id = key;
1344 device_properties_t *props = value;
1345 struct timeout_data *timeout = user_data;
1346
1347 if (!props->executed[timeout->op->phase]
1348 && !props->disallowed[timeout->op->phase]) {
1349 timeout->total_timeout += get_device_timeout(timeout->op,
1350 timeout->peer, device_id);
1351 }
1352 }
1353
1354 static int
1355 get_peer_timeout(const remote_fencing_op_t *op, const st_query_result_t *peer)
1356 {
1357 struct timeout_data timeout;
1358
1359 timeout.op = op;
1360 timeout.peer = peer;
1361 timeout.total_timeout = 0;
1362
1363 g_hash_table_foreach(peer->devices, add_device_timeout, &timeout);
1364
1365 return (timeout.total_timeout? timeout.total_timeout : op->base_timeout);
1366 }
1367
1368 static int
1369 get_op_total_timeout(const remote_fencing_op_t *op,
1370 const st_query_result_t *chosen_peer)
1371 {
1372 int total_timeout = 0;
1373 stonith_topology_t *tp = find_topology_for_host(op->target);
1374
1375 if (pcmk_is_set(op->call_options, st_opt_topology) && tp) {
1376 int i;
1377 GList *device_list = NULL;
1378 GList *iter = NULL;
1379
1380
1381
1382
1383
1384
1385
1386
1387 for (i = 0; i < ST_LEVEL_MAX; i++) {
1388 if (!tp->levels[i]) {
1389 continue;
1390 }
1391 for (device_list = tp->levels[i]; device_list; device_list = device_list->next) {
1392 for (iter = op->query_results; iter != NULL; iter = iter->next) {
1393 const st_query_result_t *peer = iter->data;
1394
1395 if (find_peer_device(op, peer, device_list->data)) {
1396 total_timeout += get_device_timeout(op, peer,
1397 device_list->data);
1398 break;
1399 }
1400 }
1401 }
1402 }
1403
1404 } else if (chosen_peer) {
1405 total_timeout = get_peer_timeout(op, chosen_peer);
1406 } else {
1407 total_timeout = op->base_timeout;
1408 }
1409
1410 return total_timeout ? total_timeout : op->base_timeout;
1411 }
1412
1413 static void
1414 report_timeout_period(remote_fencing_op_t * op, int op_timeout)
1415 {
1416 GList *iter = NULL;
1417 xmlNode *update = NULL;
1418 const char *client_node = NULL;
1419 const char *client_id = NULL;
1420 const char *call_id = NULL;
1421
1422 if (op->call_options & st_opt_sync_call) {
1423
1424
1425
1426
1427 return;
1428 } else if (!op->request) {
1429 return;
1430 }
1431
1432 crm_trace("Reporting timeout for %s (id=%.8s)", op->client_name, op->id);
1433 client_node = crm_element_value(op->request, F_STONITH_CLIENTNODE);
1434 call_id = crm_element_value(op->request, F_STONITH_CALLID);
1435 client_id = crm_element_value(op->request, F_STONITH_CLIENTID);
1436 if (!client_node || !call_id || !client_id) {
1437 return;
1438 }
1439
1440 if (pcmk__str_eq(client_node, stonith_our_uname, pcmk__str_casei)) {
1441
1442 do_stonith_async_timeout_update(client_id, call_id, op_timeout);
1443 return;
1444 }
1445
1446
1447 update = stonith_create_op(op->client_callid, op->id, STONITH_OP_TIMEOUT_UPDATE, NULL, 0);
1448 crm_xml_add(update, F_STONITH_REMOTE_OP_ID, op->id);
1449 crm_xml_add(update, F_STONITH_CLIENTID, client_id);
1450 crm_xml_add(update, F_STONITH_CALLID, call_id);
1451 crm_xml_add_int(update, F_STONITH_TIMEOUT, op_timeout);
1452
1453 send_cluster_message(crm_get_peer(0, client_node), crm_msg_stonith_ng, update, FALSE);
1454
1455 free_xml(update);
1456
1457 for (iter = op->duplicates; iter != NULL; iter = iter->next) {
1458 remote_fencing_op_t *dup = iter->data;
1459
1460 crm_trace("Reporting timeout for duplicate %.8s to client %s",
1461 dup->id, dup->client_name);
1462 report_timeout_period(iter->data, op_timeout);
1463 }
1464 }
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475 static void
1476 advance_topology_device_in_level(remote_fencing_op_t *op, const char *device,
1477 xmlNode *msg, int rc)
1478 {
1479
1480 if (op->devices) {
1481 op->devices = op->devices->next;
1482 }
1483
1484
1485 if ((op->phase == st_phase_requested) && pcmk__str_eq(op->action, "on", pcmk__str_casei)) {
1486
1487 remove_required_device(op, device);
1488
1489
1490
1491
1492 if (op->devices == NULL) {
1493 op->devices = op->automatic_list;
1494 }
1495 }
1496
1497 if ((op->devices == NULL) && (op->phase == st_phase_off)) {
1498
1499
1500
1501
1502 op_phase_on(op);
1503 }
1504
1505 if (op->devices) {
1506
1507 crm_trace("Next targeting %s on behalf of %s@%s (rc was %d)",
1508 op->target, op->client_name, op->originator, rc);
1509
1510
1511 if (op->delay > 0) {
1512 op->delay = 0;
1513 }
1514
1515 call_remote_stonith(op, NULL, pcmk_ok);
1516 } else {
1517
1518 crm_trace("Marking complex fencing op targeting %s as complete",
1519 op->target);
1520 op->state = st_done;
1521 remote_op_done(op, msg, rc, FALSE);
1522 }
1523 }
1524
1525 void
1526 call_remote_stonith(remote_fencing_op_t * op, st_query_result_t * peer, int rc)
1527 {
1528 const char *device = NULL;
1529 int timeout = op->base_timeout;
1530
1531 crm_trace("Action %.8s targeting %s for %s is %s",
1532 op->id, op->target, op->client_name,
1533 stonith_op_state_str(op->state));
1534 if ((peer == NULL) && !pcmk_is_set(op->call_options, st_opt_topology)) {
1535 peer = stonith_choose_peer(op);
1536 }
1537
1538 if (!op->op_timer_total) {
1539 int total_timeout = get_op_total_timeout(op, peer);
1540
1541 op->total_timeout = TIMEOUT_MULTIPLY_FACTOR * total_timeout;
1542 op->op_timer_total = g_timeout_add(1000 * op->total_timeout, remote_op_timeout, op);
1543 report_timeout_period(op, op->total_timeout);
1544 crm_info("Total timeout set to %d for peer's fencing targeting %s for %s"
1545 CRM_XS "id=%.8s",
1546 total_timeout, op->target, op->client_name, op->id);
1547 }
1548
1549 if (pcmk_is_set(op->call_options, st_opt_topology) && op->devices) {
1550
1551
1552
1553 peer = stonith_choose_peer(op);
1554
1555 device = op->devices->data;
1556 timeout = get_device_timeout(op, peer, device);
1557 }
1558
1559 if (peer) {
1560 int timeout_one = 0;
1561 xmlNode *remote_op = stonith_create_op(op->client_callid, op->id, STONITH_OP_FENCE, NULL, 0);
1562
1563 crm_xml_add(remote_op, F_STONITH_REMOTE_OP_ID, op->id);
1564 crm_xml_add(remote_op, F_STONITH_TARGET, op->target);
1565 crm_xml_add(remote_op, F_STONITH_ACTION, op->action);
1566 crm_xml_add(remote_op, F_STONITH_ORIGIN, op->originator);
1567 crm_xml_add(remote_op, F_STONITH_CLIENTID, op->client_id);
1568 crm_xml_add(remote_op, F_STONITH_CLIENTNAME, op->client_name);
1569 crm_xml_add_int(remote_op, F_STONITH_TIMEOUT, timeout);
1570 crm_xml_add_int(remote_op, F_STONITH_CALLOPTS, op->call_options);
1571 crm_xml_add_int(remote_op, F_STONITH_DELAY, op->delay);
1572
1573 if (device) {
1574 timeout_one = TIMEOUT_MULTIPLY_FACTOR *
1575 get_device_timeout(op, peer, device);
1576 crm_notice("Requesting that %s perform '%s' action targeting %s "
1577 "using %s " CRM_XS " for client %s (%ds)",
1578 peer->host, op->action, op->target, device,
1579 op->client_name, timeout_one);
1580 crm_xml_add(remote_op, F_STONITH_DEVICE, device);
1581
1582 } else {
1583 timeout_one = TIMEOUT_MULTIPLY_FACTOR * get_peer_timeout(op, peer);
1584 crm_notice("Requesting that %s perform '%s' action targeting %s "
1585 CRM_XS " for client %s (%ds, %lds)",
1586 peer->host, op->action, op->target, op->client_name,
1587 timeout_one, stonith_watchdog_timeout_ms);
1588 }
1589
1590 op->state = st_exec;
1591 if (op->op_timer_one) {
1592 g_source_remove(op->op_timer_one);
1593 }
1594
1595 if(stonith_watchdog_timeout_ms > 0 && device && pcmk__str_eq(device, "watchdog", pcmk__str_casei)) {
1596 crm_notice("Waiting %lds for %s to self-fence (%s) for client %s "
1597 CRM_XS " id=%.8s", (stonith_watchdog_timeout_ms / 1000),
1598 op->target, op->action, op->client_name, op->id);
1599 op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
1600
1601
1602 } else if(stonith_watchdog_timeout_ms > 0
1603 && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)
1604 && !pcmk__str_eq(op->action, "on", pcmk__str_casei)) {
1605 crm_notice("Waiting %lds for %s to self-fence (%s) for client %s "
1606 CRM_XS " id=%.8s", (stonith_watchdog_timeout_ms / 1000),
1607 op->target, op->action, op->client_name, op->id);
1608 op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
1609
1610 } else {
1611 op->op_timer_one = g_timeout_add((1000 * timeout_one), remote_op_timeout_one, op);
1612 }
1613
1614
1615 send_cluster_message(crm_get_peer(0, peer->host), crm_msg_stonith_ng, remote_op, FALSE);
1616 peer->tried = TRUE;
1617 free_xml(remote_op);
1618 return;
1619
1620 } else if (op->phase == st_phase_on) {
1621
1622
1623
1624 crm_warn("Ignoring %s 'on' failure (no capable peers) targeting %s "
1625 "after successful 'off'", device, op->target);
1626 advance_topology_device_in_level(op, device, NULL, pcmk_ok);
1627 return;
1628
1629 } else if (op->owner == FALSE) {
1630 crm_err("Fencing (%s) targeting %s for client %s is not ours to control",
1631 op->action, op->target, op->client_name);
1632
1633 } else if (op->query_timer == 0) {
1634
1635 crm_info("No remaining peers capable of fencing (%s) %s for client %s "
1636 CRM_XS " state=%s", op->action, op->target, op->client_name,
1637 stonith_op_state_str(op->state));
1638 CRM_LOG_ASSERT(op->state < st_done);
1639 remote_op_timeout(op);
1640
1641 } else if(op->replies >= op->replies_expected || op->replies >= fencing_active_peers()) {
1642
1643
1644
1645
1646
1647
1648 if(stonith_watchdog_timeout_ms && pcmk__str_eq(device, "watchdog", pcmk__str_null_matches | pcmk__str_casei)) {
1649 crm_notice("Waiting %lds for %s to self-fence (%s) for client %s "
1650 CRM_XS " id=%.8s", (stonith_watchdog_timeout_ms / 1000),
1651 op->target, op->action, op->client_name, op->id);
1652 op->op_timer_one = g_timeout_add(stonith_watchdog_timeout_ms, remote_op_watchdog_done, op);
1653 return;
1654 }
1655
1656 if (op->state == st_query) {
1657 crm_info("No peers (out of %d) have devices capable of fencing "
1658 "(%s) %s for client %s " CRM_XS " state=%s",
1659 op->replies, op->action, op->target, op->client_name,
1660 stonith_op_state_str(op->state));
1661
1662 rc = -ENODEV;
1663 } else {
1664 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1665 rc = -EHOSTUNREACH;
1666 }
1667
1668 crm_info("No peers (out of %d) are capable of fencing (%s) %s "
1669 "for client %s " CRM_XS " state=%s",
1670 op->replies, op->action, op->target, op->client_name,
1671 stonith_op_state_str(op->state));
1672 }
1673
1674 op->state = st_failed;
1675 remote_op_done(op, NULL, rc, FALSE);
1676
1677 } else {
1678 crm_info("Waiting for additional peers capable of fencing (%s) %s%s%s "
1679 "for client %s " CRM_XS " id=%.8s",
1680 op->action, op->target, (device? " using " : ""),
1681 (device? device : ""), op->client_name, op->id);
1682 }
1683 }
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696 static gint
1697 sort_peers(gconstpointer a, gconstpointer b)
1698 {
1699 const st_query_result_t *peer_a = a;
1700 const st_query_result_t *peer_b = b;
1701
1702 return (peer_b->ndevices - peer_a->ndevices);
1703 }
1704
1705
1706
1707
1708
1709 static gboolean
1710 all_topology_devices_found(remote_fencing_op_t * op)
1711 {
1712 GList *device = NULL;
1713 GList *iter = NULL;
1714 device_properties_t *match = NULL;
1715 stonith_topology_t *tp = NULL;
1716 gboolean skip_target = FALSE;
1717 int i;
1718
1719 tp = find_topology_for_host(op->target);
1720 if (!tp) {
1721 return FALSE;
1722 }
1723 if (pcmk__strcase_any_of(op->action, "off", "reboot", NULL)) {
1724
1725
1726 skip_target = TRUE;
1727 }
1728
1729 for (i = 0; i < ST_LEVEL_MAX; i++) {
1730 for (device = tp->levels[i]; device; device = device->next) {
1731 match = NULL;
1732 for (iter = op->query_results; iter && !match; iter = iter->next) {
1733 st_query_result_t *peer = iter->data;
1734
1735 if (skip_target && pcmk__str_eq(peer->host, op->target, pcmk__str_casei)) {
1736 continue;
1737 }
1738 match = find_peer_device(op, peer, device->data);
1739 }
1740 if (!match) {
1741 return FALSE;
1742 }
1743 }
1744 }
1745
1746 return TRUE;
1747 }
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760 static void
1761 parse_action_specific(xmlNode *xml, const char *peer, const char *device,
1762 const char *action, remote_fencing_op_t *op,
1763 enum st_remap_phase phase, device_properties_t *props)
1764 {
1765 props->custom_action_timeout[phase] = 0;
1766 crm_element_value_int(xml, F_STONITH_ACTION_TIMEOUT,
1767 &props->custom_action_timeout[phase]);
1768 if (props->custom_action_timeout[phase]) {
1769 crm_trace("Peer %s with device %s returned %s action timeout %d",
1770 peer, device, action, props->custom_action_timeout[phase]);
1771 }
1772
1773 props->delay_max[phase] = 0;
1774 crm_element_value_int(xml, F_STONITH_DELAY_MAX, &props->delay_max[phase]);
1775 if (props->delay_max[phase]) {
1776 crm_trace("Peer %s with device %s returned maximum of random delay %d for %s",
1777 peer, device, props->delay_max[phase], action);
1778 }
1779
1780 props->delay_base[phase] = 0;
1781 crm_element_value_int(xml, F_STONITH_DELAY_BASE, &props->delay_base[phase]);
1782 if (props->delay_base[phase]) {
1783 crm_trace("Peer %s with device %s returned base delay %d for %s",
1784 peer, device, props->delay_base[phase], action);
1785 }
1786
1787
1788 if (pcmk__str_eq(action, "on", pcmk__str_casei)) {
1789 int required = 0;
1790
1791 crm_element_value_int(xml, F_STONITH_DEVICE_REQUIRED, &required);
1792 if (required) {
1793 crm_trace("Peer %s requires device %s to execute for action %s",
1794 peer, device, action);
1795 add_required_device(op, device);
1796 }
1797 }
1798
1799
1800
1801
1802 if (crm_is_true(crm_element_value(xml, F_STONITH_ACTION_DISALLOWED))) {
1803 props->disallowed[phase] = TRUE;
1804 crm_trace("Peer %s is disallowed from executing %s for device %s",
1805 peer, action, device);
1806 }
1807 }
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818 static void
1819 add_device_properties(xmlNode *xml, remote_fencing_op_t *op,
1820 st_query_result_t *result, const char *device)
1821 {
1822 xmlNode *child;
1823 int verified = 0;
1824 device_properties_t *props = calloc(1, sizeof(device_properties_t));
1825
1826
1827 CRM_ASSERT(props != NULL);
1828 g_hash_table_insert(result->devices, strdup(device), props);
1829
1830
1831 crm_element_value_int(xml, F_STONITH_DEVICE_VERIFIED, &verified);
1832 if (verified) {
1833 crm_trace("Peer %s has confirmed a verified device %s",
1834 result->host, device);
1835 props->verified = TRUE;
1836 }
1837
1838
1839 parse_action_specific(xml, result->host, device, op_requested_action(op),
1840 op, st_phase_requested, props);
1841 for (child = pcmk__xml_first_child(xml); child != NULL;
1842 child = pcmk__xml_next(child)) {
1843
1844
1845
1846
1847 if (pcmk__str_eq(ID(child), "off", pcmk__str_casei)) {
1848 parse_action_specific(child, result->host, device, "off",
1849 op, st_phase_off, props);
1850 } else if (pcmk__str_eq(ID(child), "on", pcmk__str_casei)) {
1851 parse_action_specific(child, result->host, device, "on",
1852 op, st_phase_on, props);
1853 }
1854 }
1855 }
1856
1857
1858
1859
1860
1861
1862
1863
1864
1865
1866
1867
1868 static st_query_result_t *
1869 add_result(remote_fencing_op_t *op, const char *host, int ndevices, xmlNode *xml)
1870 {
1871 st_query_result_t *result = calloc(1, sizeof(st_query_result_t));
1872 xmlNode *child;
1873
1874
1875
1876 CRM_CHECK(result != NULL, return NULL);
1877 result->host = strdup(host);
1878 result->devices = pcmk__strkey_table(free, free);
1879
1880
1881 for (child = pcmk__xml_first_child(xml); child != NULL;
1882 child = pcmk__xml_next(child)) {
1883 const char *device = ID(child);
1884
1885 if (device) {
1886 add_device_properties(child, op, result, device);
1887 }
1888 }
1889
1890 result->ndevices = g_hash_table_size(result->devices);
1891 CRM_CHECK(ndevices == result->ndevices,
1892 crm_err("Query claimed to have %d device%s but %d found",
1893 ndevices, pcmk__plural_s(ndevices), result->ndevices));
1894
1895 op->query_results = g_list_insert_sorted(op->query_results, result, sort_peers);
1896 return result;
1897 }
1898
1899
1900
1901
1902
1903
1904
1905
1906
1907
1908
1909
1910
1911
1912
1913 int
1914 process_remote_stonith_query(xmlNode * msg)
1915 {
1916 int ndevices = 0;
1917 gboolean host_is_target = FALSE;
1918 gboolean have_all_replies = FALSE;
1919 const char *id = NULL;
1920 const char *host = NULL;
1921 remote_fencing_op_t *op = NULL;
1922 st_query_result_t *result = NULL;
1923 uint32_t replies_expected;
1924 xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
1925
1926 CRM_CHECK(dev != NULL, return -EPROTO);
1927
1928 id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
1929 CRM_CHECK(id != NULL, return -EPROTO);
1930
1931 dev = get_xpath_object("//@" F_STONITH_AVAILABLE_DEVICES, msg, LOG_ERR);
1932 CRM_CHECK(dev != NULL, return -EPROTO);
1933 crm_element_value_int(dev, F_STONITH_AVAILABLE_DEVICES, &ndevices);
1934
1935 op = g_hash_table_lookup(stonith_remote_op_list, id);
1936 if (op == NULL) {
1937 crm_debug("Received query reply for unknown or expired operation %s",
1938 id);
1939 return -EOPNOTSUPP;
1940 }
1941
1942 replies_expected = fencing_active_peers();
1943 if (op->replies_expected < replies_expected) {
1944 replies_expected = op->replies_expected;
1945 }
1946 if ((++op->replies >= replies_expected) && (op->state == st_query)) {
1947 have_all_replies = TRUE;
1948 }
1949 host = crm_element_value(msg, F_ORIG);
1950 host_is_target = pcmk__str_eq(host, op->target, pcmk__str_casei);
1951
1952 crm_info("Query result %d of %d from %s for %s/%s (%d device%s) %s",
1953 op->replies, replies_expected, host,
1954 op->target, op->action, ndevices, pcmk__plural_s(ndevices), id);
1955 if (ndevices > 0) {
1956 result = add_result(op, host, ndevices, dev);
1957 }
1958
1959 if (pcmk_is_set(op->call_options, st_opt_topology)) {
1960
1961
1962
1963 if (op->state == st_query && all_topology_devices_found(op)) {
1964
1965 crm_trace("All topology devices found");
1966 call_remote_stonith(op, result, pcmk_ok);
1967
1968 } else if (have_all_replies) {
1969 crm_info("All topology query replies have arrived, continuing (%d expected/%d received) ",
1970 replies_expected, op->replies);
1971 call_remote_stonith(op, NULL, pcmk_ok);
1972 }
1973
1974 } else if (op->state == st_query) {
1975 int nverified = count_peer_devices(op, result, TRUE);
1976
1977
1978
1979 if (result && (host_is_target == FALSE) && nverified) {
1980
1981 crm_trace("Found %d verified device%s",
1982 nverified, pcmk__plural_s(nverified));
1983 call_remote_stonith(op, result, pcmk_ok);
1984
1985 } else if (have_all_replies) {
1986 crm_info("All query replies have arrived, continuing (%d expected/%d received) ",
1987 replies_expected, op->replies);
1988 call_remote_stonith(op, NULL, pcmk_ok);
1989
1990 } else {
1991 crm_trace("Waiting for more peer results before launching fencing operation");
1992 }
1993
1994 } else if (result && (op->state == st_done)) {
1995 crm_info("Discarding query result from %s (%d device%s): "
1996 "Operation is %s", result->host,
1997 result->ndevices, pcmk__plural_s(result->ndevices),
1998 stonith_op_state_str(op->state));
1999 }
2000
2001 return pcmk_ok;
2002 }
2003
2004
2005
2006
2007
2008
2009
2010
2011
2012
2013
2014
2015 int
2016 process_remote_stonith_exec(xmlNode * msg)
2017 {
2018 int rc = 0;
2019 const char *id = NULL;
2020 const char *device = NULL;
2021 remote_fencing_op_t *op = NULL;
2022 xmlNode *dev = get_xpath_object("//@" F_STONITH_REMOTE_OP_ID, msg, LOG_ERR);
2023
2024 CRM_CHECK(dev != NULL, return -EPROTO);
2025
2026 id = crm_element_value(dev, F_STONITH_REMOTE_OP_ID);
2027 CRM_CHECK(id != NULL, return -EPROTO);
2028
2029 dev = get_xpath_object("//@" F_STONITH_RC, msg, LOG_ERR);
2030 CRM_CHECK(dev != NULL, return -EPROTO);
2031
2032 crm_element_value_int(dev, F_STONITH_RC, &rc);
2033
2034 device = crm_element_value(dev, F_STONITH_DEVICE);
2035
2036 if (stonith_remote_op_list) {
2037 op = g_hash_table_lookup(stonith_remote_op_list, id);
2038 }
2039
2040 if (op == NULL && rc == pcmk_ok) {
2041
2042 const char *client_id = crm_element_value(dev, F_STONITH_CLIENTID);
2043
2044 op = create_remote_stonith_op(client_id, dev, TRUE);
2045 }
2046
2047 if (op == NULL) {
2048
2049
2050 crm_info("Received peer result of unknown or expired operation %s", id);
2051 return -EOPNOTSUPP;
2052 }
2053
2054 if (op->devices && device && !pcmk__str_eq(op->devices->data, device, pcmk__str_casei)) {
2055 crm_err("Received outdated reply for device %s (instead of %s) to "
2056 "fence (%s) %s. Operation already timed out at peer level.",
2057 device, (const char *) op->devices->data, op->action, op->target);
2058 return rc;
2059 }
2060
2061 if (pcmk__str_eq(crm_element_value(msg, F_SUBTYPE), "broadcast", pcmk__str_casei)) {
2062 crm_debug("Finalizing action '%s' targeting %s on behalf of %s@%s: %s "
2063 CRM_XS " rc=%d id=%.8s",
2064 op->action, op->target, op->client_name, op->originator,
2065 pcmk_strerror(rc), rc, op->id);
2066 if (rc == pcmk_ok) {
2067 op->state = st_done;
2068 } else {
2069 op->state = st_failed;
2070 }
2071 remote_op_done(op, msg, rc, FALSE);
2072 return pcmk_ok;
2073 } else if (!pcmk__str_eq(op->originator, stonith_our_uname, pcmk__str_casei)) {
2074
2075
2076 crm_err("Received non-broadcast fencing result for operation %.8s "
2077 "we do not own (device %s targeting %s)",
2078 op->id, device, op->target);
2079 return rc;
2080 }
2081
2082 if (pcmk_is_set(op->call_options, st_opt_topology)) {
2083 const char *device = crm_element_value(msg, F_STONITH_DEVICE);
2084
2085 crm_notice("Action '%s' targeting %s using %s on behalf of %s@%s: %s "
2086 CRM_XS " rc=%d",
2087 op->action, op->target, device, op->client_name,
2088 op->originator, pcmk_strerror(rc), rc);
2089
2090
2091
2092 if (op->state == st_done) {
2093 remote_op_done(op, msg, rc, FALSE);
2094 return rc;
2095 }
2096
2097 if ((op->phase == 2) && (rc != pcmk_ok)) {
2098
2099
2100
2101 crm_warn("Ignoring %s 'on' failure (exit code %d) targeting %s "
2102 "after successful 'off'", device, rc, op->target);
2103 rc = pcmk_ok;
2104 }
2105
2106 if (rc == pcmk_ok) {
2107
2108
2109 advance_topology_device_in_level(op, device, msg, rc);
2110 return rc;
2111 } else {
2112
2113
2114 if (advance_topology_level(op, false) != pcmk_rc_ok) {
2115 op->state = st_failed;
2116 remote_op_done(op, msg, rc, FALSE);
2117 return rc;
2118 }
2119 }
2120 } else if (rc == pcmk_ok && op->devices == NULL) {
2121 crm_trace("All done for %s", op->target);
2122
2123 op->state = st_done;
2124 remote_op_done(op, msg, rc, FALSE);
2125 return rc;
2126 } else if (rc == -ETIME && op->devices == NULL) {
2127
2128 op->state = st_failed;
2129 remote_op_done(op, msg, rc, FALSE);
2130 return rc;
2131 } else {
2132
2133 }
2134
2135
2136 crm_trace("Next for %s on behalf of %s@%s (rc was %d)", op->target, op->originator,
2137 op->client_name, rc);
2138 call_remote_stonith(op, NULL, rc);
2139 return rc;
2140 }
2141
2142 gboolean
2143 stonith_check_fence_tolerance(int tolerance, const char *target, const char *action)
2144 {
2145 GHashTableIter iter;
2146 time_t now = time(NULL);
2147 remote_fencing_op_t *rop = NULL;
2148
2149 if (tolerance <= 0 || !stonith_remote_op_list || target == NULL ||
2150 action == NULL) {
2151 return FALSE;
2152 }
2153
2154 g_hash_table_iter_init(&iter, stonith_remote_op_list);
2155 while (g_hash_table_iter_next(&iter, NULL, (void **)&rop)) {
2156 if (strcmp(rop->target, target) != 0) {
2157 continue;
2158 } else if (rop->state != st_done) {
2159 continue;
2160
2161
2162
2163 } else if (strcmp(rop->action, action) != 0) {
2164 continue;
2165 } else if ((rop->completed + tolerance) < now) {
2166 continue;
2167 }
2168
2169 crm_notice("Target %s was fenced (%s) less than %ds ago by %s on behalf of %s",
2170 target, action, tolerance, rop->delegate, rop->originator);
2171 return TRUE;
2172 }
2173 return FALSE;
2174 }