root/daemons/controld/controld_throttle.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. load2str
  2. find_cib_loadfile
  3. throttle_cib_load
  4. throttle_load_avg
  5. throttle_check_thresholds
  6. throttle_handle_load
  7. throttle_mode
  8. throttle_send_command
  9. throttle_timer_cb
  10. throttle_record_free
  11. throttle_set_load_target
  12. throttle_update_job_max
  13. throttle_init
  14. controld_configure_throttle
  15. throttle_fini
  16. throttle_get_total_job_limit
  17. throttle_get_job_limit
  18. throttle_update

   1 /*
   2  * Copyright 2013-2024 the Pacemaker project contributors
   3  *
   4  * The version control history for this file may have further details.
   5  *
   6  * This source code is licensed under the GNU General Public License version 2
   7  * or later (GPLv2+) WITHOUT ANY WARRANTY.
   8  */
   9 
  10 #include <crm_internal.h>
  11 
  12 #include <sys/types.h>
  13 #include <sys/stat.h>
  14 
  15 #include <unistd.h>
  16 #include <ctype.h>
  17 #include <dirent.h>
  18 
  19 #include <crm/crm.h>
  20 #include <crm/common/xml.h>
  21 #include <crm/cluster.h>
  22 
  23 #include <pacemaker-controld.h>
  24 
/* Load levels that a node can report, from least to most loaded.
 *
 * These values don't need to be bits, but these particular values must be kept
 * for backward compatibility during rolling upgrades.
 *
 * The numeric ordering matters: throttle_mode() compares two levels with ">"
 * to pick the more severe one.
 */
enum throttle_state_e {
    throttle_none       = 0x0000,   // Reported by load2str() as "negligible"
    throttle_low        = 0x0001,   // Reported by load2str() as "low"
    throttle_med        = 0x0010,   // Reported by load2str() as "medium"
    throttle_high       = 0x0100,   // Reported by load2str() as "high"
    throttle_extreme    = 0x1000,   // Reported by load2str() as "extreme"
};
  35 
/* Last known throttling information for one node, stored in throttle_records
 * with the node name as the key
 */
struct throttle_record_s {
    int max;                    // Maximum number of jobs the node supports
    enum throttle_state_e mode; // Load level last recorded for the node
    char *node;                 // Node name (owned; freed with the record)
};
  41 
// Local job maximum; set by throttle_update_job_max() and sent to peers
static int throttle_job_max = 0;

// Load target as a fraction (configured percentage / 100); <= 0 disables the
// load-average check in throttle_mode()
static float throttle_load_target = 0.0;

/* Multipliers applied to the (per-core normalized) load target to obtain the
 * low, medium, and high load-average thresholds in throttle_handle_load()
 */
#define THROTTLE_FACTOR_LOW    1.2
#define THROTTLE_FACTOR_MEDIUM 1.6
#define THROTTLE_FACTOR_HIGH   2.0

// Node name -> struct throttle_record_s (created in throttle_init())
static GHashTable *throttle_records = NULL;

// Timer that periodically runs throttle_timer_cb() (created in throttle_init())
static mainloop_timer_t *throttle_timer = NULL;
  51 
  52 static const char *
  53 load2str(enum throttle_state_e mode)
     /* [previous][next][first][last][top][bottom][index][help] */
  54 {
  55     switch (mode) {
  56         case throttle_extreme:  return "extreme";
  57         case throttle_high:     return "high";
  58         case throttle_med:      return "medium";
  59         case throttle_low:      return "low";
  60         case throttle_none:     return "negligible";
  61         default:                return "undetermined";
  62     }
  63 }
  64 
  65 #if HAVE_LINUX_PROCFS
  66 /*!
  67  * \internal
  68  * \brief Return name of /proc file containing the CIB daemon's load statistics
  69  *
  70  * \return Newly allocated memory with file name on success, NULL otherwise
  71  *
  72  * \note It is the caller's responsibility to free the return value.
  73  *       This will return NULL if the daemon is being run via valgrind.
  74  *       This should be called only on Linux systems.
  75  */
  76 static char *
  77 find_cib_loadfile(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  78 {
  79     pid_t pid = pcmk__procfs_pid_of("pacemaker-based");
  80 
  81     return pid? crm_strdup_printf("/proc/%lld/stat", (long long) pid) : NULL;
  82 }
  83 
  84 static bool
  85 throttle_cib_load(float *load)
     /* [previous][next][first][last][top][bottom][index][help] */
  86 {
  87 /*
  88        /proc/[pid]/stat
  89               Status information about the process.  This is used by ps(1).  It is defined in /usr/src/linux/fs/proc/array.c.
  90 
  91               The fields, in order, with their proper scanf(3) format specifiers, are:
  92 
  93               pid %d      (1) The process ID.
  94 
  95               comm %s     (2) The filename of the executable, in parentheses.  This is visible whether or not the executable is swapped out.
  96 
  97               state %c    (3) One character from the string "RSDZTW" where R is running, S is sleeping in an interruptible wait, D is waiting in uninterruptible disk sleep, Z is zombie, T is traced or stopped (on a signal), and W is paging.
  98 
  99               ppid %d     (4) The PID of the parent.
 100 
 101               pgrp %d     (5) The process group ID of the process.
 102 
 103               session %d  (6) The session ID of the process.
 104 
 105               tty_nr %d   (7) The controlling terminal of the process.  (The minor device number is contained in the combination of bits 31 to 20 and 7 to 0; the major device number is in bits 15 to 8.)
 106 
 107               tpgid %d    (8) The ID of the foreground process group of the controlling terminal of the process.
 108 
 109               flags %u (%lu before Linux 2.6.22)
 110                           (9) The kernel flags word of the process.  For bit meanings, see the PF_* defines in the Linux kernel source file include/linux/sched.h.  Details depend on the kernel version.
 111 
 112               minflt %lu  (10) The number of minor faults the process has made which have not required loading a memory page from disk.
 113 
 114               cminflt %lu (11) The number of minor faults that the process's waited-for children have made.
 115 
 116               majflt %lu  (12) The number of major faults the process has made which have required loading a memory page from disk.
 117 
 118               cmajflt %lu (13) The number of major faults that the process's waited-for children have made.
 119 
 120               utime %lu   (14) Amount of time that this process has been scheduled in user mode, measured in clock ticks (divide by sysconf(_SC_CLK_TCK)).  This includes guest time, guest_time (time spent running a virtual CPU, see below), so that applications that are not aware of the guest time field do not lose that time from their calculations.
 121 
 122               stime %lu   (15) Amount of time that this process has been scheduled in kernel mode, measured in clock ticks (divide by sysconf(_SC_CLK_TCK)).
 123  */
 124 
 125     static char *loadfile = NULL;
 126     static time_t last_call = 0;
 127     static long ticks_per_s = 0;
 128     static unsigned long last_utime, last_stime;
 129 
 130     char buffer[64*1024];
 131     FILE *stream = NULL;
 132     time_t now = time(NULL);
 133 
 134     if(load == NULL) {
 135         return FALSE;
 136     } else {
 137         *load = 0.0;
 138     }
 139 
 140     if(loadfile == NULL) {
 141         last_call = 0;
 142         last_utime = 0;
 143         last_stime = 0;
 144         loadfile = find_cib_loadfile();
 145         if (loadfile == NULL) {
 146             crm_warn("Couldn't find CIB load file");
 147             return FALSE;
 148         }
 149         ticks_per_s = sysconf(_SC_CLK_TCK);
 150         crm_trace("Found %s", loadfile);
 151     }
 152 
 153     stream = fopen(loadfile, "r");
 154     if(stream == NULL) {
 155         int rc = errno;
 156 
 157         crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_rc_str(rc), rc);
 158         free(loadfile); loadfile = NULL;
 159         return FALSE;
 160     }
 161 
 162     if(fgets(buffer, sizeof(buffer), stream)) {
 163         char *comm = pcmk__assert_alloc(1, 256);
 164         char state = 0;
 165         int rc = 0, pid = 0, ppid = 0, pgrp = 0, session = 0, tty_nr = 0, tpgid = 0;
 166         unsigned long flags = 0, minflt = 0, cminflt = 0, majflt = 0, cmajflt = 0, utime = 0, stime = 0;
 167 
 168         rc = sscanf(buffer,  "%d %[^ ] %c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu",
 169                     &pid, comm, &state,
 170                     &ppid, &pgrp, &session, &tty_nr, &tpgid,
 171                     &flags, &minflt, &cminflt, &majflt, &cmajflt, &utime, &stime);
 172         free(comm);
 173 
 174         if(rc != 15) {
 175             crm_err("Only %d of 15 fields found in %s", rc, loadfile);
 176             fclose(stream);
 177             return FALSE;
 178 
 179         } else if(last_call > 0
 180            && last_call < now
 181            && last_utime <= utime
 182            && last_stime <= stime) {
 183 
 184             time_t elapsed = now - last_call;
 185             unsigned long delta_utime = utime - last_utime;
 186             unsigned long delta_stime = stime - last_stime;
 187 
 188             *load = (delta_utime + delta_stime); /* Cast to a float before division */
 189             *load /= ticks_per_s;
 190             *load /= elapsed;
 191             crm_debug("cib load: %f (%lu ticks in %lds)", *load, delta_utime + delta_stime, (long)elapsed);
 192 
 193         } else {
 194             crm_debug("Init %lu + %lu ticks at %ld (%lu tps)", utime, stime, (long)now, ticks_per_s);
 195         }
 196 
 197         last_call = now;
 198         last_utime = utime;
 199         last_stime = stime;
 200 
 201         fclose(stream);
 202         return TRUE;
 203     }
 204 
 205     fclose(stream);
 206     return FALSE;
 207 }
 208 
 209 static bool
 210 throttle_load_avg(float *load)
     /* [previous][next][first][last][top][bottom][index][help] */
 211 {
 212     char buffer[256];
 213     FILE *stream = NULL;
 214     const char *loadfile = "/proc/loadavg";
 215 
 216     if(load == NULL) {
 217         return FALSE;
 218     }
 219 
 220     stream = fopen(loadfile, "r");
 221     if(stream == NULL) {
 222         int rc = errno;
 223         crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_rc_str(rc), rc);
 224         return FALSE;
 225     }
 226 
 227     if(fgets(buffer, sizeof(buffer), stream)) {
 228         char *nl = strstr(buffer, "\n");
 229 
 230         /* Grab the 1-minute average, ignore the rest */
 231         *load = strtof(buffer, NULL);
 232         if(nl) { nl[0] = 0; }
 233 
 234         fclose(stream);
 235         return TRUE;
 236     }
 237 
 238     fclose(stream);
 239     return FALSE;
 240 }
 241 
 242 /*!
 243  * \internal
 244  * \brief Check a load value against throttling thresholds
 245  *
 246  * \param[in] load        Load value to check
 247  * \param[in] desc        Description of metric (for logging)
 248  * \param[in] thresholds  Low/medium/high/extreme thresholds
 249  *
 250  * \return Throttle mode corresponding to load value
 251  */
 252 static enum throttle_state_e
 253 throttle_check_thresholds(float load, const char *desc,
     /* [previous][next][first][last][top][bottom][index][help] */
 254                           const float thresholds[4])
 255 {
 256     if (load > thresholds[3]) {
 257         crm_notice("Extreme %s detected: %f", desc, load);
 258         return throttle_extreme;
 259 
 260     } else if (load > thresholds[2]) {
 261         crm_notice("High %s detected: %f", desc, load);
 262         return throttle_high;
 263 
 264     } else if (load > thresholds[1]) {
 265         crm_info("Moderate %s detected: %f", desc, load);
 266         return throttle_med;
 267 
 268     } else if (load > thresholds[0]) {
 269         crm_debug("Noticeable %s detected: %f", desc, load);
 270         return throttle_low;
 271     }
 272 
 273     crm_trace("Negligible %s detected: %f", desc, load);
 274     return throttle_none;
 275 }
 276 
 277 static enum throttle_state_e
 278 throttle_handle_load(float load, const char *desc, int cores)
     /* [previous][next][first][last][top][bottom][index][help] */
 279 {
 280     float normalize;
 281     float thresholds[4];
 282 
 283     if (cores == 1) {
 284         /* On a single core machine, a load of 1.0 is already too high */
 285         normalize = 0.6;
 286 
 287     } else {
 288         /* Normalize the load to be per-core */
 289         normalize = cores;
 290     }
 291     thresholds[0] = throttle_load_target * normalize * THROTTLE_FACTOR_LOW;
 292     thresholds[1] = throttle_load_target * normalize * THROTTLE_FACTOR_MEDIUM;
 293     thresholds[2] = throttle_load_target * normalize * THROTTLE_FACTOR_HIGH;
 294     thresholds[3] = load + 1.0; /* never extreme */
 295 
 296     return throttle_check_thresholds(load, desc, thresholds);
 297 }
 298 #endif // HAVE_LINUX_PROCFS
 299 
 300 static enum throttle_state_e
 301 throttle_mode(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 302 {
 303     enum throttle_state_e mode = throttle_none;
 304 
 305 #if HAVE_LINUX_PROCFS
 306     unsigned int cores;
 307     float load;
 308     float thresholds[4];
 309 
 310     cores = pcmk__procfs_num_cores();
 311     if(throttle_cib_load(&load)) {
 312         float cib_max_cpu = 0.95;
 313 
 314         /* The CIB is a single-threaded task and thus cannot consume
 315          * more than 100% of a CPU (and 1/cores of the overall system
 316          * load).
 317          *
 318          * On a many-cored system, the CIB might therefore be maxed out
 319          * (causing operations to fail or appear to fail) even though
 320          * the overall system load is still reasonable.
 321          *
 322          * Therefore, the 'normal' thresholds can not apply here, and we
 323          * need a special case.
 324          */
 325         if(cores == 1) {
 326             cib_max_cpu = 0.4;
 327         }
 328         if(throttle_load_target > 0.0 && throttle_load_target < cib_max_cpu) {
 329             cib_max_cpu = throttle_load_target;
 330         }
 331 
 332         thresholds[0] = cib_max_cpu * 0.8;
 333         thresholds[1] = cib_max_cpu * 0.9;
 334         thresholds[2] = cib_max_cpu;
 335         /* Can only happen on machines with a low number of cores */
 336         thresholds[3] = cib_max_cpu * 1.5;
 337 
 338         mode = throttle_check_thresholds(load, "CIB load", thresholds);
 339     }
 340 
 341     if(throttle_load_target <= 0) {
 342         /* If we ever make this a valid value, the cluster will at least behave as expected */
 343         return mode;
 344     }
 345 
 346     if(throttle_load_avg(&load)) {
 347         enum throttle_state_e cpu_load;
 348 
 349         cpu_load = throttle_handle_load(load, "CPU load", cores);
 350         if (cpu_load > mode) {
 351             mode = cpu_load;
 352         }
 353         crm_debug("Current load is %f across %u core(s)", load, cores);
 354     }
 355 #endif // HAVE_LINUX_PROCFS
 356     return mode;
 357 }
 358 
 359 static void
 360 throttle_send_command(enum throttle_state_e mode)
     /* [previous][next][first][last][top][bottom][index][help] */
 361 {
 362     xmlNode *xml = NULL;
 363     static enum throttle_state_e last = -1;
 364 
 365     if(mode != last) {
 366         crm_info("New throttle mode: %s load (was %s)",
 367                  load2str(mode), load2str(last));
 368         last = mode;
 369 
 370         xml = create_request(CRM_OP_THROTTLE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
 371         crm_xml_add_int(xml, PCMK__XA_CRM_LIMIT_MODE, mode);
 372         crm_xml_add_int(xml, PCMK__XA_CRM_LIMIT_MAX, throttle_job_max);
 373 
 374         pcmk__cluster_send_message(NULL, crm_msg_crmd, xml);
 375         free_xml(xml);
 376     }
 377 }
 378 
 379 static gboolean
 380 throttle_timer_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 381 {
 382     throttle_send_command(throttle_mode());
 383     return TRUE;
 384 }
 385 
 386 static void
 387 throttle_record_free(gpointer p)
     /* [previous][next][first][last][top][bottom][index][help] */
 388 {
 389     struct throttle_record_s *r = p;
 390     free(r->node);
 391     free(r);
 392 }
 393 
 394 static void
 395 throttle_set_load_target(float target)
     /* [previous][next][first][last][top][bottom][index][help] */
 396 {
 397     throttle_load_target = target;
 398 }
 399 
 400 /*!
 401  * \internal
 402  * \brief Update the maximum number of simultaneous jobs
 403  *
 404  * \param[in] preference  Cluster-wide \c PCMK_OPT_NODE_ACTION_LIMIT from the
 405  *                        CIB
 406  */
 407 static void
 408 throttle_update_job_max(const char *preference)
     /* [previous][next][first][last][top][bottom][index][help] */
 409 {
 410     long long max = 0LL;
 411 
 412     // Per-node override
 413     const char *env_limit = pcmk__env_option(PCMK__ENV_NODE_ACTION_LIMIT);
 414 
 415     if (env_limit != NULL) {
 416         int rc = pcmk__scan_ll(env_limit, &max, 0LL);
 417 
 418         if (rc != pcmk_rc_ok) {
 419             crm_warn("Ignoring local option PCMK_" PCMK__ENV_NODE_ACTION_LIMIT
 420                      " because '%s' is not a valid value: %s",
 421                      env_limit, pcmk_rc_str(rc));
 422             env_limit = NULL;
 423         }
 424     }
 425     if (env_limit == NULL) {
 426         // Option validator should prevent invalid values
 427         CRM_LOG_ASSERT(pcmk__scan_ll(preference, &max, 0LL) == pcmk_rc_ok);
 428     }
 429 
 430     if (max > 0) {
 431         throttle_job_max = (max >= INT_MAX)? INT_MAX : (int) max;
 432     } else {
 433         // Default is based on the number of cores detected
 434         throttle_job_max = 2 * pcmk__procfs_num_cores();
 435     }
 436 }
 437 
 438 void
 439 throttle_init(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 440 {
 441     if(throttle_records == NULL) {
 442         throttle_records = pcmk__strkey_table(NULL, throttle_record_free);
 443         throttle_timer = mainloop_timer_add("throttle", 30 * 1000, TRUE, throttle_timer_cb, NULL);
 444     }
 445 
 446     throttle_update_job_max(NULL);
 447     mainloop_timer_start(throttle_timer);
 448 }
 449 
 450 /*!
 451  * \internal
 452  * \brief Configure throttle options based on the CIB
 453  *
 454  * \param[in,out] options  Name/value pairs for configured options
 455  */
 456 void
 457 controld_configure_throttle(GHashTable *options)
     /* [previous][next][first][last][top][bottom][index][help] */
 458 {
 459     const char *value = g_hash_table_lookup(options, PCMK_OPT_LOAD_THRESHOLD);
 460 
 461     if (value != NULL) {
 462         throttle_set_load_target(strtof(value, NULL) / 100.0);
 463     }
 464 
 465     value = g_hash_table_lookup(options, PCMK_OPT_NODE_ACTION_LIMIT);
 466     throttle_update_job_max(value);
 467 }
 468 
 469 void
 470 throttle_fini(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 471 {
 472     if (throttle_timer != NULL) {
 473         mainloop_timer_del(throttle_timer);
 474         throttle_timer = NULL;
 475     }
 476     if (throttle_records != NULL) {
 477         g_hash_table_destroy(throttle_records);
 478         throttle_records = NULL;
 479     }
 480 }
 481 
 482 int
 483 throttle_get_total_job_limit(int l)
     /* [previous][next][first][last][top][bottom][index][help] */
 484 {
 485     /* Cluster-wide limit */
 486     GHashTableIter iter;
 487     int limit = l;
 488     int peers = pcmk__cluster_num_active_nodes();
 489     struct throttle_record_s *r = NULL;
 490 
 491     g_hash_table_iter_init(&iter, throttle_records);
 492 
 493     while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &r)) {
 494         switch(r->mode) {
 495 
 496             case throttle_extreme:
 497                 if(limit == 0 || limit > peers/4) {
 498                     limit = QB_MAX(1, peers/4);
 499                 }
 500                 break;
 501 
 502             case throttle_high:
 503                 if(limit == 0 || limit > peers/2) {
 504                     limit = QB_MAX(1, peers/2);
 505                 }
 506                 break;
 507             default:
 508                 break;
 509         }
 510     }
 511     if(limit == l) {
 512 
 513     } else if(l == 0) {
 514         crm_trace("Using " PCMK_OPT_BATCH_LIMIT "=%d", limit);
 515 
 516     } else {
 517         crm_trace("Using " PCMK_OPT_BATCH_LIMIT "=%d instead of %d", limit, l);
 518     }
 519     return limit;
 520 }
 521 
 522 int
 523 throttle_get_job_limit(const char *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 524 {
 525     int jobs = 1;
 526     struct throttle_record_s *r = NULL;
 527 
 528     r = g_hash_table_lookup(throttle_records, node);
 529     if(r == NULL) {
 530         r = pcmk__assert_alloc(1, sizeof(struct throttle_record_s));
 531         r->node = pcmk__str_copy(node);
 532         r->mode = throttle_low;
 533         r->max = throttle_job_max;
 534         crm_trace("Defaulting to local values for unknown node %s", node);
 535 
 536         g_hash_table_insert(throttle_records, r->node, r);
 537     }
 538 
 539     switch(r->mode) {
 540         case throttle_extreme:
 541         case throttle_high:
 542             jobs = 1; /* At least one job must always be allowed */
 543             break;
 544         case throttle_med:
 545             jobs = QB_MAX(1, r->max / 4);
 546             break;
 547         case throttle_low:
 548             jobs = QB_MAX(1, r->max / 2);
 549             break;
 550         case throttle_none:
 551             jobs = QB_MAX(1, r->max);
 552             break;
 553         default:
 554             crm_err("Unknown throttle mode %.4x on %s", r->mode, node);
 555             break;
 556     }
 557     return jobs;
 558 }
 559 
 560 void
 561 throttle_update(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
 562 {
 563     int max = 0;
 564     int mode = 0;
 565     struct throttle_record_s *r = NULL;
 566     const char *from = crm_element_value(xml, PCMK__XA_SRC);
 567 
 568     crm_element_value_int(xml, PCMK__XA_CRM_LIMIT_MODE, &mode);
 569     crm_element_value_int(xml, PCMK__XA_CRM_LIMIT_MAX, &max);
 570 
 571     r = g_hash_table_lookup(throttle_records, from);
 572 
 573     if(r == NULL) {
 574         r = pcmk__assert_alloc(1, sizeof(struct throttle_record_s));
 575         r->node = pcmk__str_copy(from);
 576         g_hash_table_insert(throttle_records, r->node, r);
 577     }
 578 
 579     r->max = max;
 580     r->mode = (enum throttle_state_e) mode;
 581 
 582     crm_debug("Node %s has %s load and supports at most %d jobs; new job limit %d",
 583               from, load2str((enum throttle_state_e) mode), max,
 584               throttle_get_job_limit(from));
 585 }

/* [previous][next][first][last][top][bottom][index][help] */