root/crmd/throttle.c

/* [previous][next][first][last][top][bottom][index][help] */

DEFINITIONS

This source file includes the following definitions.
  1. find_cib_loadfile
  2. throttle_cib_load
  3. throttle_load_avg
  4. throttle_check_thresholds
  5. throttle_handle_load
  6. throttle_mode
  7. throttle_send_command
  8. throttle_timer_cb
  9. throttle_record_free
  10. throttle_set_load_target
  11. throttle_update_job_max
  12. throttle_init
  13. throttle_fini
  14. throttle_get_total_job_limit
  15. throttle_get_job_limit
  16. throttle_update

   1 /*
   2  * Copyright (C) 2013 Andrew Beekhof <andrew@beekhof.net>
   3  *
   4  * This program is free software; you can redistribute it and/or
   5  * modify it under the terms of the GNU General Public
   6  * License as published by the Free Software Foundation; either
   7  * version 2 of the License, or (at your option) any later version.
   8  *
   9  * This software is distributed in the hope that it will be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
  12  * General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public
  15  * License along with this library; if not, write to the Free Software
  16  * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 
  19 #include <crm_internal.h>
  20 
  21 #include <sys/types.h>
  22 #include <sys/stat.h>
  23 
  24 #include <unistd.h>
  25 #include <ctype.h>
  26 #include <dirent.h>
  27 
  28 #include <crm/crm.h>
  29 #include <crm/msg_xml.h>
  30 #include <crm/cluster.h>
  31 
  32 #include <crmd_fsa.h>
  33 #include <throttle.h>
  34 
  35 
  36 enum throttle_state_e {
  37     throttle_extreme = 0x1000,
  38     throttle_high = 0x0100,
  39     throttle_med  = 0x0010,
  40     throttle_low  = 0x0001,
  41     throttle_none = 0x0000,
  42 };
  43 
  44 struct throttle_record_s {
  45     int max;
  46     enum throttle_state_e mode;
  47     char *node;
  48 };
  49 
  50 static int throttle_job_max = 0;
  51 static float throttle_load_target = 0.0;
  52 
  53 #define THROTTLE_FACTOR_LOW    1.2
  54 #define THROTTLE_FACTOR_MEDIUM 1.6
  55 #define THROTTLE_FACTOR_HIGH   2.0
  56 
  57 static GHashTable *throttle_records = NULL;
  58 static mainloop_timer_t *throttle_timer = NULL;
  59 
  60 /*!
  61  * \internal
  62  * \brief Return name of /proc file containing the CIB deamon's load statistics
  63  *
  64  * \return Newly allocated memory with file name on success, NULL otherwise
  65  *
  66  * \note It is the caller's responsibility to free the return value.
  67  *       This will return NULL if the daemon is being run via valgrind.
  68  *       This should be called only on Linux systems.
  69  */
  70 static char *
  71 find_cib_loadfile(void)
     /* [previous][next][first][last][top][bottom][index][help] */
  72 {
  73     int pid = crm_procfs_pid_of("cib");
  74 
  75     return pid? crm_strdup_printf("/proc/%d/stat", pid) : NULL;
  76 }
  77 
  78 static bool
  79 throttle_cib_load(float *load)
     /* [previous][next][first][last][top][bottom][index][help] */
  80 {
  81 /*
  82        /proc/[pid]/stat
  83               Status information about the process.  This is used by ps(1).  It is defined in /usr/src/linux/fs/proc/array.c.
  84 
  85               The fields, in order, with their proper scanf(3) format specifiers, are:
  86 
  87               pid %d      (1) The process ID.
  88 
  89               comm %s     (2) The filename of the executable, in parentheses.  This is visible whether or not the executable is swapped out.
  90 
  91               state %c    (3) One character from the string "RSDZTW" where R is running, S is sleeping in an interruptible wait, D is waiting in uninterruptible disk sleep, Z is zombie, T is traced or stopped (on a signal), and W is paging.
  92 
  93               ppid %d     (4) The PID of the parent.
  94 
  95               pgrp %d     (5) The process group ID of the process.
  96 
  97               session %d  (6) The session ID of the process.
  98 
  99               tty_nr %d   (7) The controlling terminal of the process.  (The minor device number is contained in the combination of bits 31 to 20 and 7 to 0; the major device number is in bits 15 to 8.)
 100 
 101               tpgid %d    (8) The ID of the foreground process group of the controlling terminal of the process.
 102 
 103               flags %u (%lu before Linux 2.6.22)
 104                           (9) The kernel flags word of the process.  For bit meanings, see the PF_* defines in the Linux kernel source file include/linux/sched.h.  Details depend on the kernel version.
 105 
 106               minflt %lu  (10) The number of minor faults the process has made which have not required loading a memory page from disk.
 107 
 108               cminflt %lu (11) The number of minor faults that the process's waited-for children have made.
 109 
 110               majflt %lu  (12) The number of major faults the process has made which have required loading a memory page from disk.
 111 
 112               cmajflt %lu (13) The number of major faults that the process's waited-for children have made.
 113 
 114               utime %lu   (14) Amount of time that this process has been scheduled in user mode, measured in clock ticks (divide by sysconf(_SC_CLK_TCK)).  This includes guest time, guest_time (time spent running a virtual CPU, see below), so that applications that are not aware of the guest time field do not lose that time from their calculations.
 115 
 116               stime %lu   (15) Amount of time that this process has been scheduled in kernel mode, measured in clock ticks (divide by sysconf(_SC_CLK_TCK)).
 117  */
 118 
 119     static char *loadfile = NULL;
 120     static time_t last_call = 0;
 121     static long ticks_per_s = 0;
 122     static unsigned long last_utime, last_stime;
 123 
 124     char buffer[64*1024];
 125     FILE *stream = NULL;
 126     time_t now = time(NULL);
 127 
 128     if(load == NULL) {
 129         return FALSE;
 130     } else {
 131         *load = 0.0;
 132     }
 133 
 134     if(loadfile == NULL) {
 135         last_call = 0;
 136         last_utime = 0;
 137         last_stime = 0;
 138         loadfile = find_cib_loadfile();
 139         if (loadfile == NULL) {
 140             crm_warn("Couldn't find CIB load file");
 141             return FALSE;
 142         }
 143         ticks_per_s = sysconf(_SC_CLK_TCK);
 144         crm_trace("Found %s", loadfile);
 145     }
 146 
 147     stream = fopen(loadfile, "r");
 148     if(stream == NULL) {
 149         int rc = errno;
 150 
 151         crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc);
 152         free(loadfile); loadfile = NULL;
 153         return FALSE;
 154     }
 155 
 156     if(fgets(buffer, sizeof(buffer), stream)) {
 157         char *comm = calloc(1, 256);
 158         char state = 0;
 159         int rc = 0, pid = 0, ppid = 0, pgrp = 0, session = 0, tty_nr = 0, tpgid = 0;
 160         unsigned long flags = 0, minflt = 0, cminflt = 0, majflt = 0, cmajflt = 0, utime = 0, stime = 0;
 161 
 162         rc = sscanf(buffer,  "%d %[^ ] %c %d %d %d %d %d %lu %lu %lu %lu %lu %lu %lu",
 163                     &pid, comm, &state,
 164                     &ppid, &pgrp, &session, &tty_nr, &tpgid,
 165                     &flags, &minflt, &cminflt, &majflt, &cmajflt, &utime, &stime);
 166         free(comm);
 167 
 168         if(rc != 15) {
 169             crm_err("Only %d of 15 fields found in %s", rc, loadfile);
 170             fclose(stream);
 171             return FALSE;
 172 
 173         } else if(last_call > 0
 174            && last_call < now
 175            && last_utime <= utime
 176            && last_stime <= stime) {
 177 
 178             time_t elapsed = now - last_call;
 179             unsigned long delta_utime = utime - last_utime;
 180             unsigned long delta_stime = stime - last_stime;
 181 
 182             *load = (delta_utime + delta_stime); /* Cast to a float before division */
 183             *load /= ticks_per_s;
 184             *load /= elapsed;
 185             crm_debug("cib load: %f (%lu ticks in %lds)", *load, delta_utime + delta_stime, (long)elapsed);
 186 
 187         } else {
 188             crm_debug("Init %lu + %lu ticks at %ld (%lu tps)", utime, stime, (long)now, ticks_per_s);
 189         }
 190 
 191         last_call = now;
 192         last_utime = utime;
 193         last_stime = stime;
 194 
 195         fclose(stream);
 196         return TRUE;
 197     }
 198 
 199     fclose(stream);
 200     return FALSE;
 201 }
 202 
 203 static bool
 204 throttle_load_avg(float *load)
     /* [previous][next][first][last][top][bottom][index][help] */
 205 {
 206     char buffer[256];
 207     FILE *stream = NULL;
 208     const char *loadfile = "/proc/loadavg";
 209 
 210     if(load == NULL) {
 211         return FALSE;
 212     }
 213 
 214     stream = fopen(loadfile, "r");
 215     if(stream == NULL) {
 216         int rc = errno;
 217         crm_warn("Couldn't read %s: %s (%d)", loadfile, pcmk_strerror(rc), rc);
 218         return FALSE;
 219     }
 220 
 221     if(fgets(buffer, sizeof(buffer), stream)) {
 222         char *nl = strstr(buffer, "\n");
 223 
 224         /* Grab the 1-minute average, ignore the rest */
 225         *load = strtof(buffer, NULL);
 226         if(nl) { nl[0] = 0; }
 227 
 228         fclose(stream);
 229         return TRUE;
 230     }
 231 
 232     fclose(stream);
 233     return FALSE;
 234 }
 235 
 236 /*!
 237  * \internal
 238  * \brief Check a load value against throttling thresholds
 239  *
 240  * \param[in] load        Load value to check
 241  * \param[in] desc        Description of metric (for logging)
 242  * \param[in] thresholds  Low/medium/high/extreme thresholds
 243  *
 244  * \return Throttle mode corresponding to load value
 245  */
 246 static enum throttle_state_e
 247 throttle_check_thresholds(float load, const char *desc, float thresholds[4])
     /* [previous][next][first][last][top][bottom][index][help] */
 248 {
 249     if (load > thresholds[3]) {
 250         crm_notice("Extreme %s detected: %f", desc, load);
 251         return throttle_extreme;
 252 
 253     } else if (load > thresholds[2]) {
 254         crm_notice("High %s detected: %f", desc, load);
 255         return throttle_high;
 256 
 257     } else if (load > thresholds[1]) {
 258         crm_info("Moderate %s detected: %f", desc, load);
 259         return throttle_med;
 260 
 261     } else if (load > thresholds[0]) {
 262         crm_debug("Noticeable %s detected: %f", desc, load);
 263         return throttle_low;
 264     }
 265 
 266     crm_trace("Negligible %s detected: %f", desc, load);
 267     return throttle_none;
 268 }
 269 
 270 static enum throttle_state_e
 271 throttle_handle_load(float load, const char *desc, int cores)
     /* [previous][next][first][last][top][bottom][index][help] */
 272 {
 273     float normalize;
 274     float thresholds[4];
 275 
 276     if (cores == 1) {
 277         /* On a single core machine, a load of 1.0 is already too high */
 278         normalize = 0.6;
 279 
 280     } else {
 281         /* Normalize the load to be per-core */
 282         normalize = cores;
 283     }
 284     thresholds[0] = throttle_load_target * normalize * THROTTLE_FACTOR_LOW;
 285     thresholds[1] = throttle_load_target * normalize * THROTTLE_FACTOR_MEDIUM;
 286     thresholds[2] = throttle_load_target * normalize * THROTTLE_FACTOR_HIGH;
 287     thresholds[3] = load + 1.0; /* never extreme */
 288 
 289     return throttle_check_thresholds(load, desc, thresholds);
 290 }
 291 
 292 static enum throttle_state_e
 293 throttle_mode(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 294 {
 295     unsigned int cores;
 296     float load;
 297     float thresholds[4];
 298     enum throttle_state_e mode = throttle_none;
 299 
 300 #if defined(ON_BSD) || defined(ON_SOLARIS)
 301     return throttle_none;
 302 #endif
 303 
 304     cores = crm_procfs_num_cores();
 305     if(throttle_cib_load(&load)) {
 306         float cib_max_cpu = 0.95;
 307 
 308         /* The CIB is a single-threaded task and thus cannot consume
 309          * more than 100% of a CPU (and 1/cores of the overall system
 310          * load).
 311          *
 312          * On a many-cored system, the CIB might therefore be maxed out
 313          * (causing operations to fail or appear to fail) even though
 314          * the overall system load is still reasonable.
 315          *
 316          * Therefore, the 'normal' thresholds can not apply here, and we
 317          * need a special case.
 318          */
 319         if(cores == 1) {
 320             cib_max_cpu = 0.4;
 321         }
 322         if(throttle_load_target > 0.0 && throttle_load_target < cib_max_cpu) {
 323             cib_max_cpu = throttle_load_target;
 324         }
 325 
 326         thresholds[0] = cib_max_cpu * 0.8;
 327         thresholds[1] = cib_max_cpu * 0.9;
 328         thresholds[2] = cib_max_cpu;
 329         /* Can only happen on machines with a low number of cores */
 330         thresholds[3] = cib_max_cpu * 1.5;
 331 
 332         mode |= throttle_check_thresholds(load, "CIB load", thresholds);
 333     }
 334 
 335     if(throttle_load_target <= 0) {
 336         /* If we ever make this a valid value, the cluster will at least behave as expected */
 337         return mode;
 338     }
 339 
 340     if(throttle_load_avg(&load)) {
 341         crm_debug("Current load is %f across %u core(s)", load, cores);
 342         mode |= throttle_handle_load(load, "CPU load", cores);
 343     }
 344 
 345     if(mode & throttle_extreme) {
 346         return throttle_extreme;
 347     } else if(mode & throttle_high) {
 348         return throttle_high;
 349     } else if(mode & throttle_med) {
 350         return throttle_med;
 351     } else if(mode & throttle_low) {
 352         return throttle_low;
 353     }
 354     return throttle_none;
 355 }
 356 
 357 static void
 358 throttle_send_command(enum throttle_state_e mode)
     /* [previous][next][first][last][top][bottom][index][help] */
 359 {
 360     xmlNode *xml = NULL;
 361     static enum throttle_state_e last = -1;
 362 
 363     if(mode != last) {
 364         crm_info("New throttle mode: %.4x (was %.4x)", mode, last);
 365         last = mode;
 366 
 367         xml = create_request(CRM_OP_THROTTLE, NULL, NULL, CRM_SYSTEM_CRMD, CRM_SYSTEM_CRMD, NULL);
 368         crm_xml_add_int(xml, F_CRM_THROTTLE_MODE, mode);
 369         crm_xml_add_int(xml, F_CRM_THROTTLE_MAX, throttle_job_max);
 370 
 371         send_cluster_message(NULL, crm_msg_crmd, xml, TRUE);
 372         free_xml(xml);
 373     }
 374 }
 375 
 376 static gboolean
 377 throttle_timer_cb(gpointer data)
     /* [previous][next][first][last][top][bottom][index][help] */
 378 {
 379     static bool send_updates = FALSE;
 380     enum throttle_state_e now = throttle_none;
 381 
 382     if(send_updates) {
 383         now = throttle_mode();
 384         throttle_send_command(now);
 385 
 386     } else if(compare_version(fsa_our_dc_version, "3.0.8") < 0) {
 387         /* Optimize for the true case */
 388         crm_trace("DC version %s doesn't support throttling", fsa_our_dc_version);
 389 
 390     } else {
 391         send_updates = TRUE;
 392         now = throttle_mode();
 393         throttle_send_command(now);
 394     }
 395 
 396     return TRUE;
 397 }
 398 
 399 static void
 400 throttle_record_free(gpointer p)
     /* [previous][next][first][last][top][bottom][index][help] */
 401 {
 402     struct throttle_record_s *r = p;
 403     free(r->node);
 404     free(r);
 405 }
 406 
 407 void
 408 throttle_set_load_target(float target)
     /* [previous][next][first][last][top][bottom][index][help] */
 409 {
 410     throttle_load_target = target;
 411 }
 412 
 413 void
 414 throttle_update_job_max(const char *preference)
     /* [previous][next][first][last][top][bottom][index][help] */
 415 {
 416     int max = 0;
 417 
 418     throttle_job_max = 2 * crm_procfs_num_cores();
 419 
 420     if(preference) {
 421         /* Global preference from the CIB */
 422         max = crm_int_helper(preference, NULL);
 423         if(max > 0) {
 424             throttle_job_max = max;
 425         }
 426     }
 427 
 428     preference = getenv("LRMD_MAX_CHILDREN");
 429     if(preference) {
 430         /* Legacy env variable */
 431         max = crm_int_helper(preference, NULL);
 432         if(max > 0) {
 433             throttle_job_max = max;
 434         }
 435     }
 436 
 437     preference = getenv("PCMK_node_action_limit");
 438     if(preference) {
 439         /* Per-node override */
 440         max = crm_int_helper(preference, NULL);
 441         if(max > 0) {
 442             throttle_job_max = max;
 443         }
 444     }
 445 }
 446 
 447 void
 448 throttle_init(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 449 {
 450     if(throttle_records == NULL) {
 451         throttle_records = g_hash_table_new_full(
 452             crm_str_hash, g_str_equal, NULL, throttle_record_free);
 453         throttle_timer = mainloop_timer_add("throttle", 30 * 1000, TRUE, throttle_timer_cb, NULL);
 454     }
 455 
 456     throttle_update_job_max(NULL);
 457     mainloop_timer_start(throttle_timer);
 458 }
 459 
 460 void
 461 throttle_fini(void)
     /* [previous][next][first][last][top][bottom][index][help] */
 462 {
 463     mainloop_timer_del(throttle_timer); throttle_timer = NULL;
 464     g_hash_table_destroy(throttle_records); throttle_records = NULL;
 465 }
 466 
 467 int
 468 throttle_get_total_job_limit(int l)
     /* [previous][next][first][last][top][bottom][index][help] */
 469 {
 470     /* Cluster-wide limit */
 471     GHashTableIter iter;
 472     int limit = l;
 473     int peers = crm_active_peers();
 474     struct throttle_record_s *r = NULL;
 475 
 476     g_hash_table_iter_init(&iter, throttle_records);
 477 
 478     while (g_hash_table_iter_next(&iter, NULL, (gpointer *) &r)) {
 479         switch(r->mode) {
 480 
 481             case throttle_extreme:
 482                 if(limit == 0 || limit > peers/4) {
 483                     limit = QB_MAX(1, peers/4);
 484                 }
 485                 break;
 486 
 487             case throttle_high:
 488                 if(limit == 0 || limit > peers/2) {
 489                     limit = QB_MAX(1, peers/2);
 490                 }
 491                 break;
 492             default:
 493                 break;
 494         }
 495     }
 496     if(limit == l) {
 497         /* crm_trace("No change to batch-limit=%d", limit); */
 498 
 499     } else if(l == 0) {
 500         crm_trace("Using batch-limit=%d", limit);
 501 
 502     } else {
 503         crm_trace("Using batch-limit=%d instead of %d", limit, l);
 504     }
 505     return limit;
 506 }
 507 
 508 int
 509 throttle_get_job_limit(const char *node)
     /* [previous][next][first][last][top][bottom][index][help] */
 510 {
 511     int jobs = 1;
 512     struct throttle_record_s *r = NULL;
 513 
 514     r = g_hash_table_lookup(throttle_records, node);
 515     if(r == NULL) {
 516         r = calloc(1, sizeof(struct throttle_record_s));
 517         r->node = strdup(node);
 518         r->mode = throttle_low;
 519         r->max = throttle_job_max;
 520         crm_trace("Defaulting to local values for unknown node %s", node);
 521 
 522         g_hash_table_insert(throttle_records, r->node, r);
 523     }
 524 
 525     switch(r->mode) {
 526         case throttle_extreme:
 527         case throttle_high:
 528             jobs = 1; /* At least one job must always be allowed */
 529             break;
 530         case throttle_med:
 531             jobs = QB_MAX(1, r->max / 4);
 532             break;
 533         case throttle_low:
 534             jobs = QB_MAX(1, r->max / 2);
 535             break;
 536         case throttle_none:
 537             jobs = QB_MAX(1, r->max);
 538             break;
 539         default:
 540             crm_err("Unknown throttle mode %.4x on %s", r->mode, node);
 541             break;
 542     }
 543     return jobs;
 544 }
 545 
 546 void
 547 throttle_update(xmlNode *xml)
     /* [previous][next][first][last][top][bottom][index][help] */
 548 {
 549     int max = 0;
 550     enum throttle_state_e mode = 0;
 551     struct throttle_record_s *r = NULL;
 552     const char *from = crm_element_value(xml, F_CRM_HOST_FROM);
 553 
 554     crm_element_value_int(xml, F_CRM_THROTTLE_MODE, (int*)&mode);
 555     crm_element_value_int(xml, F_CRM_THROTTLE_MAX, &max);
 556 
 557     r = g_hash_table_lookup(throttle_records, from);
 558 
 559     if(r == NULL) {
 560         r = calloc(1, sizeof(struct throttle_record_s));
 561         r->node = strdup(from);
 562         g_hash_table_insert(throttle_records, r->node, r);
 563     }
 564 
 565     r->max = max;
 566     r->mode = mode;
 567 
 568     crm_debug("Host %s supports a maximum of %d jobs and throttle mode %.4x.  New job limit is %d",
 569               from, max, mode, throttle_get_job_limit(from));
 570 }

/* [previous][next][first][last][top][bottom][index][help] */