-
Notifications
You must be signed in to change notification settings - Fork 13
Refine the doubling logic #418 #420
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: MOODLE_403_STABLE
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -28,21 +28,18 @@ | |
| * @copyright 2022, Catalyst IT | ||
| * @license http://www.gnu.org/copyleft/gpl.html GNU GPL v3 or later | ||
| */ | ||
| class cron_processor implements processor { | ||
| class cron_processor extends processor { | ||
| /** @var float Timestamp updated after processing each sample */ | ||
| public $sampletime; | ||
|
|
||
| /** @var sample_set A sample set recorded while processing a task */ | ||
| public $tasksampleset = null; | ||
|
|
||
| /** @var sample_set A sample set for memory usage recorded while processing a task */ | ||
| protected $memoryusagesampleset; | ||
|
|
||
| /** @var int */ | ||
| protected $samplems; | ||
|
|
||
| /** @var bool */ | ||
| protected static $alreadyprofiling = false; | ||
| /** | ||
| * Construct the web processor. | ||
| */ | ||
| public function __construct() { | ||
| // Preload config values to avoid DB access during processing. See manager::get_altconnection() for more information. | ||
| parent::__construct(); | ||
| $this->minduration = (float) get_config('tool_excimer', 'task_min_duration'); | ||
| } | ||
|
|
||
| /** | ||
| * Initialises the processor | ||
|
|
@@ -52,18 +49,14 @@ class cron_processor implements processor { | |
| public function init(manager $manager) { | ||
| $this->sampletime = $manager->get_starttime(); | ||
|
|
||
| $manager->get_timer()->setCallback(function () use ($manager) { | ||
| $this->on_interval($manager); | ||
| }); | ||
|
|
||
| // Preload config values to avoid DB access during processing. See manager::get_altconnection() for more information. | ||
| $this->samplems = (int) get_config('tool_excimer', 'sample_ms'); | ||
| // The callback is triggered when the number of samples reach the maxsamples and when profiler is destroyed. | ||
| $manager->get_profiler()->setFlushCallback(function ($log) use ($manager) { | ||
| $this->on_interval($log, $manager); | ||
| }, $this->maxsamples); | ||
|
|
||
| \core_shutdown_manager::register_function( | ||
| function () use ($manager) { | ||
| $manager->get_timer()->stop(); | ||
| $manager->get_profiler()->stop(); | ||
| $this->on_interval($manager, true); | ||
| if ($this->tasksampleset) { | ||
| $this->process($manager, microtime(true)); | ||
| } | ||
|
|
@@ -72,13 +65,21 @@ function () use ($manager) { | |
| } | ||
|
|
||
| /** | ||
| * Gets the minimum duration required for a profile to be saved, as seconds. | ||
| * | ||
| * @return float | ||
| * @throws \dml_exception | ||
| * Get sampling period of processor | ||
| * (used for testing) | ||
| * @return int sampling period | ||
| */ | ||
| public function get_min_duration(): float { | ||
| return (float) get_config('tool_excimer', 'task_min_duration'); | ||
| public function get_sampling_period() { | ||
| return $this->samplingperiod; | ||
| } | ||
|
|
||
| /** | ||
| * Get sample limit of processor | ||
| * (used for testing) | ||
| * @return int sample_limit | ||
| */ | ||
| public function get_sample_limit() { | ||
| return $this->samplelimit; | ||
| } | ||
|
|
||
| /** | ||
|
|
@@ -92,65 +93,71 @@ public function get_min_duration(): float { | |
| * | ||
| * We then check for a new task with the current sample. | ||
| * | ||
| * @param \ExcimerLog $log | ||
| * @param manager $manager | ||
| * @param bool $isfinal Set to true when this is called during shutdown. | ||
| */ | ||
| public function on_interval(manager $manager, bool $isfinal = false) { | ||
| public function on_interval($log, manager $manager) { | ||
| // We want to prevent doubling up of processing, so skip if an existing process is still executing. | ||
| // The profile logs will be kept and processed the next time. | ||
| self::$logs[] = $log; | ||
| $this->logcount += $log->count(); | ||
| // Doubling sampling period if it reaches the limit. | ||
| if ($this->logcount >= $this->samplelimit) { | ||
| $this->on_reach_limit($manager); | ||
| $this->logcount = $this->logcount - $this->samplelimit; | ||
| } | ||
| if (self::$alreadyprofiling) { | ||
| debugging('tool_excimer: starting cron_processor::on_interval when previous call has not yet finished'); | ||
| if ($isfinal) { | ||
| // The final flush call when profiler is destroyed. | ||
| if ($log->count() < $this->samplelimit) { | ||
| // This should never happen. | ||
| debugging('tool_excimer: alreadyprofiling is true during final on_interval.'); | ||
| } | ||
| return; | ||
| } | ||
| self::$alreadyprofiling = true; | ||
|
Comment on lines
100
to
118
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. The logic to prevent double up of processing may not be needed anymore now that we've switched from timers to flush callbacks (I'm not sure if this is paused during the callbacks). I think it's worth looking into further because it would be great if we could remove $self::logs entirely.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. At this stage, the exact underlying scenario isn’t clear because the documentation doesn’t cover this case. However, this likely happens when maxSamples is set too low, causing the process to continue running while callbacks are triggered repeatedly. In this scenario, self::logs can be used for stored new coming samples.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Previously there were edge cases of the 10s interval overlapping, see #377 With your changes these should be much rarer as in the worst case the number of possible samples in on_interval should be very low. I'm not even sure if the profiler would be taking samples here. I'm interested in what happens when max samples is set too low. The previous issue this fixed shouldn't be a concern if it only happens with bad config, but is there a chance of endless loops if this is set to something ridiculous like 1?
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, it should be possible, because we don’t have any constraint on the maxSamples value. That’s why I introduced the $logs array as a waiting list. New logs from the callback are collected in this array until the current process completes. I’m not sure how often overlaps will occur with the new logic, but $logs won’t impact the existing flow. On the contrary, it helps mitigate overlapping if it does occur. |
||
| foreach (self::$logs as $log) { | ||
| $memoryusage = memory_get_usage(); | ||
bwalkerl marked this conversation as resolved.
Show resolved
Hide resolved
|
||
| foreach ($log as $sample) { | ||
| $taskname = $this->findtaskname($sample); | ||
| $sampletime = $manager->get_starttime() + $sample->getTimestamp(); | ||
|
|
||
| // If there is a task and the current task name is different from the previous, then store the profile. | ||
| if ($this->tasksampleset && ($this->tasksampleset->name != $taskname)) { | ||
| $profile = $this->process($manager, $this->sampletime); | ||
| // Keep an approximate count of each profile. | ||
| page_group::record_fuzzy_counts($profile); | ||
| $this->tasksampleset = null; | ||
| } | ||
|
Comment on lines
+122
to
+131
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think detecting new tasks like this is incompatible with the increased sample periods. Because of this, new tasks will start off with the increased sample periods from the previous tasks and likely be missing the most important parts of the profile. This is likely the biggest issues here and I'm not sure the best way of handling this.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. In my view, the sampling period isn’t tied to a specific task. It’s increased to reduce impact on site performance, as stated. So even if there’s a new task, if the site is very busy, a higher sampling period is still reasonable. Regarding missing information: if a task is lightweight and completes before the next sampling interval, I think that’s acceptable. Our focus is on bottlenecks (heavy tasks). Lightweight tasks are also filtered out, at least at the stack trace level, by the merge logic.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. This is a serious regression, and is much more broken than you think. Multiple cron tasks are processed in a single call. The sampling period is never reset. A task that starts several hours in would have a ridiculously high sampling period and not contain the samples required to debug it properly. Even if you reset that when a new task is detected, the gap of processing at higher intervals is likely to cause issues following long tasks. On top of this going from a higher sampling period to a lower sampling period in the middle of the task would significantly distort the display and misrepresent parts of the task.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think we need specific cases to evaluate this. It’s too abstract for me to visualize what’s happening behind the scenes. Based on what you’re saying, it sounds like we should not be updating the sampling period.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Yes, I feel like this is the main reason cron was processed on interval before. This did come with the drawback of taking extra samples and then discarding them based on the filter it, so there's still value in swapping if we can handle this somehow.
Author
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I’m happy to remove the sampling-period doubling and keep the flush callback, but I can’t justify the mismatch between what we are trying to solve and what we’re doing. At this point, the sample limit is primarily to prevent excessive log storage, rather than to address sampling spam during long-running tasks. |
||
|
|
||
| $profiler = $manager->get_profiler(); | ||
| $log = $profiler->flush(); | ||
| $memoryusage = memory_get_usage(); // Record and set initial memory usage at this point. | ||
| foreach ($log as $sample) { | ||
| $taskname = $this->findtaskname($sample); | ||
| $sampletime = $manager->get_starttime() + $sample->getTimestamp(); | ||
|
|
||
| // If there is a task and the current task name is different from the previous, then store the profile. | ||
| if ($this->tasksampleset && ($this->tasksampleset->name != $taskname)) { | ||
| $profile = $this->process($manager, $this->sampletime); | ||
|
|
||
| // Keep an approximate count of each profile. | ||
| page_group::record_fuzzy_counts($profile); | ||
| $this->tasksampleset = null; | ||
| } | ||
|
|
||
| // If there exists a current task, and the sampleset for it is not created yet, create it. | ||
| if ($taskname && ($this->tasksampleset === null)) { | ||
| $this->tasksampleset = new sample_set($taskname, $this->sampletime); | ||
| $this->memoryusagesampleset = new sample_set($taskname, $this->sampletime); | ||
| if ($memoryusage) { // Ensure this only adds the mem usage for the initial base sample due to accuracy. | ||
| $this->memoryusagesampleset->add_sample(['sampleindex' => 0, 'value' => $memoryusage]); | ||
| $memoryusage = 0; | ||
| // If there exists a current task, and the sampleset for it is not created yet, create it. | ||
| if ($taskname && ($this->tasksampleset === null)) { | ||
| $this->tasksampleset = new sample_set($taskname, $this->sampletime); | ||
| $this->memoryusagesampleset = new sample_set($taskname, $this->sampletime); | ||
| if ($memoryusage) { // Ensure this only adds the mem usage for the initial base sample due to accuracy. | ||
| $this->memoryusagesampleset->add_sample(['sampleindex' => 0, 'value' => $memoryusage]); | ||
| $memoryusage = 0; | ||
| } | ||
| } | ||
| } | ||
|
|
||
| // If the sampleset exists, add the current sample to it. | ||
| if ($this->tasksampleset) { | ||
| $this->tasksampleset->add_sample($sample); | ||
| // If the sampleset exists, add the current sample to it. | ||
| if ($this->tasksampleset) { | ||
| $this->tasksampleset->add_sample($sample); | ||
|
|
||
| // Add memory usage: | ||
| // Note that due to the looping this is probably inaccurate. | ||
| $this->memoryusagesampleset->add_sample([ | ||
| 'sampleindex' => $this->tasksampleset->total_added() - 1, | ||
| 'value' => memory_get_usage(), | ||
| ]); | ||
| } | ||
|
|
||
| // Add memory usage: | ||
| // Note that due to the looping this is probably inaccurate. | ||
| $this->memoryusagesampleset->add_sample([ | ||
| 'sampleindex' => $this->tasksampleset->total_added() + $this->memoryusagesampleset->count() - 1, | ||
| 'value' => memory_get_usage(), | ||
| ]); | ||
| // Instances of task_sample are always created with the previous sample's timestamp. | ||
| // So it needs to be saved each loop. | ||
| $this->sampletime = $sampletime; | ||
| } | ||
|
|
||
| // Instances of task_sample are always created with the previous sample's timestamp. | ||
| // So it needs to be saved each loop. | ||
| $this->sampletime = $sampletime; | ||
| } | ||
|
|
||
| self::$logs = []; | ||
| self::$alreadyprofiling = false; | ||
| } | ||
|
|
||
|
|
@@ -196,9 +203,10 @@ public function process(manager $manager, float $finishtime): profile { | |
| $profile->set('reason', $reasons); | ||
| $profile->set('finished', (int) $finishtime); | ||
| $profile->set('memoryusagedatad3', $this->memoryusagesampleset->samples); | ||
| $profile->set('flamedatad3', flamed3_node::from_excimer_log_entries($this->tasksampleset->samples)); | ||
| $profile->set('numsamples', $this->tasksampleset->count()); | ||
| $profile->set('samplerate', $this->tasksampleset->filter_rate() * $this->samplems); | ||
| $profile->set('flamedatad3', flamed3_node::from_sample_set_samples($this->tasksampleset->samples)); | ||
| $profile->set('numsamples', count($this->tasksampleset->samples)); | ||
| $profile->set('numevents', $this->tasksampleset->count()); | ||
| $profile->set('samplerate', (int) (($duration * 1000) / $this->tasksampleset->count())); | ||
| $profile->save_record(); | ||
| } | ||
| return $profile; | ||
|
|
||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -33,18 +33,62 @@ | |
| * | ||
| * @package tool_excimer | ||
| */ | ||
| interface processor { | ||
| class processor { | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'd recommend making this an abstract class and keeping init as an abstract method |
||
| /** @var int */ | ||
| protected $minduration; | ||
|
|
||
| /** @var sample_set */ | ||
| public $tasksampleset; | ||
|
|
||
| /** @var sample_set */ | ||
| public $memoryusagesampleset; | ||
|
|
||
| /** @var int */ | ||
| protected $samplingperiod; | ||
|
|
||
| /** @var int */ | ||
| protected $samplelimit; | ||
|
|
||
| /** @var int */ | ||
| protected $maxsamples; | ||
|
|
||
| /** @var int */ | ||
| protected $logcount = 0; | ||
|
|
||
| /** @var bool */ | ||
| protected static $alreadyprofiling = false; | ||
|
|
||
| /** @var array */ | ||
| protected static $logs = []; | ||
|
|
||
| /** | ||
| * Construct the processor. | ||
| */ | ||
| public function __construct() { | ||
| // Preload config values to avoid DB access during processing. See manager::get_altconnection() for more information. | ||
| $this->samplingperiod = script_metadata::get_sampling_period(); | ||
| $this->samplelimit = script_metadata::get_sample_limit(); | ||
| $this->maxsamples = script_metadata::get_max_samples(); | ||
| } | ||
|
|
||
| /** | ||
| * Initialises the processor | ||
| * Doubling the sampling period when we reach the samples limit | ||
| * | ||
| * @param manager $manager The profiler manager object | ||
| * @param manager $manager | ||
| */ | ||
| public function init(manager $manager); | ||
| public function on_reach_limit(manager $manager) { | ||
| $this->samplingperiod *= 2; | ||
| // This will take effect the next time start() is called. | ||
| $manager->get_profiler()->setPeriod($this->samplingperiod); | ||
| $manager->get_profiler()->start(); | ||
| } | ||
|
|
||
| /** | ||
| * Gets the minimum duration required for a profile to be saved, as seconds. | ||
| * | ||
| * @return float | ||
| */ | ||
| public function get_min_duration(): float; | ||
| public function get_min_duration(): float { | ||
| return $this->minduration; | ||
| } | ||
| } | ||
Uh oh!
There was an error while loading. Please reload this page.