diff --git a/doc/api.rst b/doc/api.rst
index 5647d15..538ceec 100644
--- a/doc/api.rst
+++ b/doc/api.rst
@@ -345,6 +345,19 @@ Benchmark class
 
       Raise an exception if the benchmark has no values.
 
+   .. method:: required_nprocesses()
+
+      Determine the number of separate process runs that would be required to
+      achieve stable results. Specifically, the target is to have 95%
+      certainty that the variation is less than 1%. If the result is greater
+      than the number of processes recorded in the input data, the value is
+      meaningless and only means "more samples are required".
+
+      The method used is described in the following Wikipedia article on
+      sample size determination, under "Estimation of a mean":
+
+      https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean
+
    .. method:: update_metadata(metadata: dict)
 
       Update metadata of all runs of the benchmark.
diff --git a/pyperf/__main__.py b/pyperf/__main__.py
index 639ffeb..572253a 100644
--- a/pyperf/__main__.py
+++ b/pyperf/__main__.py
@@ -455,7 +455,8 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False,
                              dump=dump,
                              checks=checks,
                              result=result,
-                             display_runs_args=display_runs_args)
+                             display_runs_args=display_runs_args,
+                             only_checks=only_checks)
 
         if bench_lines:
             empty_line(lines)
diff --git a/pyperf/_bench.py b/pyperf/_bench.py
index 6278e8a..5faa736 100644
--- a/pyperf/_bench.py
+++ b/pyperf/_bench.py
@@ -424,17 +424,23 @@ def median_abs_dev(self):
             raise ValueError("MAD must be >= 0")
         return value
 
-    def required_nsamples(self):
+    def required_nprocesses(self):
         """
-        Determines the number of samples that would be required to have 95%
-        certainty that the samples have a variance of less than 1%.
+        Determine the number of separate process runs that would be required
+        to achieve stable results. Specifically, the target is to have 95%
+        certainty that the variation is less than 1%. If the result is
+        greater than the number of processes recorded in the input data, the
+        value is meaningless and only means "more samples are required".
 
-        This is described in this Wikipedia article about estimating the sampling of
-        a mean:
+        The method used is described in the following Wikipedia article on
+        sample size determination, under "Estimation of a mean":
 
         https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean
         """
-        # Get the means of the values per run
+        # Get the mean of the values of each process. The values within a
+        # single process often vary considerably (e.g. due to cache effects),
+        # but the means of separate processes should be fairly consistent.
+        # This value is intended to be advice for the number of processes to run.
         values = []
         for run in self._runs:
             if len(run.values):
@@ -446,6 +452,7 @@ def required_nsamples(self):
         total = math.fsum(values)
         mean = total / len(values)
         stddev = statistics.stdev(values)
+
         # Normalize the stddev so we can target "percentage changed" rather than
         # absolute time
         sigma = stddev / mean
@@ -455,6 +462,7 @@
         # 1% variation
         W = 0.01
+
         # (4Z²σ²)/(W²)
         return int(math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2)))
 
     def percentile(self, p):
diff --git a/pyperf/_cli.py b/pyperf/_cli.py
index 266031e..6625e19 100644
--- a/pyperf/_cli.py
+++ b/pyperf/_cli.py
@@ -400,7 +400,7 @@ def value_bucket(value):
     return lines
 
 
-def format_checks(bench, lines=None):
+def format_checks(bench, lines=None, check_too_many_processes=False):
     if lines is None:
         lines = []
 
@@ -413,7 +413,7 @@
     warnings = []
     warn = warnings.append
 
-    required_nsamples = bench.required_nsamples()
+    required_nprocesses = bench.required_nprocesses()
 
     # Display a warning if the standard deviation is greater than 10%
    # of the mean
@@ -426,8 +426,8 @@
    else:
        # display a warning if the number of samples isn't enough to get a stable result
        if (
-            required_nsamples is not None and
-            required_nsamples > len(bench._runs)
+            required_nprocesses is not None and
+            required_nprocesses > len(bench._runs)
        ):
            warn("Not enough samples to get a stable result (95% certainly of less than 1% variation)")
 
@@ -467,13 +467,14 @@
        lines.append("Use --quiet option to hide these warnings.")
 
    if (
-        required_nsamples is not None and
-        required_nsamples < len(bench._runs) * 0.75
+        check_too_many_processes and
+        required_nprocesses is not None and
+        required_nprocesses < len(bench._runs) * 0.75
    ):
        lines.append("Benchmark was run more times than necessary to get a stable result.")
        lines.append(
            "Consider passing processes=%d to the Runner constructor to save time." %
-            required_nsamples
+            required_nprocesses
        )
 
    # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata
@@ -568,7 +569,7 @@ def format_result(bench):
 
 def format_benchmark(bench, checks=True, metadata=False,
                      dump=False, stats=False, hist=False, show_name=False,
-                     result=True, display_runs_args=None):
+                     result=True, display_runs_args=None, only_checks=False):
     lines = []
 
     if metadata:
@@ -587,7 +588,7 @@
         format_stats(bench, lines=lines)
 
     if checks:
-        format_checks(bench, lines=lines)
+        format_checks(bench, lines=lines, check_too_many_processes=only_checks)
 
     if result:
         empty_line(lines)
diff --git a/pyperf/tests/test_perf_cli.py b/pyperf/tests/test_perf_cli.py
index 8ce62cc..0e01380 100644
--- a/pyperf/tests/test_perf_cli.py
+++ b/pyperf/tests/test_perf_cli.py
@@ -478,16 +478,11 @@ def test_hist(self):
         22.8 ms: 3 ##############
         22.9 ms: 4 ###################
         22.9 ms: 4 ###################
 
-        Benchmark was run more times than necessary to get a stable result.
-        Consider passing processes=7 to the Runner constructor to save time.
         """)
         self.check_command(expected, 'hist', TELCO, env=env)
 
     def test_show(self):
         expected = ("""
-        Benchmark was run more times than necessary to get a stable result.
-        Consider passing processes=7 to the Runner constructor to save time.
-
         Mean +- std dev: 22.5 ms +- 0.2 ms
         """)
         self.check_command(expected, 'show', TELCO)
@@ -523,8 +518,6 @@ def test_stats(self):
         100th percentile: 22.9 ms (+2% of the mean) -- maximum
 
         Number of outlier (out of 22.0 ms..23.0 ms): 0
 
-        Benchmark was run more times than necessary to get a stable result.
-        Consider passing processes=7 to the Runner constructor to save time.
         """)
         self.check_command(expected, 'stats', TELCO)
@@ -635,6 +628,14 @@ def test_slowest(self):
 
     def test_check_stable(self):
         stdout = self.run_command('check', TELCO)
+        self.assertTrue(
+            textwrap.dedent(
+                """
+                Benchmark was run more times than necessary to get a stable result.
+                Consider passing processes=7 to the Runner constructor to save time.
+                """
+            ).strip() in stdout.rstrip()
+        )
         self.assertTrue(
             'The benchmark seems to be stable' in stdout.rstrip()
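
For reference, here is a minimal, self-contained sketch of the computation that
required_nprocesses() performs, using the same constants as the patch (Z = 1.96,
W = 0.01). The helper name and the per-process means are illustrative, not part
of the patch:

    import math
    import statistics


    def required_nprocesses(process_means):
        # Fewer than two per-process means leave the stddev undefined.
        if len(process_means) < 2:
            return None

        # Normalize the stddev so the target is "percentage changed"
        # rather than absolute time, as in pyperf/_bench.py above.
        mean = math.fsum(process_means) / len(process_means)
        sigma = statistics.stdev(process_means) / mean

        Z = 1.96  # two-sided Z-score for 95% certainty
        W = 0.01  # 1% variation

        # (4Z²σ²)/(W²)
        return int(math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2)))


    # Hypothetical per-process means in seconds (~0.8% relative stddev):
    print(required_nprocesses([0.100, 0.101, 0.099, 0.1005, 0.0995]))  # -> 10

format_checks() compares this estimate against len(bench._runs): a larger
estimate produces the "not enough samples" warning, while an estimate below 75%
of the recorded runs produces the "more times than necessary" advice, and after
this patch only when check_too_many_processes is set.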
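
And a hypothetical end-to-end use of the renamed method through the public API
(this assumes a pyperf build that includes this patch; the benchmark data is
made up):

    import pyperf

    # 20 synthetic single-value "processes" spread ~0.3% around 22.6 ms.
    runs = [
        pyperf.Run([0.0225 * (1.0 + 0.002 * (i % 5))],
                   metadata={'name': 'telco_like'},
                   collect_metadata=False)
        for i in range(20)
    ]
    bench = pyperf.Benchmark(runs)

    print(bench.required_nprocesses())  # -> 2 for values this stable

Since 2 is far below 75% of the 20 recorded runs, the check command would emit
the "Consider passing processes=..." advice for a benchmark like this one,
while show, hist and stats stay quiet about it after this patch.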