psf · mdboom · Dec 19, 2024 · Dec 17, 2024 · Dec 17, 2024 · Dec 17, 2024
diff --git a/pyperf/__main__.py b/pyperf/__main__.py
@@ -491,10 +491,13 @@ def display_benchmarks(args, show_metadata=False, hist=False, stats=False,
                 empty_line(output)
                 output.extend(lines)
 
+        contains_warning = False
         for line in output:
+            if line.startswith("WARNING:"):
+                contains_warning = True
             print(line)
 
-        if not output and only_checks:
+        if not contains_warning and only_checks:
             if len(data) == 1:
                 print("The benchmark seems to be stable")
             else:

diff --git a/pyperf/_bench.py b/pyperf/_bench.py
@@ -424,6 +424,39 @@ def median_abs_dev(self):
             raise ValueError("MAD must be >= 0")
         return value
 
+    def required_nsamples(self):
+        """
+        Estimate how many per-run means are needed for a stable mean.
+
+        Computes n = 4 * Z**2 * sigma**2 / W**2 with Z = 1.96 (95% certainty)
+        and W = 0.01 (1% variation); sigma is the standard deviation of the
+        per-run means divided by their mean. Returns None when fewer than
+        two runs have values or when the mean is zero.
+
+        https://en.wikipedia.org/wiki/Sample_size_determination#Estimation_of_a_mean
+        """
+        # One sample per run: the mean of that run's values. Runs without
+        # any value are skipped.
+        values = [statistics.mean(run.values)
+                  for run in self._runs if len(run.values)]
+
+        # statistics.stdev() needs at least two data points
+        if len(values) < 2:
+            return None
+
+        mean = math.fsum(values) / len(values)
+        if mean == 0:
+            # The relative deviation is undefined for a zero mean
+            return None
+        # Normalize the stddev so we can target "percentage changed" rather
+        # than absolute time
+        sigma = statistics.stdev(values) / mean
+
+        Z = 1.96  # 95% certainty
+        W = 0.01  # 1% variation
+
+        return int(math.ceil((4 * Z ** 2 * sigma ** 2) / (W ** 2)))
+
     def percentile(self, p):
         if not (0 <= p <= 100):
             raise ValueError("p must be in the range [0; 100]")

diff --git a/pyperf/_cli.py b/pyperf/_cli.py
@@ -413,6 +413,8 @@ def format_checks(bench, lines=None):
     warnings = []
     warn = warnings.append
 
+    required_nsamples = bench.required_nsamples()
+
     # Display a warning if the standard deviation is greater than 10%
     # of the mean
     if len(values) >= 2:
@@ -421,6 +423,13 @@ def format_checks(bench, lines=None):
         if percent >= 10.0:
             warn("the standard deviation (%s) is %.0f%% of the mean (%s)"
                  % (bench.format_value(stdev), percent, bench.format_value(mean)))
+        else:
+            # display a warning if the number of samples isn't enough to get a stable result
+            if (
+                required_nsamples is not None and
+                required_nsamples > len(bench._runs)
+            ):
+                warn("Not enough samples to get a stable result (95% certainly of less than 1% variation)")
 
     # Minimum and maximum, detect obvious outliers
     for minimum, value in (
@@ -457,6 +466,16 @@ def format_checks(bench, lines=None):
         lines.append("Use pyperf stats, pyperf dump and pyperf hist to analyze results.")
         lines.append("Use --quiet option to hide these warnings.")
 
+    if (
+        required_nsamples is not None and
+        required_nsamples < len(bench._runs) * 0.75
+    ):
+        lines.append("Benchmark was run more times than necessary to get a stable result.")
+        lines.append(
+            "Consider passing processes=%d to the Runner constructor to save time." %
+            required_nsamples
+        )
+
     # Warn if nohz_full+intel_pstate combo if found in cpu_config metadata
     for run in bench._runs:
         cpu_config = run._metadata.get('cpu_config')

diff --git a/pyperf/tests/test_perf_cli.py b/pyperf/tests/test_perf_cli.py
@@ -478,11 +478,16 @@ def test_hist(self):
             22.8 ms:  3 ##############
             22.9 ms:  4 ###################
             22.9 ms:  4 ###################
+            Benchmark was run more times than necessary to get a stable result.
+            Consider passing processes=7 to the Runner constructor to save time.
         """)
         self.check_command(expected, 'hist', TELCO, env=env)
 
     def test_show(self):
         expected = ("""
+            Benchmark was run more times than necessary to get a stable result.
+            Consider passing processes=7 to the Runner constructor to save time.
+
             Mean +- std dev: 22.5 ms +- 0.2 ms
         """)
         self.check_command(expected, 'show', TELCO)
@@ -518,6 +523,8 @@ def test_stats(self):
             100th percentile: 22.9 ms (+2% of the mean) -- maximum
 
             Number of outlier (out of 22.0 ms..23.0 ms): 0
+            Benchmark was run more times than necessary to get a stable result.
+            Consider passing processes=7 to the Runner constructor to save time.
         """)
         self.check_command(expected, 'stats', TELCO)
 
@@ -628,8 +635,10 @@ def test_slowest(self):
 
     def test_check_stable(self):
         stdout = self.run_command('check', TELCO)
-        self.assertEqual(stdout.rstrip(),
-                         'The benchmark seems to be stable')
+        # Substring match rather than equality: the output may carry extra
+        # lines besides the "stable" message.
+        self.assertIn('The benchmark seems to be stable',
+                      stdout.rstrip())
 
     def test_command(self):
         command = [sys.executable, '-c', 'pass']
@@ -689,7 +698,7 @@ def _check_track_memory(self, track_option):
                              '[1,2]*1000',
                              '-o', tmp_name)
             bench = pyperf.Benchmark.load(tmp_name)
-        
+
         self._check_track_memory_bench(bench, loops=5)
 
     def test_track_memory(self):