Skip to content

Commit e7576e6

Browse files
authored
Revert "Add a --dynamic-flaky flag, allowing ecosystem-analyzer to short-circuit if it detects that all diagnostic changes are flaky" (#47)
1 parent fabf0dc commit e7576e6

5 files changed

Lines changed: 66 additions & 801 deletions

File tree

src/ecosystem_analyzer/flaky.py

Lines changed: 0 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -21,13 +21,6 @@ def _location_key(diag: Diagnostic) -> tuple[str, int, int]:
2121
return (diag["path"], diag["line"], diag["column"])
2222

2323

24-
def diagnostic_keys(
25-
diagnostics: list[Diagnostic],
26-
) -> frozenset[tuple[str, int, int, str, str, str]]:
27-
"""Return the set of unique diagnostic keys for a list of diagnostics."""
28-
return frozenset(_diagnostic_key(d) for d in diagnostics)
29-
30-
3124
def classify_diagnostics(
3225
all_runs: list[list[Diagnostic]],
3326
) -> tuple[list[Diagnostic], list[FlakyLocation]]:

src/ecosystem_analyzer/main.py

Lines changed: 4 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -289,14 +289,6 @@ def analyze(
289289
type=click.Path(exists=True, dir_okay=False, readable=True, path_type=Path),
290290
required=False,
291291
)
292-
@click.option(
293-
"--dynamic-flaky",
294-
is_flag=True,
295-
help="Enable dynamic flaky detection: skip reruns when there are no changes "
296-
"relative to baseline, and short-circuit when all changes are flaky. "
297-
"--flaky-runs becomes the maximum number of runs. Note: flakiness is only "
298-
"detected for diagnostics that differ from baseline.",
299-
)
300292
@click.pass_context
301293
def diff(
302294
ctx,
@@ -313,7 +305,6 @@ def diff(
313305
num_shards: int | None,
314306
ty_binary_old: Path | None,
315307
ty_binary_new: Path | None,
316-
dynamic_flaky: bool,
317308
) -> None:
318309
"""
319310
Compare diagnostics between two commits.
@@ -337,10 +328,6 @@ def diff(
337328
):
338329
raise click.UsageError(f"--shard must be in range [0, {num_shards})")
339330

340-
if dynamic_flaky and ctx.obj["flaky_runs"] < 2:
341-
click.echo("Error: --dynamic-flaky requires --flaky-runs >= 2", err=True)
342-
ctx.exit(1)
343-
344331
project_names_old = Path(projects_old).read_text().splitlines()
345332
project_names_new = Path(projects_new).read_text().splitlines()
346333
flaky_project_names = (
@@ -372,17 +359,15 @@ def diff(
372359
)
373360

374361
# Build (or use pre-built) old ty — building overlaps with background
375-
# project installation. In dynamic mode, the old side runs once (no flaky
376-
# detection) and its output is passed as a baseline so the new side can
377-
# skip reruns for unchanged projects and short-circuit when all changes
378-
# are flaky.
362+
# project installation
379363
if ty_binary_old is not None:
380364
manager.use_prebuilt(ty_binary_old, old)
381365
else:
382366
manager.build(old)
383367

368+
# Run for old commit with old projects
384369
manager.activate(project_names_old)
385-
run_outputs_old = manager.run_active_projects(single_run=dynamic_flaky)
370+
run_outputs_old = manager.run_active_projects()
386371
manager.write_run_outputs(run_outputs_old, output_old)
387372

388373
# Build (or use pre-built) new ty — incremental build is near-instant
@@ -391,9 +376,7 @@ def diff(
391376
else:
392377
manager.build(new)
393378
manager.activate(project_names_new)
394-
run_outputs_new = manager.run_active_projects(
395-
baseline=run_outputs_old if dynamic_flaky else None
396-
)
379+
run_outputs_new = manager.run_active_projects()
397380
manager.write_run_outputs(run_outputs_new, output_new)
398381

399382

src/ecosystem_analyzer/manager.py

Lines changed: 15 additions & 45 deletions
Original file line numberDiff line numberDiff line change
@@ -164,56 +164,26 @@ def run_for_commit(self, commit: str | Commit) -> list[RunOutput]:
164164
self._ensure_installed()
165165
return self._run_active_projects()
166166

167-
def run_active_projects(
168-
self,
169-
*,
170-
baseline: list[RunOutput] | None = None,
171-
single_run: bool = False,
172-
) -> list[RunOutput]:
173-
"""Run the current ty build on active projects.
174-
175-
When *baseline* is provided, flaky projects use dynamic detection
176-
that can skip reruns or short-circuit early (see
177-
``Ty.run_on_project_dynamic``). Without a baseline the fixed
178-
``--flaky-runs`` behaviour is used.
179-
180-
When *single_run* is True, every project runs exactly once
181-
regardless of ``--flaky-runs`` — useful for establishing a
182-
baseline for dynamic detection.
183-
"""
167+
def run_active_projects(self) -> list[RunOutput]:
168+
"""Run the current ty build on active projects."""
184169
self._ensure_installed()
185-
return self._run_active_projects(baseline=baseline, single_run=single_run)
186-
187-
def _is_flaky_project(self, project: InstalledProject) -> bool:
188-
return self._flaky_runs > 1 and (
189-
not self._flaky_projects or project.name in self._flaky_projects
190-
)
191-
192-
def _run_active_projects(
193-
self,
194-
*,
195-
baseline: list[RunOutput] | None = None,
196-
single_run: bool = False,
197-
) -> list[RunOutput]:
198-
assert not (single_run and baseline is not None), (
199-
"single_run=True and baseline are mutually exclusive: a baseline "
200-
"is only meaningful for dynamic flaky detection"
201-
)
202-
baseline_by_project: dict[str, RunOutput] = (
203-
{o["project"]: o for o in baseline} if baseline is not None else {}
204-
)
170+
return self._run_active_projects()
205171

172+
def _run_active_projects(self) -> list[RunOutput]:
206173
run_outputs = []
207174
for project in self._active_projects:
208-
if not single_run and self._is_flaky_project(project):
209-
if baseline is not None:
210-
output = self._ty.run_on_project_dynamic(
211-
project,
212-
self._flaky_runs,
213-
baseline_by_project.get(project.name),
175+
n = (
176+
self._flaky_runs
177+
if (
178+
self._flaky_runs > 1
179+
and (
180+
not self._flaky_projects or project.name in self._flaky_projects
214181
)
215-
else:
216-
output = self._ty.run_on_project_multiple(project, self._flaky_runs)
182+
)
183+
else 1
184+
)
185+
if n > 1:
186+
output = self._ty.run_on_project_multiple(project, n)
217187
else:
218188
output = self._ty.run_on_project(project)
219189
run_outputs.append(output)

src/ecosystem_analyzer/ty.py

Lines changed: 47 additions & 157 deletions
Original file line numberDiff line numberDiff line change
@@ -9,10 +9,10 @@
99

1010
from git import Commit, Repo
1111

12-
from .diagnostic import Diagnostic, DiagnosticsParser
13-
from .flaky import classify_diagnostics, diagnostic_keys
12+
from .diagnostic import DiagnosticsParser
13+
from .flaky import classify_diagnostics
1414
from .installed_project import InstalledProject
15-
from .run_output import FlakyLocation, RunOutput
15+
from .run_output import RunOutput
1616

1717
logger = logging.getLogger(__name__)
1818

@@ -169,16 +169,49 @@ def run_on_project(self, project: InstalledProject) -> RunOutput:
169169
output["panic_messages"] = panic_messages
170170
return output
171171

172-
def _build_multi_run_result(
173-
self,
174-
project: InstalledProject,
175-
stable: list[Diagnostic],
176-
flaky_locations: list[FlakyLocation],
177-
n: int,
178-
times: list[float],
179-
return_codes: list[int],
180-
) -> RunOutput:
181-
"""Build a RunOutput from the results of multiple ty runs."""
172+
def run_on_project_multiple(self, project: InstalledProject, n: int) -> RunOutput:
173+
"""Run ty on a project N times and classify diagnostics as stable/flaky.
174+
175+
Returns a single RunOutput where `diagnostics` contains only stable
176+
diagnostics and `flaky_diagnostics` contains grouped flaky ones.
177+
"""
178+
assert n >= 2, "Use run_on_project for single runs"
179+
logger.info(
180+
f"Running ty on project '{project.name}' {n} times for flaky detection"
181+
)
182+
183+
all_diagnostics: list[list] = []
184+
times: list[float] = []
185+
return_codes: list[int | None] = []
186+
187+
for i in range(n):
188+
logger.info(f" Run {i + 1}/{n} for '{project.name}'")
189+
output = self.run_on_project(project)
190+
191+
# If any run fails abnormally, bail out and return the failure
192+
if output.get("return_code") is not None and output["return_code"] not in (
193+
0,
194+
1,
195+
):
196+
logger.warning(
197+
f"Run {i + 1}/{n} for '{project.name}' failed with return code "
198+
f"{output['return_code']}; aborting flaky detection"
199+
)
200+
return output
201+
if output.get("return_code") is None:
202+
# Timeout
203+
logger.warning(
204+
f"Run {i + 1}/{n} for '{project.name}' timed out; aborting flaky detection"
205+
)
206+
return output
207+
208+
all_diagnostics.append(output["diagnostics"])
209+
if (time_s := output.get("time_s")) is not None:
210+
times.append(time_s)
211+
return_codes.append(output.get("return_code"))
212+
213+
stable, flaky_locations = classify_diagnostics(all_diagnostics)
214+
182215
# Use median time
183216
median_time: float | None = None
184217
if times:
@@ -187,7 +220,7 @@ def _build_multi_run_result(
187220
median_time = sorted_times[mid]
188221

189222
# Use most common return code
190-
rc_counts = Counter(return_codes)
223+
rc_counts = Counter(rc for rc in return_codes if rc is not None)
191224
most_common_rc = rc_counts.most_common(1)[0][0] if rc_counts else None
192225

193226
result = RunOutput({
@@ -211,146 +244,3 @@ def _build_multi_run_result(
211244
)
212245

213246
return result
214-
215-
@staticmethod
216-
def _run_aborted(
217-
output: RunOutput, project: InstalledProject, run_idx: int, total: int
218-
) -> bool:
219-
"""Log and return True if this run's exit status aborts flaky detection."""
220-
rc = output.get("return_code")
221-
if rc is None:
222-
logger.warning(
223-
f"Run {run_idx}/{total} for '{project.name}' timed out; "
224-
f"aborting flaky detection"
225-
)
226-
return True
227-
if rc not in (0, 1):
228-
logger.warning(
229-
f"Run {run_idx}/{total} for '{project.name}' failed with return "
230-
f"code {rc}; aborting flaky detection"
231-
)
232-
return True
233-
return False
234-
235-
def run_on_project_multiple(self, project: InstalledProject, n: int) -> RunOutput:
236-
"""Run ty on a project N times and classify diagnostics as stable/flaky.
237-
238-
Returns a single RunOutput where `diagnostics` contains only stable
239-
diagnostics and `flaky_diagnostics` contains grouped flaky ones.
240-
"""
241-
assert n >= 2, "Use run_on_project for single runs"
242-
logger.info(
243-
f"Running ty on project '{project.name}' {n} times for flaky detection"
244-
)
245-
246-
all_diagnostics: list[list] = []
247-
times: list[float] = []
248-
return_codes: list[int] = []
249-
250-
for i in range(n):
251-
logger.info(f" Run {i + 1}/{n} for '{project.name}'")
252-
output = self.run_on_project(project)
253-
if self._run_aborted(output, project, i + 1, n):
254-
return output
255-
256-
all_diagnostics.append(output["diagnostics"])
257-
if (time_s := output.get("time_s")) is not None:
258-
times.append(time_s)
259-
rc = output["return_code"]
260-
assert rc is not None
261-
return_codes.append(rc)
262-
263-
stable, flaky_locations = classify_diagnostics(all_diagnostics)
264-
return self._build_multi_run_result(
265-
project, stable, flaky_locations, n, times, return_codes
266-
)
267-
268-
def run_on_project_dynamic(
269-
self,
270-
project: InstalledProject,
271-
max_runs: int,
272-
baseline: RunOutput | None,
273-
) -> RunOutput:
274-
"""Run ty with dynamic flaky detection that can short-circuit.
275-
276-
Compared to ``run_on_project_multiple`` (which always runs exactly N
277-
times), this method can finish early:
278-
279-
1. If the first run produces identical diagnostics to *baseline*,
280-
all reruns are skipped — there are no changes to investigate.
281-
2. After each subsequent run (starting from run 2), if every
282-
diagnostic that *differs* from the baseline has been classified as
283-
flaky, the remaining runs are skipped.
284-
285-
*baseline* is typically the single-run output from the old commit in
286-
a ``diff`` invocation. When *baseline* is ``None`` (e.g. for a
287-
newly added project), the empty set is used — so optimisation 1
288-
fires only when the first run itself is empty, and optimisation 2
289-
fires when every diagnostic turns out to be flaky.
290-
291-
Note: flakiness in diagnostics shared with *baseline* is only
292-
missed when the first run happens to match *baseline* exactly —
293-
Optimisation 1 skips reruns and we never get a chance to observe
294-
the variation. Once reruns do happen, every diagnostic is
295-
classified via ``classify_diagnostics`` regardless of whether it
296-
is shared with *baseline*.
297-
"""
298-
assert max_runs >= 2, "Use run_on_project for single runs"
299-
logger.info(
300-
f"Running ty on project '{project.name}' with dynamic flaky detection "
301-
f"(max {max_runs} runs)"
302-
)
303-
304-
baseline_keys = (
305-
diagnostic_keys(baseline["diagnostics"]) if baseline else frozenset()
306-
)
307-
308-
logger.info(f" Run 1/{max_runs} for '{project.name}'")
309-
first_output = self.run_on_project(project)
310-
if self._run_aborted(first_output, project, 1, max_runs):
311-
return first_output
312-
313-
# Optimisation 1: no changes relative to baseline → skip reruns
314-
if diagnostic_keys(first_output["diagnostics"]) == baseline_keys:
315-
logger.info(f" '{project.name}': no changes vs baseline, skipping reruns")
316-
return first_output
317-
318-
all_diagnostics: list[list[Diagnostic]] = [first_output["diagnostics"]]
319-
times: list[float] = []
320-
if (t := first_output.get("time_s")) is not None:
321-
times.append(t)
322-
first_rc = first_output["return_code"]
323-
assert first_rc is not None
324-
return_codes: list[int] = [first_rc]
325-
326-
for i in range(1, max_runs):
327-
logger.info(f" Run {i + 1}/{max_runs} for '{project.name}'")
328-
output = self.run_on_project(project)
329-
if self._run_aborted(output, project, i + 1, max_runs):
330-
return output
331-
332-
all_diagnostics.append(output["diagnostics"])
333-
if (t := output.get("time_s")) is not None:
334-
times.append(t)
335-
rc = output["return_code"]
336-
assert rc is not None
337-
return_codes.append(rc)
338-
339-
stable, flaky_locations = classify_diagnostics(all_diagnostics)
340-
341-
# Optimisation 2: all changes vs baseline are flaky → short-circuit
342-
if diagnostic_keys(stable) == baseline_keys:
343-
logger.info(
344-
f" '{project.name}': all changes are flaky after "
345-
f"{len(all_diagnostics)} runs, short-circuiting"
346-
)
347-
break
348-
349-
return self._build_multi_run_result(
350-
project,
351-
stable,
352-
flaky_locations,
353-
len(all_diagnostics),
354-
times,
355-
return_codes,
356-
)

0 commit comments

Comments (0)