diff --git a/code_review_graph/cli.py b/code_review_graph/cli.py index 102ad8a8..cf1d2f60 100644 --- a/code_review_graph/cli.py +++ b/code_review_graph/cli.py @@ -88,6 +88,16 @@ def _supports_color() -> bool: return sys.stdout.isatty() +def _print_disambiguated(result: dict, limit: int = 10) -> None: + """Report duplicate symbol names that were disambiguated during a build.""" + dups = result.get("disambiguated_nodes") or [] + if not dups: + return + shown = ", ".join(dups[:limit]) + more = f" (+{len(dups) - limit} more)" if len(dups) > limit else "" + print(f"Disambiguated {len(dups)} duplicate symbol name(s): {shown}{more}") + + def _print_banner() -> None: """Print the startup banner with graph art and available commands.""" color = _supports_color() @@ -994,6 +1004,7 @@ def main() -> None: nodes = result.get("total_nodes", 0) edges = result.get("total_edges", 0) print(f"Full build: {parsed} files, {nodes} nodes, {edges} edges (postprocess={pp})") + _print_disambiguated(result) if result.get("errors"): print(f"Errors: {len(result['errors'])}") @@ -1019,6 +1030,7 @@ def main() -> None: f"{nodes} nodes, {edges} edges" f" (postprocess={pp})" ) + _print_disambiguated(result) # --brief: append a one-line change-impact summary with the same # estimated context-savings approximation that detect-changes uses. diff --git a/code_review_graph/graph.py b/code_review_graph/graph.py index 4ed3c7fc..ba26df53 100644 --- a/code_review_graph/graph.py +++ b/code_review_graph/graph.py @@ -10,6 +10,7 @@ import json import logging import os +import re import sqlite3 import threading import time @@ -25,6 +26,10 @@ logger = logging.getLogger(__name__) +# Suffix appended by _qualified_names_for_file to a 2nd+ same-file collision, +# e.g. "path::getById:L394". This regex recognizes those keys after the fact. +_DISAMBIGUATED_RE = re.compile(r":L\d+(?:#\d+)?$") + # --------------------------------------------------------------------------- # Schema # --------------------------------------------------------------------------- @@ -186,10 +191,19 @@ def close(self) -> None: # --- Write operations --- - def upsert_node(self, node: NodeInfo, file_hash: str = "") -> int: - """Insert or update a node. Returns the node ID.""" + def upsert_node( + self, node: NodeInfo, file_hash: str = "", qualified: str | None = None + ) -> int: + """Insert or update a node. Returns the node ID. + + Pass ``qualified`` to override the computed key — used by the batch store + functions to disambiguate same-named symbols in one file (see + ``_qualified_names_for_file``). Without it, collisions would collapse + under ``ON CONFLICT(qualified_name) DO UPDATE`` and silently drop nodes. + """ now = time.time() - qualified = self._make_qualified(node) + if qualified is None: + qualified = self._make_qualified(node) extra = json.dumps(node.extra) if node.extra else "{}" self._conn.execute( @@ -276,8 +290,9 @@ def store_file_nodes_edges( self._begin_immediate() try: self.remove_file_data(file_path) - for node in nodes: - self.upsert_node(node, file_hash=fhash) + qualified_names = self._qualified_names_for_file(nodes) + for node, qualified in zip(nodes, qualified_names): + self.upsert_node(node, file_hash=fhash, qualified=qualified) for edge in edges: self.upsert_edge(edge) self._conn.commit() @@ -294,8 +309,9 @@ def store_file_batch( try: for file_path, nodes, edges, fhash in batch: self.remove_file_data(file_path) - for node in nodes: - self.upsert_node(node, file_hash=fhash) + qualified_names = self._qualified_names_for_file(nodes) + for node, qualified in zip(nodes, qualified_names): + self.upsert_node(node, file_hash=fhash, qualified=qualified) for edge in edges: self.upsert_edge(edge) self._conn.commit() @@ -866,6 +882,21 @@ def get_stats(self) -> GraphStats: last_updated=last_updated, ) + def find_disambiguated_nodes(self) -> list[str]: + """Qualified names suffixed to resolve a same-file name collision. + + When two same-named symbols share a file, the first keeps its bare key + and later ones get a ``:L`` suffix (see _qualified_names_for_file). + Surfacing these makes the otherwise-invisible collisions reportable. + """ + rows = self._conn.execute( + "SELECT qualified_name FROM nodes WHERE qualified_name GLOB '*:L[0-9]*'" + ).fetchall() + return sorted( + r["qualified_name"] for r in rows + if _DISAMBIGUATED_RE.search(r["qualified_name"]) + ) + def get_nodes_by_size( self, min_lines: int = 50, @@ -1300,6 +1331,29 @@ def _make_qualified(self, node: NodeInfo) -> str: return f"{node.file_path}::{node.parent_name}.{node.name}" return f"{node.file_path}::{node.name}" + def _qualified_names_for_file(self, nodes: list[NodeInfo]) -> list[str]: + """Compute collision-free qualified names for one file's nodes. + + The first occurrence of a key keeps its bare form so existing edges + (which reference the same parser-computed key) still resolve to it. + Later same-key symbols are suffixed with their start line — two defs + cannot share a ``line_start``, so this is always unique. + """ + names: list[str] = [] + seen: set[str] = set() + for index, node in enumerate(nodes): + base = self._make_qualified(node) + if base not in seen: + seen.add(base) + names.append(base) + continue + candidate = f"{base}:L{node.line_start}" + if candidate in seen: + candidate = f"{base}:L{node.line_start}#{index}" + seen.add(candidate) + names.append(candidate) + return names + def _row_to_node(self, row: sqlite3.Row) -> GraphNode: return GraphNode( id=row["id"], diff --git a/code_review_graph/incremental.py b/code_review_graph/incremental.py index 81dc3026..601de69e 100644 --- a/code_review_graph/incremental.py +++ b/code_review_graph/incremental.py @@ -905,10 +905,18 @@ def full_build( spring_stats = _run_spring_resolver(store) temporal_stats = _run_temporal_resolver(store) + disambiguated = store.find_disambiguated_nodes() + if disambiguated: + logger.info( + "Disambiguated %d duplicate qualified_name(s): %s", + len(disambiguated), ", ".join(disambiguated), + ) + return { "files_parsed": len(files), "total_nodes": total_nodes, "total_edges": total_edges, + "disambiguated_nodes": disambiguated, "errors": errors, "rescript_resolution": rescript_stats, "spring_resolution": spring_stats, @@ -1043,10 +1051,13 @@ def incremental_update( spring_stats = _run_spring_resolver(store) if spring_changed else None temporal_stats = _run_temporal_resolver(store) if spring_changed else None + disambiguated = store.find_disambiguated_nodes() + return { "files_updated": len(all_files), "total_nodes": total_nodes, "total_edges": total_edges, + "disambiguated_nodes": disambiguated, "changed_files": list(changed_files), "dependent_files": list(dependent_files), "errors": errors, diff --git a/tests/test_collisions.py b/tests/test_collisions.py new file mode 100644 index 00000000..90c97a3a --- /dev/null +++ b/tests/test_collisions.py @@ -0,0 +1,78 @@ +"""Same-file qualified_name collisions must not silently drop nodes. + +Same-named symbols in one file (methods of different object literals, repeated +generated helpers) produce the same qualified_name. Before the fix, the insert's +ON CONFLICT(qualified_name) DO UPDATE collapsed them last-writer-wins and the +earlier definitions vanished. The batch store now disambiguates them. +""" + +import tempfile +from pathlib import Path + +from code_review_graph.graph import GraphStore +from code_review_graph.parser import EdgeInfo, NodeInfo + + +def _func(name, line, path="/repo/db.ts", parent=None): + return NodeInfo( + kind="Function", name=name, file_path=path, + line_start=line, line_end=line + 2, language="typescript", + parent_name=parent, + ) + + +class TestQualifiedNameCollisions: + def setup_method(self): + self.tmp = tempfile.NamedTemporaryFile(suffix=".db", delete=False) + self.store = GraphStore(self.tmp.name) + + def teardown_method(self): + self.store.close() + Path(self.tmp.name).unlink(missing_ok=True) + + def test_same_name_methods_both_persist(self): + # Two object literals each defining getById -> empty parent_name -> collide. + self.store.store_file_nodes_edges( + "/repo/db.ts", [_func("getById", 10), _func("getById", 20)], [] + ) + assert self.store.get_stats().total_nodes == 2 + # First occurrence keeps the bare key; the second carries a line suffix. + first = self.store.get_node("/repo/db.ts::getById") + assert first is not None and first.line_start == 10 + second = self.store.get_node("/repo/db.ts::getById:L20") + assert second is not None and second.line_start == 20 + + def test_three_same_name_module_functions_persist(self): + nodes = [_func("rootRouteImport", n) for n in (5, 10, 15)] + self.store.store_file_nodes_edges("/repo/routeTree.gen.ts", nodes, []) + assert self.store.get_stats().total_nodes == 3 + + def test_edge_to_bare_key_resolves_to_first_def(self): + # An edge targeting the bare key still names a real node (the first def), + # so collisions never dangle existing edges. + self.store.store_file_nodes_edges( + "/repo/db.ts", [_func("getById", 10), _func("getById", 20)], [] + ) + edge = EdgeInfo( + kind="CALLS", source="/repo/api.ts::handler", + target="/repo/db.ts::getById", file_path="/repo/api.ts", line=3, + ) + self.store.store_file_nodes_edges( + "/repo/api.ts", [_func("handler", 1, path="/repo/api.ts")], [edge] + ) + target = self.store.get_node("/repo/db.ts::getById") + assert target is not None and target.line_start == 10 + + def test_find_disambiguated_nodes_lists_suffixed_keys(self): + self.store.store_file_nodes_edges( + "/repo/db.ts", [_func("getById", 10), _func("getById", 20)], [] + ) + assert self.store.find_disambiguated_nodes() == ["/repo/db.ts::getById:L20"] + + def test_unique_names_keep_bare_keys_unchanged(self): + self.store.store_file_nodes_edges( + "/repo/db.ts", [_func("a", 10), _func("b", 20)], [] + ) + assert self.store.find_disambiguated_nodes() == [] + assert self.store.get_node("/repo/db.ts::a") is not None + assert self.store.get_node("/repo/db.ts::b") is not None