diff --git a/seagoat/sources/ripgrep.py b/seagoat/sources/ripgrep.py
index 93936c71..0344713b 100644
--- a/seagoat/sources/ripgrep.py
+++ b/seagoat/sources/ripgrep.py
@@ -86,8 +86,16 @@ def rebuild(self):
             with open(self.file_path, encoding="utf-8") as cache_file:
                 self._data = cache_file.read()
         else:
+            # On POSIX systems, mmap() cannot map a zero-length file.
+            # If the cache file is empty, avoid mmap and use an empty bytes buffer instead.
+            # See Issue #945 for details.
             with open(self.file_path, "r+b") as cache_file:
-                self._data = mmap.mmap(cache_file.fileno(), 0)
+                # seek() returns the new absolute offset, so seeking to the
+                # end of the file (whence=2) yields its size in bytes.
+                if cache_file.seek(0, 2) == 0:
+                    self._data = b""
+                else:
+                    self._data = mmap.mmap(cache_file.fileno(), 0)
 
     def encode(self, *args, **kwargs):  # type: ignore
         return self._data
@@ -128,8 +136,28 @@ def _fetch(query_text: str, path: str, limit: int, cache: RipGrepCache):
     ]
 
     try:
+        # Determine safe text input for subprocess
+        try:
+            raw = cache.encode("utf-8")  # RipGrepCache overrides encode()
+        except Exception:
+            raw = cache.as_input()
+
+        if isinstance(raw, (bytes, bytearray)):
+            cache_input_text = raw.decode("utf-8", errors="ignore")
+        elif hasattr(raw, "read"):
+            # Likely an mmap object
+            try:
+                b = bytes(raw)
+            except Exception:
+                b = b""
+            cache_input_text = b.decode("utf-8", errors="ignore")
+        elif isinstance(raw, str):
+            cache_input_text = raw
+        else:
+            cache_input_text = str(raw)
+
         rg_output = subprocess.check_output(
-            cmd, encoding="utf-8", input=cache.as_input()
+            cmd, encoding="utf-8", input=cache_input_text
         )
     except subprocess.CalledProcessError as exception:
         rg_output = exception.output
diff --git a/tests/test_issue_945.py b/tests/test_issue_945.py
new file mode 100644
index 00000000..e962aacc
--- /dev/null
+++ b/tests/test_issue_945.py
@@ -0,0 +1,67 @@
+from pathlib import Path
+
+import pytest
+
+from seagoat.repository import Repository
+from seagoat.sources.ripgrep import initialize
+
+
+@pytest.fixture(name="initialize_empty_source")
+def _initialize_empty_source(repo):
+    """Return a factory that builds the ripgrep cache and returns its fetch."""
+    # Remove any pre-populated supported files so the ripgrep cache ends up empty
+    for filename in ["file1.md", "file2.py", "file3.py", "file4.js", "file4.md"]:
+        p = Path(repo.working_dir) / filename
+        try:
+            p.unlink()
+        except FileNotFoundError:
+            pass
+
+    # Optionally add an unsupported file to ensure repository contains files but none supported
+    unsupported = Path(repo.working_dir) / "rock.mp3"
+    unsupported.write_text("12345", encoding="utf-8")
+
+    def _init():
+        my_repo = Repository(repo.working_dir)
+        my_repo.analyze_files()
+        source = initialize(my_repo)
+        # This should build an empty cache file and must not raise due to mmap on empty files
+        source["cache_repo"]()
+        return source["fetch"]
+
+    return _init
+
+
+def test_mmap_empty_file_issue_945(initialize_empty_source):
+    """Fetching from an empty cache must not raise and must yield nothing."""
+    fetch = initialize_empty_source()
+    # Should not raise and should return no results for empty input cache
+    results = list(fetch("anything", limit=10))
+    assert results == []
+
+
+def test_non_empty_mapping_unchanged(repo):
+    """A repository with one matching file must still return that file."""
+    contents = """
+hello foo bar baz
+hello foo bar baz 23
+"""
+    repo.add_file_change_commit(
+        file_name="sample.txt",
+        contents=contents,
+        author=repo.actors["John Doe"],
+        commit_message="Add sample text",
+    )
+
+    my_repo = Repository(repo.working_dir)
+    my_repo.analyze_files()
+    source = initialize(my_repo)
+    source["cache_repo"]()
+    fetch = source["fetch"]
+
+    fetched_results = list(fetch("baz|23", limit=100))
+    # Behavior for non-empty files should remain intact
+    assert len(fetched_results) == 1
+    file = fetched_results[0]
+    assert file.gitfile.path == "sample.txt"
+    assert set(file.lines) != set()  # we should have some matching lines