Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
36 changes: 34 additions & 2 deletions seagoat/sources/ripgrep.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,20 @@ def rebuild(self):
with open(self.file_path, encoding="utf-8") as cache_file:
self._data = cache_file.read()
else:
# On POSIX systems, mmap() cannot map a zero-length file.
# If the cache file is empty, avoid mmap and use an empty bytes buffer instead.
# See Issue #945 for details.
with open(self.file_path, "r+b") as cache_file:
self._data = mmap.mmap(cache_file.fileno(), 0)
try:
cache_file.seek(0, 2) # move to end to determine size
size = cache_file.tell()
cache_file.seek(0)
except Exception:
size = 0
if size == 0:
self._data = b""
else:
self._data = mmap.mmap(cache_file.fileno(), 0)

def encode(self, *args, **kwargs):  # type: ignore
    """Return the stored ``_data`` buffer unchanged.

    Positional and keyword arguments are accepted but ignored, so the call
    has the same shape as ``str.encode`` without performing any encoding.
    """
    return self._data
Expand Down Expand Up @@ -128,8 +140,28 @@ def _fetch(query_text: str, path: str, limit: int, cache: RipGrepCache):
]

try:
# Determine safe text input for subprocess
try:
raw = cache.encode("utf-8") # RipGrepCache overrides encode()
except Exception:
raw = cache.as_input()

if isinstance(raw, (bytes, bytearray)):
cache_input_text = raw.decode("utf-8", errors="ignore")
elif hasattr(raw, "read"):
# Likely an mmap object
try:
b = bytes(raw)
except Exception:
b = b""
cache_input_text = b.decode("utf-8", errors="ignore")
elif isinstance(raw, str):
cache_input_text = raw
else:
cache_input_text = str(raw)

rg_output = subprocess.check_output(
cmd, encoding="utf-8", input=cache.as_input()
cmd, encoding="utf-8", input=cache_input_text
)
except subprocess.CalledProcessError as exception:
rg_output = exception.output
Expand Down
65 changes: 65 additions & 0 deletions tests/test_issue_945.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,65 @@
from pathlib import Path

import pytest

from seagoat.repository import Repository
from seagoat.sources.ripgrep import initialize


@pytest.fixture(name="initialize_empty_source")
def _initialize_empty_source(repo):
    """Provide a factory that initializes the ripgrep source on a repo with no supported files.

    The fixture deletes a fixed list of files from the repository working
    directory and writes a single ``rock.mp3`` file, then returns a callable.
    Calling it analyzes the repository, initializes the source, runs its
    ``cache_repo`` step and returns the source's ``fetch`` function.
    """
    working_dir = Path(repo.working_dir)

    # Remove any pre-populated supported files so the ripgrep cache ends up empty.
    for filename in ["file1.md", "file2.py", "file3.py", "file4.js", "file4.md"]:
        try:
            (working_dir / filename).unlink()
        except FileNotFoundError:
            # The file was never there (or is already gone); nothing to remove.
            pass

    # Add an unsupported file so the repository contains files, but none supported.
    (working_dir / "rock.mp3").write_text("12345", encoding="utf-8")

    def _init():
        my_repo = Repository(repo.working_dir)
        my_repo.analyze_files()
        source = initialize(my_repo)
        # This should build an empty cache file and must not raise due to mmap on empty files
        source["cache_repo"]()
        return source["fetch"]

    return _init


def test_mmap_empty_file_issue_945(initialize_empty_source):
    """Fetching through a source set up by ``initialize_empty_source`` yields nothing.

    Both the initialization and the fetch must complete without raising.
    """
    fetch_from_empty_cache = initialize_empty_source()

    # Should not raise and should return no results for empty input cache
    assert list(fetch_from_empty_cache("anything", limit=10)) == []


def test_non_empty_mapping_unchanged(repo):
    """Commit a small text file and check that fetching finds exactly that file.

    After ``sample.txt`` is committed and the source's ``cache_repo`` step has
    run, the query ``baz|23`` must produce a single result for ``sample.txt``
    with at least one matching line.
    """
    sample_text = (
        "\n"
        "hello foo bar baz\n"
        "hello foo bar baz 23\n"
    )
    repo.add_file_change_commit(
        file_name="sample.txt",
        contents=sample_text,
        author=repo.actors["John Doe"],
        commit_message="Add sample text",
    )

    analyzed_repo = Repository(repo.working_dir)
    analyzed_repo.analyze_files()
    ripgrep_source = initialize(analyzed_repo)
    ripgrep_source["cache_repo"]()

    matches = list(ripgrep_source["fetch"]("baz|23", limit=100))

    # Behavior for non-empty files should remain intact
    assert len(matches) == 1
    matched_file = matches[0]
    assert matched_file.gitfile.path == "sample.txt"
    # We should have some matching lines.
    assert set(matched_file.lines) != set()