kantord · vishesh92 · Oct 12, 2025
diff --git a/docs/configuration.md b/docs/configuration.md
@@ -26,16 +26,43 @@ This is an example of a configuration file:
 server:
   port: 31134  # A port number to run the server on
   # Increase number of commits used for computing frecency score
-  # Default is `1000`, set to `null` to read all history
+  # Default is `5000`, set to `null` to read all history
   readMaxCommits: 5000
 
   # globs to ignore in addition to .gitignore
   ignorePatterns:
     - "**/locales/*" # Ignore all files inside 'locales' directories
     - "**/*.po"     # Ignore all gettext translation files
 
+  # ChromaDB vector search configuration
+  chroma:
+    embeddingFunction:
+      name: "DefaultEmbeddingFunction"
+      arguments: {}
+    maxVectorDistance: 1.5  # Maximum vector distance for results (0.1-10.0)
+    maxChunksToFetch: 100   # Maximum chunks to fetch from vector DB (10-1000)
+    nResultsMultiplier: 2   # Multiplier for over-fetching results (1.0-10.0)
+
+  # Ripgrep text search configuration
+  ripgrep:
+    maxFileSize: 200        # Maximum file size to cache in KB (1KB-10MB)
+    maxMmapSize: 500        # Maximum memory-mapped cache size in MB (10MB-10GB)
+
+  # Engine processing configuration
+  engine:
+    minChunksToAnalyze:
+      minValue: 40          # Minimum chunks to analyze (1-1000)
+      percentage: 0.2       # Fraction of total chunks to analyze (0.01-1.0; 0.2 = 20%)
+    maxWorkers: 1           # Maximum worker threads (1-32)
+
+  # Query defaults
+  query:
+    defaultLimitClue: 500   # Default result limit (10-10000)
+    defaultContextAbove: 3  # Default context lines above results (0-50)
+    defaultContextBelow: 3  # Default context lines below results (0-50)
+
 client:
-  # Connect the CLI to a remove server
+  # Connect the CLI to a remote server
   host: https://example.com/seagoat-instance/
 
 ```
@@ -61,18 +88,42 @@ you wish to keep in git, but you wish to hide from SeaGOAT.
     * `name`: Name of the embedding function to use.
     See [ChromaDB's docs for more](https://docs.trychroma.com/embeddings)
     * `arguments`: Arguments to pass to the embedding function.
+  * `maxVectorDistance`: Maximum vector distance a match may have to be included in the results; more distant matches are dropped (default: 1.5, range: 0.1-10.0)
+  * `maxChunksToFetch`: Maximum chunks to fetch from vector database (default: 100, range: 10-1000)
+  * `nResultsMultiplier`: Multiplier applied to the requested result limit to over-fetch from the vector database, capped by `maxChunksToFetch` (default: 2.0, range: 1.0-10.0)
       * If you wanted to use the `ONNXMiniLM_L6_V2` embedding model with TensorRT
 
         ```yaml
         server:
         ...
         chroma:
-          embedding_function:
+          embeddingFunction:
             name: "ONNXMiniLM_L6_V2"
             arguments:
               preferred_providers: ["TensorrtExecutionProvider"]
+          maxVectorDistance: 1.2
+          maxChunksToFetch: 150
+          nResultsMultiplier: 2.5
         ```
 
+* `ripgrep`: Configurations for the ripgrep text search engine.
+  Has the following attributes:
+  * `maxFileSize`: Maximum file size to cache in KB; larger files are skipped with a warning (default: 200, range: 1-10240 KB)
+  * `maxMmapSize`: Maximum memory-mapped cache size in MB; cache building stops once the estimated size exceeds it (default: 500, range: 10MB-10GB)
+
+* `engine`: Configurations for the search engine processing.
+  Has the following attributes:
+  * `minChunksToAnalyze`: The larger of the two values below is used, capped at the total number of chunks:
+    * `minValue`: Minimum number of chunks to analyze (default: 40, range: 1-1000)
+    * `percentage`: Fraction of total chunks to analyze, written as a decimal (default: 0.2, i.e. 20%; range: 0.01-1.0)
+  * `maxWorkers`: Maximum worker threads for parallel processing (default: 1, range: 1-32)
+
+* `query`: Default values for query parameters.
+  Has the following attributes:
+  * `defaultLimitClue`: Default result limit (default: 500, range: 10-10000)
+  * `defaultContextAbove`: Default context lines above results (default: 3, range: 0-50)
+  * `defaultContextBelow`: Default context lines below results (default: 3, range: 0-50)
+
 ### Client
 
 Configuration for the CLI (`gt` command) resides under the `client` attribute.
@@ -83,3 +134,79 @@ The following values can be configured:
 needed when you are hosting your SeaGOAT server on a remote computer. *It is
 recommended to set this value in your project configuration file, so that
 you are still able to use the local server for different projects.*
+
+## Performance Tuning
+
+### For Large Repositories
+
+For repositories with many files or large codebases, consider these optimizations:
+
+```yaml
+server:
+  # Increase file size limit to include larger files
+  ripgrep:
+    maxFileSize: 1024     # 1MB instead of 200KB
+    maxMmapSize: 2000     # 2GB instead of 500MB
+
+  # Increase processing capacity
+  engine:
+    maxWorkers: 4         # Use more CPU cores
+    minChunksToAnalyze:
+      minValue: 100       # Analyze more chunks initially
+      percentage: 0.3     # Analyze 30% of chunks instead of 20%
+
+  # Optimize vector search
+  chroma:
+    maxChunksToFetch: 200 # Fetch more results for better recall
+    nResultsMultiplier: 3 # Over-fetch more aggressively
+```
+
+### For Faster Initial Analysis
+
+To speed up the initial codebase analysis:
+
+```yaml
+server:
+  engine:
+    minChunksToAnalyze:
+      minValue: 20        # Lower minimum for faster startup
+      percentage: 0.1     # Analyze fewer chunks initially
+```
+
+### For Better Search Quality
+
+To improve search result quality at the cost of some performance:
+
+```yaml
+server:
+  chroma:
+    maxVectorDistance: 2.0    # Accept more distant matches
+    maxChunksToFetch: 300     # Fetch more candidates
+    nResultsMultiplier: 4     # Over-fetch more results
+
+  query:
+    defaultLimitClue: 1000    # Return more results by default
+    defaultContextAbove: 5    # More context lines
+    defaultContextBelow: 5
+```
+
+### For Memory-Constrained Systems
+
+To reduce memory usage:
+
+```yaml
+server:
+  ripgrep:
+    maxFileSize: 100      # 100KB instead of 200KB
+    maxMmapSize: 200      # 200MB instead of 500MB
+
+  engine:
+    maxWorkers: 1         # Single-threaded processing
+    minChunksToAnalyze:
+      minValue: 20        # Analyze fewer chunks
+      percentage: 0.1
+
+  chroma:
+    maxChunksToFetch: 50  # Fetch fewer results
+    nResultsMultiplier: 1.5
+```
diff --git a/seagoat/engine.py b/seagoat/engine.py
@@ -105,8 +105,10 @@ def _create_vector_embeddings(self, minimum_chunks_to_analyze=None):
                     self.cache.data["chunks_not_yet_analyzed"].add(chunk.chunk_id)
 
         if minimum_chunks_to_analyze is None:
+            min_value = self.config["server"]["engine"]["minChunksToAnalyze"]["minValue"]
+            percentage = self.config["server"]["engine"]["minChunksToAnalyze"]["percentage"]
             minimum_chunks_to_analyze = min(
-                max(40, int(len(chunks_to_process) * 0.2)),
+                max(min_value, int(len(chunks_to_process) * percentage)),
                 len(chunks_to_process),
             )
 
@@ -129,7 +131,8 @@ async def query(self, query: str, limit_clue=50, context_above=0, context_below=
         """
 
         self._results = []
-        executor = ThreadPoolExecutor(max_workers=1)
+        max_workers = self.config["server"]["engine"]["maxWorkers"]
+        executor = ThreadPoolExecutor(max_workers=max_workers)
         loop = asyncio.get_event_loop()
         async_tasks = [
             loop.run_in_executor(

diff --git a/seagoat/server.py b/seagoat/server.py
@@ -43,6 +43,9 @@ def create_app(repo_path):
     app = Flask(__name__)
     app.config["PROPAGATE_EXCEPTIONS"] = True
     app.debug = True
+
+    config = get_config_values(Path(repo_path))
+    app.config["seagoat_config"] = config
 
     app.extensions["task_queue"] = TaskQueue(
         repo_path=repo_path, minimum_chunks_to_analyze=0
@@ -65,10 +68,14 @@ def execute_query(limit_clue, **kwargs) -> bytes:
     @app.route("/lines/query", methods=["POST"])
     def query_lines():
         data = request.json
+        config = current_app.config["seagoat_config"]
         query = get_fallback_value(data, "queryText", "")
-        limit_clue = int(get_fallback_value(data, "limitClue", "500"))
-        context_above = int(get_fallback_value(data, "contextAbove", 3))
-        context_below = int(get_fallback_value(data, "contextBelow", 3))
+        default_limit_clue = str(config["server"]["query"]["defaultLimitClue"])
+        default_context_above = str(config["server"]["query"]["defaultContextAbove"])
+        default_context_below = str(config["server"]["query"]["defaultContextBelow"])
+        limit_clue = int(get_fallback_value(data, "limitClue", default_limit_clue))
+        context_above = int(get_fallback_value(data, "contextAbove", default_context_above))
+        context_below = int(get_fallback_value(data, "contextBelow", default_context_below))
 
         return execute_query(
             query=query,
@@ -80,8 +87,10 @@ def query_lines():
     @app.route("/files/query", methods=["POST"])
     def query_files():
         data = request.json
+        config = current_app.config["seagoat_config"]
         query = get_fallback_value(data, "queryText", "")
-        limit_clue = int(get_fallback_value(data, "limitClue", "500"))
+        default_limit_clue = str(config["server"]["query"]["defaultLimitClue"])
+        limit_clue = int(get_fallback_value(data, "limitClue", default_limit_clue))
 
         result = execute_query(
             query=query, context_above=0, context_below=0, limit_clue=limit_clue

diff --git a/seagoat/sources/chroma.py b/seagoat/sources/chroma.py
@@ -10,8 +10,6 @@
 from seagoat.result import Result
 from seagoat.utils.config import get_config_values
 
-MAXIMUM_VECTOR_DISTANCE = 1.5
-
 
 def get_metadata_and_distance_from_chromadb_result(chromadb_results):
     return (
@@ -26,13 +24,14 @@ def get_metadata_and_distance_from_chromadb_result(chromadb_results):
     ) or []
 
 
-def format_results(query_text: str, repository, chromadb_results):
+def format_results(query_text: str, repository, chromadb_results, config):
     files = {}
+    max_vector_distance = config["server"]["chroma"]["maxVectorDistance"]
 
     for metadata, distance in get_metadata_and_distance_from_chromadb_result(
         chromadb_results
     ):
-        if distance > MAXIMUM_VECTOR_DISTANCE:
+        if distance > max_vector_distance:
             break
         path = str(metadata["path"])
         line = int(metadata["line"])
@@ -77,15 +76,16 @@ def initialize(repository: Repository):
 
     def fetch(query_text: str, limit: int):
         # Slightly overfetch results as it will sorted using a different score later
-        maximum_chunks_to_fetch = 100  # this should be plenty, especially because many times context could be included
-        n_results = min((limit + 1) * 2, maximum_chunks_to_fetch)
+        max_chunks_to_fetch = config["server"]["chroma"]["maxChunksToFetch"]
+        n_results_multiplier = config["server"]["chroma"]["nResultsMultiplier"]
+        n_results = min((limit + 1) * n_results_multiplier, max_chunks_to_fetch)
 
         chromadb_results = chroma_collection.query(
             query_texts=[query_text],
             n_results=n_results,
         )
 
-        return format_results(query_text, repository, chromadb_results)
+        return format_results(query_text, repository, chromadb_results, config)
 
     def cache_chunk(chunk):
         try:

diff --git a/seagoat/sources/ripgrep.py b/seagoat/sources/ripgrep.py
@@ -11,15 +11,14 @@
 from seagoat.cache import Cache
 from seagoat.repository import Repository
 from seagoat.result import Result
-from seagoat.sources.chroma import MAXIMUM_VECTOR_DISTANCE
+from seagoat.utils.config import get_config_values
 from seagoat.utils.file_reader import read_file_with_correct_encoding
 from seagoat.utils.file_types import is_file_type_supported
 
 KILOBYTE = 1024
 MEGABYTE = KILOBYTE * 1024
 MAX_MMAP_SIZE = 500
 MAX_MMAP_SIZE_BYTES = MAX_MMAP_SIZE * MEGABYTE
-MAX_FILE_SIZE = 200 * KILOBYTE
 STOP_WORDS = set(get_stop_words("english"))
 
 
@@ -32,16 +31,19 @@ def __init__(self, repository: Repository):
             self.file_path = tempfile.mktemp()
         self.is_initialized = False
         self._data = ""
+        self.config = get_config_values(Path(repository.path))
 
     def _iterate_files_to_cache(self):
         for file, _ in self.repository.top_files():
             yield file
 
     def _iterate_lines_to_cache(self):
+        max_file_size_kb = self.config["server"]["ripgrep"]["maxFileSize"]
+        max_file_size_bytes = max_file_size_kb * KILOBYTE
         for file in self._iterate_files_to_cache():
             file_contents = read_file_with_correct_encoding(file.absolute_path)
 
-            if len(file_contents) > MAX_FILE_SIZE:
+            if len(file_contents) > max_file_size_bytes:
                 logging.warning("Warning: file %s is too large to cache", file.path)
                 continue
 
@@ -55,17 +57,19 @@ def _generate_cache_lines(self):
     def _build_cache_file(self):
         total_estimated_cache_size = 0
         line_count = 0
+        max_mmap_size = self.config["server"]["ripgrep"]["maxMmapSize"]
+        max_mmap_size_bytes = max_mmap_size * MEGABYTE
 
         with open(self.file_path, "w", encoding="utf-8") as cache_file:
             for formattted_cache_line in self._generate_cache_lines():
                 cache_file.write(formattted_cache_line)
                 total_estimated_cache_size += len(formattted_cache_line)
                 line_count += 1
 
-                if total_estimated_cache_size > MAX_MMAP_SIZE_BYTES:
+                if total_estimated_cache_size > max_mmap_size_bytes:
                     logging.warning(
                         "Warning: maximum estimated ripgrep cache size of %s megabytes exceeded",
-                        MAX_MMAP_SIZE,
+                        max_mmap_size,
                     )
 
                     break
@@ -111,6 +115,7 @@ def cache_repo():
         memory_cache.rebuild()
 
     def _fetch(query_text: str, path: str, limit: int, cache: RipGrepCache):
+        config = get_config_values(Path(repository.path))
         query_text_without_stopwords = " ".join(
             query for query in query_text.split(" ") if query not in STOP_WORDS
         )
@@ -145,7 +150,8 @@ def _fetch(query_text: str, path: str, limit: int, cache: RipGrepCache):
                 files[relative_path] = Result(query_text, gitfile)
 
             # This is so that ripgrep results are on comparable levels with chroma results
-            files[relative_path].add_line(line_number, MAXIMUM_VECTOR_DISTANCE * 0.8)
+            max_vector_distance = config["server"]["chroma"]["maxVectorDistance"]
+            files[relative_path].add_line(line_number, max_vector_distance * 0.8)
 
         return files.values()