diff --git a/docs/configuration.md b/docs/configuration.md index c3ef3d85..77232f14 100644 --- a/docs/configuration.md +++ b/docs/configuration.md @@ -26,7 +26,7 @@ This is an example of a configuration file: server: port: 31134 # A port number to run the server on # Increase number of commits used for computing frecency score - # Default is `1000`, set to `null` to read all history + # Default is `5000`, set to `null` to read all history readMaxCommits: 5000 # globs to ignore in addition to .gitignore @@ -34,8 +34,35 @@ server: - "**/locales/*" # Ignore all files inside 'locales' directories - "**/*.po" # Ignore all gettext translation files + # ChromaDB vector search configuration + chroma: + embeddingFunction: + name: "DefaultEmbeddingFunction" + arguments: {} + maxVectorDistance: 1.5 # Maximum vector distance for results (0.1-10.0) + maxChunksToFetch: 100 # Maximum chunks to fetch from vector DB (10-1000) + nResultsMultiplier: 2 # Multiplier for over-fetching results (1.0-10.0) + + # Ripgrep text search configuration + ripgrep: + maxFileSize: 200 # Maximum file size to cache in KB (1KB-10MB) + maxMmapSize: 500 # Maximum memory-mapped cache size in MB (10-10000 MB) + + # Engine processing configuration + engine: + minChunksToAnalyze: + minValue: 40 # Minimum chunks to analyze (1-1000) + percentage: 0.2 # Fraction of total chunks to analyze, 0.2 = 20% (0.01-1.0) + maxWorkers: 1 # Maximum worker threads (1-32) + + # Query defaults + query: + defaultLimitClue: 500 # Default result limit (10-10000) + defaultContextAbove: 3 # Default context lines above results (0-50) + defaultContextBelow: 3 # Default context lines below results (0-50) + client: - # Connect the CLI to a remove server + # Connect the CLI to a remote server host: https://example.com/seagoat-instance/ ``` @@ -61,18 +88,42 @@ you wish to keep in git, but you wish to hide from SeaGOAT. * `name`: Name of the embedding function to use. 
See [ChromaDB's docs for more](https://docs.trychroma.com/embeddings) * `arguments`: Arguments to pass to the embedding function. + * `maxVectorDistance`: Maximum vector distance for results (default: 1.5, range: 0.1-10.0) + * `maxChunksToFetch`: Maximum chunks to fetch from vector database (default: 100, range: 10-1000) + * `nResultsMultiplier`: Multiplier for over-fetching results (default: 2, range: 1.0-10.0) * If you wanted to use the `ONNXMiniLM_L6_V2` embedding model with TensorRT ```yaml server: ... chroma: - embedding_function: + embeddingFunction: name: "ONNXMiniLM_L6_V2" arguments: preferred_providers: ["TensorrtExecutionProvider"] + maxVectorDistance: 1.2 + maxChunksToFetch: 150 + nResultsMultiplier: 2.5 ``` +* `ripgrep`: Configurations for the ripgrep text search engine. + Has the following attributes: + * `maxFileSize`: Maximum file size to cache in KB (default: 200, range: 1-10240 KB) + * `maxMmapSize`: Maximum memory-mapped cache size in MB (default: 500, range: 10-10000 MB) + +* `engine`: Configurations for the search engine processing. + Has the following attributes: + * `minChunksToAnalyze`: + * `minValue`: Minimum number of chunks to analyze (default: 40, range: 1-1000) + * `percentage`: Fraction of total chunks to analyze, where 0.2 means 20% (default: 0.2, range: 0.01-1.0) + * `maxWorkers`: Maximum worker threads for parallel processing (default: 1, range: 1-32) + +* `query`: Default values for query parameters. + Has the following attributes: + * `defaultLimitClue`: Default result limit (default: 500, range: 10-10000) + * `defaultContextAbove`: Default context lines above results (default: 3, range: 0-50) + * `defaultContextBelow`: Default context lines below results (default: 3, range: 0-50) + ### Client Configuration for the CLI (`gt` command) resides under the `client` attribute. @@ -83,3 +134,79 @@ The following values can be configured: needed when you are hosting your SeaGOAT server on a remote computer. 
*It is recommended to set this value in your project configuration file, so that you are still able to use the local server for different projects.* + +## Performance Tuning + +### For Large Repositories + +For repositories with many files or large codebases, consider these optimizations: + +```yaml +server: + # Increase file size limit to include larger files + ripgrep: + maxFileSize: 1024 # 1MB instead of 200KB + maxMmapSize: 2000 # 2GB instead of 500MB + + # Increase processing capacity + engine: + maxWorkers: 4 # Use more CPU cores + minChunksToAnalyze: + minValue: 100 # Analyze more chunks initially + percentage: 0.3 # Analyze 30% of chunks instead of 20% + + # Optimize vector search + chroma: + maxChunksToFetch: 200 # Fetch more results for better recall + nResultsMultiplier: 3 # Over-fetch more aggressively +``` + +### For Faster Initial Analysis + +To speed up the initial codebase analysis: + +```yaml +server: + engine: + minChunksToAnalyze: + minValue: 20 # Lower minimum for faster startup + percentage: 0.1 # Analyze fewer chunks initially +``` + +### For Better Search Quality + +To improve search result quality at the cost of some performance: + +```yaml +server: + chroma: + maxVectorDistance: 2.0 # Accept more distant matches + maxChunksToFetch: 300 # Fetch more candidates + nResultsMultiplier: 4 # Over-fetch more results + + query: + defaultLimitClue: 1000 # Return more results by default + defaultContextAbove: 5 # More context lines + defaultContextBelow: 5 +``` + +### For Memory-Constrained Systems + +To reduce memory usage: + +```yaml +server: + ripgrep: + maxFileSize: 100 # 100KB instead of 200KB + maxMmapSize: 200 # 200MB instead of 500MB + + engine: + maxWorkers: 1 # Single-threaded processing + minChunksToAnalyze: + minValue: 20 # Analyze fewer chunks + percentage: 0.1 + + chroma: + maxChunksToFetch: 50 # Fetch fewer results + nResultsMultiplier: 1.5 +``` diff --git a/seagoat/engine.py b/seagoat/engine.py index b06f618c..68031651 100644 --- 
a/seagoat/engine.py +++ b/seagoat/engine.py @@ -105,8 +105,10 @@ def _create_vector_embeddings(self, minimum_chunks_to_analyze=None): self.cache.data["chunks_not_yet_analyzed"].add(chunk.chunk_id) if minimum_chunks_to_analyze is None: + min_value = self.config["server"]["engine"]["minChunksToAnalyze"]["minValue"] + percentage = self.config["server"]["engine"]["minChunksToAnalyze"]["percentage"] minimum_chunks_to_analyze = min( - max(40, int(len(chunks_to_process) * 0.2)), + max(min_value, int(len(chunks_to_process) * percentage)), len(chunks_to_process), ) @@ -129,7 +131,8 @@ async def query(self, query: str, limit_clue=50, context_above=0, context_below= """ self._results = [] - executor = ThreadPoolExecutor(max_workers=1) + max_workers = self.config["server"]["engine"]["maxWorkers"] + executor = ThreadPoolExecutor(max_workers=max_workers) loop = asyncio.get_event_loop() async_tasks = [ loop.run_in_executor( diff --git a/seagoat/server.py b/seagoat/server.py index ef922d0a..99744951 100644 --- a/seagoat/server.py +++ b/seagoat/server.py @@ -43,6 +43,9 @@ def create_app(repo_path): app = Flask(__name__) app.config["PROPAGATE_EXCEPTIONS"] = True app.debug = True + + config = get_config_values(Path(repo_path)) + app.config["seagoat_config"] = config app.extensions["task_queue"] = TaskQueue( repo_path=repo_path, minimum_chunks_to_analyze=0 @@ -65,10 +68,14 @@ def execute_query(limit_clue, **kwargs) -> bytes: @app.route("/lines/query", methods=["POST"]) def query_lines(): data = request.json + config = current_app.config["seagoat_config"] query = get_fallback_value(data, "queryText", "") - limit_clue = int(get_fallback_value(data, "limitClue", "500")) - context_above = int(get_fallback_value(data, "contextAbove", 3)) - context_below = int(get_fallback_value(data, "contextBelow", 3)) + default_limit_clue = str(config["server"]["query"]["defaultLimitClue"]) + default_context_above = str(config["server"]["query"]["defaultContextAbove"]) + default_context_below = 
str(config["server"]["query"]["defaultContextBelow"]) + limit_clue = int(get_fallback_value(data, "limitClue", default_limit_clue)) + context_above = int(get_fallback_value(data, "contextAbove", default_context_above)) + context_below = int(get_fallback_value(data, "contextBelow", default_context_below)) return execute_query( query=query, @@ -80,8 +87,10 @@ def query_lines(): @app.route("/files/query", methods=["POST"]) def query_files(): data = request.json + config = current_app.config["seagoat_config"] query = get_fallback_value(data, "queryText", "") - limit_clue = int(get_fallback_value(data, "limitClue", "500")) + default_limit_clue = str(config["server"]["query"]["defaultLimitClue"]) + limit_clue = int(get_fallback_value(data, "limitClue", default_limit_clue)) result = execute_query( query=query, context_above=0, context_below=0, limit_clue=limit_clue diff --git a/seagoat/sources/chroma.py b/seagoat/sources/chroma.py index e66fc864..5f88f1dd 100644 --- a/seagoat/sources/chroma.py +++ b/seagoat/sources/chroma.py @@ -10,8 +10,6 @@ from seagoat.result import Result from seagoat.utils.config import get_config_values -MAXIMUM_VECTOR_DISTANCE = 1.5 - def get_metadata_and_distance_from_chromadb_result(chromadb_results): return ( @@ -26,13 +24,14 @@ def get_metadata_and_distance_from_chromadb_result(chromadb_results): ) or [] -def format_results(query_text: str, repository, chromadb_results): +def format_results(query_text: str, repository, chromadb_results, config): files = {} + max_vector_distance = config["server"]["chroma"]["maxVectorDistance"] for metadata, distance in get_metadata_and_distance_from_chromadb_result( chromadb_results ): - if distance > MAXIMUM_VECTOR_DISTANCE: + if distance > max_vector_distance: break path = str(metadata["path"]) line = int(metadata["line"]) @@ -77,15 +76,16 @@ def initialize(repository: Repository): def fetch(query_text: str, limit: int): # Slightly overfetch results as it will sorted using a different score later - 
maximum_chunks_to_fetch = 100 # this should be plenty, especially because many times context could be included - n_results = min((limit + 1) * 2, maximum_chunks_to_fetch) + max_chunks_to_fetch = config["server"]["chroma"]["maxChunksToFetch"] + n_results_multiplier = config["server"]["chroma"]["nResultsMultiplier"] + n_results = min((limit + 1) * n_results_multiplier, max_chunks_to_fetch) chromadb_results = chroma_collection.query( query_texts=[query_text], n_results=n_results, ) - return format_results(query_text, repository, chromadb_results) + return format_results(query_text, repository, chromadb_results, config) def cache_chunk(chunk): try: diff --git a/seagoat/sources/ripgrep.py b/seagoat/sources/ripgrep.py index 93936c71..a30c5bee 100644 --- a/seagoat/sources/ripgrep.py +++ b/seagoat/sources/ripgrep.py @@ -11,7 +11,7 @@ from seagoat.cache import Cache from seagoat.repository import Repository from seagoat.result import Result -from seagoat.sources.chroma import MAXIMUM_VECTOR_DISTANCE +from seagoat.utils.config import get_config_values from seagoat.utils.file_reader import read_file_with_correct_encoding from seagoat.utils.file_types import is_file_type_supported @@ -19,7 +19,6 @@ MEGABYTE = KILOBYTE * 1024 MAX_MMAP_SIZE = 500 MAX_MMAP_SIZE_BYTES = MAX_MMAP_SIZE * MEGABYTE -MAX_FILE_SIZE = 200 * KILOBYTE STOP_WORDS = set(get_stop_words("english")) @@ -32,16 +31,19 @@ def __init__(self, repository: Repository): self.file_path = tempfile.mktemp() self.is_initialized = False self._data = "" + self.config = get_config_values(Path(repository.path)) def _iterate_files_to_cache(self): for file, _ in self.repository.top_files(): yield file def _iterate_lines_to_cache(self): + max_file_size_kb = self.config["server"]["ripgrep"]["maxFileSize"] + max_file_size_bytes = max_file_size_kb * KILOBYTE for file in self._iterate_files_to_cache(): file_contents = read_file_with_correct_encoding(file.absolute_path) - if len(file_contents) > MAX_FILE_SIZE: + if len(file_contents) 
> max_file_size_bytes: logging.warning("Warning: file %s is too large to cache", file.path) continue @@ -55,6 +57,8 @@ def _generate_cache_lines(self): def _build_cache_file(self): total_estimated_cache_size = 0 line_count = 0 + max_mmap_size = self.config["server"]["ripgrep"]["maxMmapSize"] + max_mmap_size_bytes = max_mmap_size * MEGABYTE with open(self.file_path, "w", encoding="utf-8") as cache_file: for formattted_cache_line in self._generate_cache_lines(): @@ -62,10 +66,10 @@ def _build_cache_file(self): total_estimated_cache_size += len(formattted_cache_line) line_count += 1 - if total_estimated_cache_size > MAX_MMAP_SIZE_BYTES: + if total_estimated_cache_size > max_mmap_size_bytes: logging.warning( "Warning: maximum estimated ripgrep cache size of %s megabytes exceeded", - MAX_MMAP_SIZE, + max_mmap_size, ) break @@ -111,6 +115,7 @@ def cache_repo(): memory_cache.rebuild() def _fetch(query_text: str, path: str, limit: int, cache: RipGrepCache): + config = get_config_values(Path(repository.path)) query_text_without_stopwords = " ".join( query for query in query_text.split(" ") if query not in STOP_WORDS ) @@ -145,7 +150,8 @@ def _fetch(query_text: str, path: str, limit: int, cache: RipGrepCache): files[relative_path] = Result(query_text, gitfile) # This is so that ripgrep results are on comparable levels with chroma results - files[relative_path].add_line(line_number, MAXIMUM_VECTOR_DISTANCE * 0.8) + max_vector_distance = config["server"]["chroma"]["maxVectorDistance"] + files[relative_path].add_line(line_number, max_vector_distance * 0.8) return files.values() diff --git a/seagoat/utils/config.py b/seagoat/utils/config.py index 01a5ef08..9cac5e3c 100644 --- a/seagoat/utils/config.py +++ b/seagoat/utils/config.py @@ -19,6 +19,25 @@ "name": "DefaultEmbeddingFunction", "arguments": {}, }, + "maxVectorDistance": 1.5, + "maxChunksToFetch": 100, + "nResultsMultiplier": 2, + }, + "ripgrep": { + "maxFileSize": 200, # 200 KB + "maxMmapSize": 500, # 500 MB + }, + 
"engine": { + "minChunksToAnalyze": { + "minValue": 40, + "percentage": 0.2, + }, + "maxWorkers": 1, + }, + "query": { + "defaultLimitClue": 500, + "defaultContextAbove": 3, + "defaultContextBelow": 3, }, }, "client": { @@ -57,6 +76,41 @@ "arguments": {"type": "object"}, }, }, + "maxVectorDistance": {"type": "number", "minimum": 0.1, "maximum": 10.0}, + "maxChunksToFetch": {"type": "integer", "minimum": 10, "maximum": 1000}, + "nResultsMultiplier": {"type": "number", "minimum": 1.0, "maximum": 10.0}, + }, + }, + "ripgrep": { + "type": "object", + "additionalProperties": False, + "properties": { + "maxFileSize": {"type": "integer", "minimum": 1, "maximum": 10240}, # 1KB to 10MB + "maxMmapSize": {"type": "integer", "minimum": 10, "maximum": 10000}, # 10MB to 10GB + }, + }, + "engine": { + "type": "object", + "additionalProperties": False, + "properties": { + "minChunksToAnalyze": { + "type": "object", + "additionalProperties": False, + "properties": { + "minValue": {"type": "integer", "minimum": 1, "maximum": 1000}, + "percentage": {"type": "number", "minimum": 0.01, "maximum": 1.0}, + }, + }, + "maxWorkers": {"type": "integer", "minimum": 1, "maximum": 32}, + }, + }, + "query": { + "type": "object", + "additionalProperties": False, + "properties": { + "defaultLimitClue": {"type": "integer", "minimum": 10, "maximum": 10000}, + "defaultContextAbove": {"type": "integer", "minimum": 0, "maximum": 50}, + "defaultContextBelow": {"type": "integer", "minimum": 0, "maximum": 50}, }, }, },