From b383adae880d70ad906caed507c4700aeb81ac59 Mon Sep 17 00:00:00 2001 From: ishaanxgupta <124028055+ishaanxgupta@users.noreply.github.com> Date: Thu, 19 Mar 2026 16:18:19 +0000 Subject: [PATCH] =?UTF-8?q?=E2=9A=A1=20Bolt:=20Batch=20multi-repository=20?= =?UTF-8?q?searches=20in=20CodeRetrievalPipeline?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Refactored `_search_symbols` and `_search_files` to execute concurrent requests via `asyncio.gather` when querying across multiple repositories. Also fixed a lint error regarding an unused import (`directories_namespace`). --- .jules/bolt.md | 4 ++++ src/pipelines/code_retrieval.py | 37 +++++++++++++++++++++++++-------- 2 files changed, 32 insertions(+), 9 deletions(-) create mode 100644 .jules/bolt.md diff --git a/.jules/bolt.md b/.jules/bolt.md new file mode 100644 index 0000000..dde1e4b --- /dev/null +++ b/.jules/bolt.md @@ -0,0 +1,4 @@ + +## 2024-05-20 - Batch multi-repository searches in pipelines +**Learning:** In pipelines performing multi-repository operations (like `CodeRetrievalPipeline`), awaiting an asynchronous call (e.g., `await self._search_namespace`) once per configured repository inside a sequential loop serializes the requests, so total latency is the sum of the per-repository latencies and grows linearly with the number of attached repositories. +**Action:** When the same asynchronous operation must run against multiple repositories, issue the calls concurrently with `asyncio.gather(*tasks)` over the async helper and flatten the results, as done in `src/pipelines/code_retrieval.py` for `_search_symbols` and `_search_files`. Note that this still issues one request per repository; it reduces wall-clock latency, not the number of queries. 
diff --git a/src/pipelines/code_retrieval.py b/src/pipelines/code_retrieval.py index 69e4abc..e2511df 100644 --- a/src/pipelines/code_retrieval.py +++ b/src/pipelines/code_retrieval.py @@ -25,6 +25,7 @@ from __future__ import annotations +import asyncio import logging from typing import Any, Callable, Dict, List, Optional @@ -37,7 +38,6 @@ from src.scanner.code_store import CodeStore from src.schemas.code import ( annotations_namespace, - directories_namespace, files_namespace, snippets_namespace, symbols_namespace, @@ -589,14 +589,24 @@ async def _search_symbols( ) -> List[SourceRecord]: if not repo: logger.warning("search_symbols called without repo — searching all repos") - results = [] - for r in self.repos: - results.extend(await self._search_namespace( + + # Fetch from all repos concurrently + tasks = [ + self._search_namespace( namespace=symbols_namespace(self.org_id, r), query=query, domain="symbol", top_k=top_k, - )) + ) + for r in self.repos + ] + results_list = await asyncio.gather(*tasks) + + # Flatten results + results = [] + for res in results_list: + results.extend(res) + return results[:top_k] return await self._search_namespace( @@ -612,14 +622,23 @@ async def _search_files( self, query: str, repo: str, top_k: int = 10, ) -> List[SourceRecord]: if not repo: - results = [] - for r in self.repos: - results.extend(await self._search_namespace( + # Fetch from all repos concurrently + tasks = [ + self._search_namespace( namespace=files_namespace(self.org_id, r), query=query, domain="file", top_k=top_k, - )) + ) + for r in self.repos + ] + results_list = await asyncio.gather(*tasks) + + # Flatten results + results = [] + for res in results_list: + results.extend(res) + return results[:top_k] return await self._search_namespace(