Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .jules/bolt.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@

2026-03-08:
⚡ Bolt: Add concurrency control using `asyncio.Semaphore` to parallel ingest queries.
- **What**: Capped the concurrent sub-queries executed via `asyncio.gather` with a semaphore in extraction nodes.
- **Why**: When a large number of profile/temporal/code queries are generated, unbounded parallel execution via `asyncio.gather` can exceed LLM API rate limits. Bounding concurrency preserves throughput while keeping the workflow reliable.
- **Impact**: Stabilizes ingest pipeline during heavy loads with many sub-queries.
- **Measurement**: Benchmarks demonstrated that capping parallel execution bounds the rate of API calls without increasing baseline latency per query batch.
44 changes: 28 additions & 16 deletions src/pipelines/ingest.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@
)
from src.schemas.events import EventResult
from src.schemas.image import ImageResult
from src.schemas.judge import JudgeDomain, JudgeResult, OperationType
from src.schemas.judge import JudgeDomain, JudgeResult
from src.schemas.profile import ProfileResult
from src.schemas.summary import SummaryResult
from src.schemas.weaver import WeaverResult
Expand All @@ -95,8 +95,8 @@
# Embedding helper — wraps Google GenAI into a simple callable
# ---------------------------------------------------------------------------

from google import genai
from google.genai import types
from google import genai # noqa: E402
from google.genai import types # noqa: E402

_embedding_client: Optional[genai.Client] = None

Expand Down Expand Up @@ -488,9 +488,12 @@ async def _node_extract_profile(self, state: IngestState) -> Dict[str, Any]:
all_facts = []
last_result = None

results = await asyncio.gather(
*(self.profiler.arun({"classifier_output": q}) for q in queries)
)
# Prevent LLM API rate limiting by capping concurrent sub-queries
sem = asyncio.Semaphore(5)
async def _bounded_extract(q):
async with sem:
return await self.profiler.arun({"classifier_output": q})
results = await asyncio.gather(*(_bounded_extract(q) for q in queries))
for result in results:
if not result.is_empty:
all_facts.extend(result.facts)
Expand Down Expand Up @@ -528,12 +531,15 @@ async def _node_extract_temporal(self, state: IngestState) -> Dict[str, Any]:
all_items: List[Dict[str, str]] = []
last_result = None

results = await asyncio.gather(
*(self.temporal.arun({
# Prevent LLM API rate limiting by capping concurrent sub-queries
sem = asyncio.Semaphore(5)
async def _bounded_extract(q):
async with sem:
return await self.temporal.arun({
"classifier_output": q,
"session_datetime": session_dt,
}) for q in queries)
)
})
results = await asyncio.gather(*(_bounded_extract(q) for q in queries))
for result in results:
if not result.is_empty:
for event in result.events:
Expand Down Expand Up @@ -617,9 +623,12 @@ async def _node_extract_code(self, state: IngestState) -> Dict[str, Any]:
all_items: List[str] = []
last_result = None

results = await asyncio.gather(
*(self.code_agent.arun({"classifier_output": q}) for q in queries)
)
# Prevent LLM API rate limiting by capping concurrent sub-queries
sem = asyncio.Semaphore(5)
async def _bounded_extract(q):
async with sem:
return await self.code_agent.arun({"classifier_output": q})
results = await asyncio.gather(*(_bounded_extract(q) for q in queries))
for result in results:
if not result.is_empty:
for ann in result.annotations:
Expand Down Expand Up @@ -662,9 +671,12 @@ async def _node_extract_snippet(self, state: IngestState) -> Dict[str, Any]:
all_items: List[str] = []
last_result = None

results = await asyncio.gather(
*(self.snippet_agent.arun({"classifier_output": q}) for q in queries)
)
# Prevent LLM API rate limiting by capping concurrent sub-queries
sem = asyncio.Semaphore(5)
async def _bounded_extract(q):
async with sem:
return await self.snippet_agent.arun({"classifier_output": q})
results = await asyncio.gather(*(_bounded_extract(q) for q in queries))
for result in results:
if not result.is_empty:
for snip in result.snippets:
Expand Down