From d264bcd1c338703ef4f551e333b6fe6c0874bf83 Mon Sep 17 00:00:00 2001
From: giveen <ajaxx20020@gmail.com>
Date: Thu, 2 Apr 2026 14:23:41 -0600
Subject: [PATCH 01/10] feat: wire CAI_SUPPORT_MODEL + CAI_SUPPORT_INTERVAL to
 auto-compact the main agent context

Problem
-------
CAI_SUPPORT_MODEL and CAI_SUPPORT_INTERVAL were documented environment
variables that had no runtime implementation.  The support/reasoner agent
was constructed using CAI_SUPPORT_MODEL but was never invoked automatically.
CAI_SUPPORT_INTERVAL existed only in docs/config tables with no scheduler
reading it.  As a result, users with a limited context window (e.g. 32k on a
local llama.cpp setup) had no way to automatically keep the main model's
message_history from overflowing during a long pentest.

Solution
--------
Added an auto-compact scheduler block immediately after turn_count += 1 in
the main single-agent run loop (run_cai_cli).

When both CAI_SUPPORT_MODEL and CAI_SUPPORT_INTERVAL are set, the scheduler:

1. Fires every CAI_SUPPORT_INTERVAL turns (modulo check).
2. Calls COMPACT_COMMAND_INSTANCE._perform_compaction(model_override=
   CAI_SUPPORT_MODEL) which:
   - Sends the full message_history to the support model for summarisation.
   - Clears message_history entirely (hard context reset).
   - Saves the summary to /memory as a .md file.
   - Stores the summary in COMPACTED_SUMMARIES under the agent name.
3. Re-syncs the local agent variable from AGENT_MANAGER.get_active_agent()
   so the run loop continues with the freshly reloaded agent instance whose
   system prompt already contains the injected summary (the system prompt
   template calls get_compacted_summary() dynamically on every turn so no
   extra wiring was needed).
4. Prints a visible yellow/green indicator so users can see when compaction
   fires and confirm the context window has been reset.
5. Silently swallows errors (only logs when CAI_DEBUG=2) so a failing support
   model never crashes the main session.

Usage
-----
  CAI_SUPPORT_MODEL="openai/support"  # lighter model on litellm proxy
  CAI_SUPPORT_INTERVAL=4              # compact every 4 turns
---
 src/cai/cli.py | 35 +++++++++++++++++++++++++++++++++++
 1 file changed, 35 insertions(+)

diff --git a/src/cai/cli.py b/src/cai/cli.py
index 8cacf5258..abe48d903 100644
--- a/src/cai/cli.py
+++ b/src/cai/cli.py
@@ -1711,6 +1711,41 @@ async def process_streamed_response(agent, conversation_input):
                 agent.model.message_history[:] = fix_message_list(agent.model.message_history)
             turn_count += 1
 
+            # Auto-compact: when CAI_SUPPORT_MODEL + CAI_SUPPORT_INTERVAL are both set,
+            # compact the conversation every N turns using the support model so the
+            # main model's context window is kept small.  After compaction the
+            # summary is injected into the agent's system prompt and the local
+            # agent reference is refreshed so the loop uses the reloaded instance.
+            _support_model = os.getenv("CAI_SUPPORT_MODEL")
+            _support_interval_raw = os.getenv("CAI_SUPPORT_INTERVAL")
+            if _support_model and _support_interval_raw:
+                try:
+                    _support_interval = int(_support_interval_raw)
+                    if _support_interval > 0 and turn_count % _support_interval == 0:
+                        from cai.repl.commands.compact import COMPACT_COMMAND_INSTANCE
+                        console.print(
+                            f"\n[bold yellow]⟳ Auto-compact: turn {turn_count} "
+                            f"(every {_support_interval} turns) — "
+                            f"summarising with {_support_model}[/bold yellow]"
+                        )
+                        COMPACT_COMMAND_INSTANCE._perform_compaction(
+                            model_override=_support_model
+                        )
+                        # Re-sync the local agent reference so the loop continues
+                        # with the freshly reloaded agent (history cleared, memory
+                        # summary already injected into its system prompt).
+                        from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
+                        _reloaded = _AM.get_active_agent()
+                        if _reloaded is not None:
+                            agent = _reloaded
+                        console.print(
+                            "[bold green]✓ Memory summary applied to agent system prompt — "
+                            "context window reset[/bold green]\n"
+                        )
+                except (ValueError, Exception) as _e:
+                    if os.getenv("CAI_DEBUG", "1") == "2":
+                        console.print(f"[red]Auto-compact error: {_e}[/red]")
+
             # Stop measuring active time and start measuring idle time again
             stop_active_timer()
             start_idle_timer()

From 5ffdf208a661a14a33051c3db748d825e81abe54 Mon Sep 17 00:00:00 2001
From: giveen <ajaxx20020@gmail.com>
Date: Thu, 2 Apr 2026 14:37:19 -0600
Subject: [PATCH 02/10] fix: resume agent task after auto-compact instead of
 dropping to prompt

After auto-compact fired the main run loop fell back to get_user_input()
and waited for the human, so the agent stopped working mid-task.

Root cause: no mechanism existed to replay the current task into the next
iteration after message_history was cleared.

Fix:
- Add _post_compact_input (str | None) variable initialised to None at
  session start.
- Capture _last_user_input from user_input just after turn_count += 1.
- After a successful auto-compact set _post_compact_input to
  _last_user_input (or 'Continue the current task.' if it was blank).
- At the top of the input-gathering block, consume _post_compact_input
  before calling get_user_input() so the agent immediately re-runs with
  its previous task prompt and the fresh compacted context.
---
 src/cai/cli.py | 20 +++++++++++++++++++-
 1 file changed, 19 insertions(+), 1 deletion(-)

diff --git a/src/cai/cli.py b/src/cai/cli.py
index abe48d903..a4ce94000 100644
--- a/src/cai/cli.py
+++ b/src/cai/cli.py
@@ -442,6 +442,9 @@ def run_cai_cli(
     agent = starting_agent
     turn_count = 0
     idle_time = 0
+    # Holds a user message to replay on the next iteration without prompting
+    # the user — set by auto-compact so the agent continues its current task.
+    _post_compact_input: str | None = None
     console = Console()
     last_model = os.getenv("CAI_MODEL", "alias1")
     last_agent_type = os.getenv("CAI_AGENT_TYPE", "one_tool_agent")
@@ -690,6 +693,11 @@ def get_agent_short_name(agent):
                 if use_initial_prompt:
                     user_input = initial_prompt
                     use_initial_prompt = False  # Only use it once
+                elif _post_compact_input is not None:
+                    # Auto-compact just ran — replay the last task so the agent
+                    # continues working without waiting for human input.
+                    user_input = _post_compact_input
+                    _post_compact_input = None
                 else:
                     # Get user input with command completion and history
                     user_input = get_user_input(
@@ -1711,6 +1719,9 @@ async def process_streamed_response(agent, conversation_input):
                 agent.model.message_history[:] = fix_message_list(agent.model.message_history)
             turn_count += 1
 
+            # Capture user_input here so auto-compact can replay it after clearing history.
+            _last_user_input = user_input if isinstance(user_input, str) else ""
+
             # Auto-compact: when CAI_SUPPORT_MODEL + CAI_SUPPORT_INTERVAL are both set,
             # compact the conversation every N turns using the support model so the
             # main model's context window is kept small.  After compaction the
@@ -1738,9 +1749,16 @@ async def process_streamed_response(agent, conversation_input):
                         _reloaded = _AM.get_active_agent()
                         if _reloaded is not None:
                             agent = _reloaded
+                        # Queue the last user task to be replayed on the next
+                        # iteration so the agent continues without human input.
+                        _post_compact_input = (
+                            _last_user_input
+                            if _last_user_input.strip()
+                            else "Continue the current task."
+                        )
                         console.print(
                             "[bold green]✓ Memory summary applied to agent system prompt — "
-                            "context window reset[/bold green]\n"
+                            "context window reset — continuing task[/bold green]\n"
                         )
                 except (ValueError, Exception) as _e:
                     if os.getenv("CAI_DEBUG", "1") == "2":

From ce201bc4e19e4416e68472a3f8fb394c2740a7ca Mon Sep 17 00:00:00 2001
From: giveen <ajaxx20020@gmail.com>
Date: Thu, 2 Apr 2026 14:54:26 -0600
Subject: [PATCH 03/10] fix: add auto-compact diagnostics and unhide errors
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Print startup banner confirming CAI_SUPPORT_MODEL + CAI_SUPPORT_INTERVAL
  are loaded so users can verify env vars are picked up.
- Show a per-turn countdown (dim cyan) so users can see the interval
  counting down correctly toward the next compaction.
- Remove the CAI_DEBUG=2 gate on error output — auto-compact errors are
  now always printed so silent failures can no longer mask a broken
  support model endpoint or compaction issue.
---
 src/cai/cli.py | 79 +++++++++++++++++++++++++++++++-------------------
 1 file changed, 49 insertions(+), 30 deletions(-)

diff --git a/src/cai/cli.py b/src/cai/cli.py
index a4ce94000..e68de5235 100644
--- a/src/cai/cli.py
+++ b/src/cai/cli.py
@@ -485,6 +485,18 @@ def run_cai_cli(
     print("\n")
     display_quick_guide(console)
 
+    # Notify user if auto-compact is active so they can confirm the vars loaded.
+    _sc_model_startup = os.getenv("CAI_SUPPORT_MODEL")
+    _sc_interval_startup = os.getenv("CAI_SUPPORT_INTERVAL")
+    if _sc_model_startup and _sc_interval_startup:
+        try:
+            console.print(
+                f"[bold cyan]🗜  Auto-compact enabled: every {int(_sc_interval_startup)} turns "
+                f"using {_sc_model_startup}[/bold cyan]"
+            )
+        except ValueError:
+            pass
+
     # Function to get the short name of the agent for display
     def get_agent_short_name(agent):
         if hasattr(agent, "name"):
@@ -1732,37 +1744,44 @@ async def process_streamed_response(agent, conversation_input):
             if _support_model and _support_interval_raw:
                 try:
                     _support_interval = int(_support_interval_raw)
-                    if _support_interval > 0 and turn_count % _support_interval == 0:
-                        from cai.repl.commands.compact import COMPACT_COMMAND_INSTANCE
-                        console.print(
-                            f"\n[bold yellow]⟳ Auto-compact: turn {turn_count} "
-                            f"(every {_support_interval} turns) — "
-                            f"summarising with {_support_model}[/bold yellow]"
-                        )
-                        COMPACT_COMMAND_INSTANCE._perform_compaction(
-                            model_override=_support_model
-                        )
-                        # Re-sync the local agent reference so the loop continues
-                        # with the freshly reloaded agent (history cleared, memory
-                        # summary already injected into its system prompt).
-                        from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
-                        _reloaded = _AM.get_active_agent()
-                        if _reloaded is not None:
-                            agent = _reloaded
-                        # Queue the last user task to be replayed on the next
-                        # iteration so the agent continues without human input.
-                        _post_compact_input = (
-                            _last_user_input
-                            if _last_user_input.strip()
-                            else "Continue the current task."
-                        )
-                        console.print(
-                            "[bold green]✓ Memory summary applied to agent system prompt — "
-                            "context window reset — continuing task[/bold green]\n"
-                        )
+                    if _support_interval > 0:
+                        _turns_until = _support_interval - (turn_count % _support_interval)
+                        if _turns_until != _support_interval:  # don't show when just compacted
+                            console.print(
+                                f"[dim cyan]  ↻ auto-compact in {_turns_until} turn(s) "
+                                f"[turn {turn_count}/{_support_interval}×{turn_count // _support_interval + 1}][/dim cyan]"
+                            )
+                        if turn_count % _support_interval == 0:
+                            from cai.repl.commands.compact import COMPACT_COMMAND_INSTANCE
+                            console.print(
+                                f"\n[bold yellow]⟳ Auto-compact: turn {turn_count} "
+                                f"(every {_support_interval} turns) — "
+                                f"summarising with {_support_model}[/bold yellow]"
+                            )
+                            COMPACT_COMMAND_INSTANCE._perform_compaction(
+                                model_override=_support_model
+                            )
+                            # Re-sync the local agent reference so the loop continues
+                            # with the freshly reloaded agent (history cleared, memory
+                            # summary already injected into its system prompt).
+                            from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
+                            _reloaded = _AM.get_active_agent()
+                            if _reloaded is not None:
+                                agent = _reloaded
+                            # Queue the last user task to be replayed on the next
+                            # iteration so the agent continues without human input.
+                            _post_compact_input = (
+                                _last_user_input
+                                if _last_user_input.strip()
+                                else "Continue the current task."
+                            )
+                            console.print(
+                                "[bold green]✓ Memory summary applied to agent system prompt — "
+                                "context window reset — continuing task[/bold green]\n"
+                            )
                 except (ValueError, Exception) as _e:
-                    if os.getenv("CAI_DEBUG", "1") == "2":
-                        console.print(f"[red]Auto-compact error: {_e}[/red]")
+                    # Always show auto-compact errors so they are never silently lost.
+                    console.print(f"[red]Auto-compact error: {_e}[/red]")
 
             # Stop measuring active time and start measuring idle time again
             stop_active_timer()

From 1491a4d0ad2f648e8699e525652ba84edadd58cb Mon Sep 17 00:00:00 2001
From: giveen <ajaxx20020@gmail.com>
Date: Thu, 2 Apr 2026 15:04:25 -0600
Subject: [PATCH 04/10] fix: count LLM responses not outer-loop turns for
 auto-compact interval

CAI_SUPPORT_INTERVAL previously counted outer while-loop iterations
(one per user input).  In agentic/continue mode the agent makes many
tool-call rounds per single user input, so the interval never fired
unless the user typed N separate messages.

Fix: count assistant messages in message_history as a proxy for LLM
API calls.  This fires after every N responses from the main model
regardless of how many came from a single outer iteration.

- Startup banner updated: 'every N LLM responses' instead of 'turns'
- Countdown shows [{current}/{threshold}] response counts
- Fire condition: llm_call_count >= support_interval (fires as soon
  as threshold is reached; resets naturally when history is cleared
  after compact)
---
 src/cai/cli.py | 86 +++++++++++++++++++++++++++-----------------------
 1 file changed, 47 insertions(+), 39 deletions(-)

diff --git a/src/cai/cli.py b/src/cai/cli.py
index e68de5235..8515586c2 100644
--- a/src/cai/cli.py
+++ b/src/cai/cli.py
@@ -491,7 +491,7 @@ def run_cai_cli(
     if _sc_model_startup and _sc_interval_startup:
         try:
             console.print(
-                f"[bold cyan]🗜  Auto-compact enabled: every {int(_sc_interval_startup)} turns "
+                f"[bold cyan]🗜  Auto-compact enabled: every {int(_sc_interval_startup)} LLM responses "
                 f"using {_sc_model_startup}[/bold cyan]"
             )
         except ValueError:
@@ -1735,50 +1735,58 @@ async def process_streamed_response(agent, conversation_input):
             _last_user_input = user_input if isinstance(user_input, str) else ""
 
             # Auto-compact: when CAI_SUPPORT_MODEL + CAI_SUPPORT_INTERVAL are both set,
-            # compact the conversation every N turns using the support model so the
-            # main model's context window is kept small.  After compaction the
-            # summary is injected into the agent's system prompt and the local
-            # agent reference is refreshed so the loop uses the reloaded instance.
+            # compact the conversation every N LLM *responses* (assistant messages in
+            # history) using the support model.  Counting assistant messages rather
+            # than outer-loop turns means agentic sessions — where the agent makes
+            # many tool-call rounds per single user input — are handled correctly.
             _support_model = os.getenv("CAI_SUPPORT_MODEL")
             _support_interval_raw = os.getenv("CAI_SUPPORT_INTERVAL")
             if _support_model and _support_interval_raw:
                 try:
                     _support_interval = int(_support_interval_raw)
                     if _support_interval > 0:
-                        _turns_until = _support_interval - (turn_count % _support_interval)
-                        if _turns_until != _support_interval:  # don't show when just compacted
-                            console.print(
-                                f"[dim cyan]  ↻ auto-compact in {_turns_until} turn(s) "
-                                f"[turn {turn_count}/{_support_interval}×{turn_count // _support_interval + 1}][/dim cyan]"
-                            )
-                        if turn_count % _support_interval == 0:
-                            from cai.repl.commands.compact import COMPACT_COMMAND_INSTANCE
-                            console.print(
-                                f"\n[bold yellow]⟳ Auto-compact: turn {turn_count} "
-                                f"(every {_support_interval} turns) — "
-                                f"summarising with {_support_model}[/bold yellow]"
-                            )
-                            COMPACT_COMMAND_INSTANCE._perform_compaction(
-                                model_override=_support_model
-                            )
-                            # Re-sync the local agent reference so the loop continues
-                            # with the freshly reloaded agent (history cleared, memory
-                            # summary already injected into its system prompt).
-                            from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
-                            _reloaded = _AM.get_active_agent()
-                            if _reloaded is not None:
-                                agent = _reloaded
-                            # Queue the last user task to be replayed on the next
-                            # iteration so the agent continues without human input.
-                            _post_compact_input = (
-                                _last_user_input
-                                if _last_user_input.strip()
-                                else "Continue the current task."
-                            )
-                            console.print(
-                                "[bold green]✓ Memory summary applied to agent system prompt — "
-                                "context window reset — continuing task[/bold green]\n"
-                            )
+                        # Count assistant messages as a proxy for LLM API calls.
+                        _history = getattr(getattr(agent, 'model', None), 'message_history', [])
+                        _llm_call_count = sum(
+                            1 for m in _history
+                            if (m.get("role") if isinstance(m, dict) else getattr(m, "role", None))
+                            == "assistant"
+                        )
+                        if _llm_call_count > 0:
+                            _calls_until = max(0, _support_interval - _llm_call_count)
+                            if _calls_until > 0:
+                                console.print(
+                                    f"[dim cyan]  ↻ auto-compact in {_calls_until} LLM response(s) "
+                                    f"[{_llm_call_count}/{_support_interval}][/dim cyan]"
+                                )
+                            if _llm_call_count >= _support_interval:
+                                from cai.repl.commands.compact import COMPACT_COMMAND_INSTANCE
+                                console.print(
+                                    f"\n[bold yellow]⟳ Auto-compact: {_llm_call_count} LLM responses "
+                                    f"(threshold {_support_interval}) — "
+                                    f"summarising with {_support_model}[/bold yellow]"
+                                )
+                                COMPACT_COMMAND_INSTANCE._perform_compaction(
+                                    model_override=_support_model
+                                )
+                                # Re-sync the local agent reference so the loop continues
+                                # with the freshly reloaded agent (history cleared, memory
+                                # summary already injected into its system prompt).
+                                from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
+                                _reloaded = _AM.get_active_agent()
+                                if _reloaded is not None:
+                                    agent = _reloaded
+                                # Queue the last user task to be replayed on the next
+                                # iteration so the agent continues without human input.
+                                _post_compact_input = (
+                                    _last_user_input
+                                    if _last_user_input.strip()
+                                    else "Continue the current task."
+                                )
+                                console.print(
+                                    "[bold green]✓ Memory summary applied to agent system prompt — "
+                                    "context window reset — continuing task[/bold green]\n"
+                                )
                 except (ValueError, Exception) as _e:
                     # Always show auto-compact errors so they are never silently lost.
                     console.print(f"[red]Auto-compact error: {_e}[/red]")

From 85324782e4f7603516522f93a149ecfc59e8a31c Mon Sep 17 00:00:00 2001
From: giveen <ajaxx20020@gmail.com>
Date: Thu, 2 Apr 2026 15:38:41 -0600
Subject: [PATCH 05/10] fix: remove redundant message_history prepend causing
 2x token doubling

Both get_response (token counting) and _fetch_response (actual API call)
were prepending self.message_history to every request. But cli.py already
passes the full conversation history via history_context as conversation_input
to Runner.run, which threads it through as original_input to these methods.

Result: every historical message was sent TWICE in every API call, doubling
the effective context size. After auto-compact cleared message_history, the
duplication between runner-accumulated generated_items and message_history
rebuilt the doubled context within a single Runner.run invocation (after
just 3-4 tool calls), explaining why n_tokens never dropped post-compact.

Fix: remove the prepend loops. The runner's input parameter already contains
the full conversation (original_input + generated_items), so converted_messages
is built from input alone. message_history continues to serve its role as
cross-turn persistence (populated via add_to_message_history, consumed by
cli.py as history_context for the next Runner.run call).

Expected effect: halved token counts in normal operation; post-compact first
call starts at ~system_prompt + 1 user message and grows linearly with tool
calls rather than doubling.
---
 .../agents/models/openai_chatcompletions.py   | 35 ++++++-------------
 1 file changed, 10 insertions(+), 25 deletions(-)

diff --git a/src/cai/sdk/agents/models/openai_chatcompletions.py b/src/cai/sdk/agents/models/openai_chatcompletions.py
index 8931edd63..39af1e6e3 100644
--- a/src/cai/sdk/agents/models/openai_chatcompletions.py
+++ b/src/cai/sdk/agents/models/openai_chatcompletions.py
@@ -544,20 +544,12 @@ async def get_response(
             | {"base_url": str(self._get_client().base_url)},
             disabled=tracing.is_disabled(),
         ) as span_generation:
-            # Prepare the messages for consistent token counting
-            # IMPORTANT: Include existing message history for context
+            # Prepare the messages for consistent token counting.
+            # History is already included in `input` via cli.py's history_context mechanism
+            # (history_context = agent.model.message_history is passed as conversation_input
+            # to Runner.run, which then passes it as original_input to get_response).
+            # Prepending message_history here would double-count every message.
             converted_messages = []
-            
-            # First, add all existing messages from history
-            if self.message_history:
-                for msg in self.message_history:
-                    msg_copy = msg.copy()  # Use copy to avoid modifying original
-                    # Remove any existing cache_control to avoid exceeding the 4-block limit
-                    if "cache_control" in msg_copy:
-                        del msg_copy["cache_control"]
-                    converted_messages.append(msg_copy)
-            
-            # Then convert and add the new input
             new_messages = self._converter.items_to_messages(input, model_instance=self)
             converted_messages.extend(new_messages)
             
@@ -2545,19 +2537,12 @@ async def _fetch_response(
         # start by re-fetching self.is_ollama
         self.is_ollama = os.getenv("OLLAMA") is not None and os.getenv("OLLAMA").lower() == "true"
 
-        # IMPORTANT: Include existing message history for context
+        # Build the message list from `input` only.
+        # History is already included in `input` via cli.py's history_context mechanism:
+        # cli.py passes history_context (= message_history) as part of conversation_input
+        # to Runner.run, which passes it as original_input through to _fetch_response.
+        # Prepending message_history again would send every historical message twice.
         converted_messages = []
-        
-        # First, add all existing messages from history
-        if self.message_history:
-            for msg in self.message_history:
-                msg_copy = msg.copy()  # Use copy to avoid modifying original
-                # Remove any existing cache_control to avoid exceeding the 4-block limit
-                if "cache_control" in msg_copy:
-                    del msg_copy["cache_control"]
-                converted_messages.append(msg_copy)
-        
-        # Then convert and add the new input
         new_messages = self._converter.items_to_messages(input, model_instance=self)
         converted_messages.extend(new_messages)
 

From 6ee2f0d66b09e896244f2b7800589fcd0b3dbaae Mon Sep 17 00:00:00 2001
From: giveen <ajaxx20020@gmail.com>
Date: Thu, 2 Apr 2026 16:09:22 -0600
Subject: [PATCH 06/10] fix: fire auto-compact inside Runner.run, not just at
 outer-loop level
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously CAI_SUPPORT_INTERVAL only checked at the end of each outer
while-loop iteration (once per user input). In agentic sessions the agent
makes many successive tool calls inside a single Runner.run invocation,
so the check would only fire after the runner returned — too late, or
never if the runner was still running.

Fix:
- Add ContextCompactedError exception class.
- Add a count-based trigger at the top of _auto_compact_if_needed (which
  fires on EVERY LLM API call, inside both get_response and stream_response).
  When assistant-message count >= CAI_SUPPORT_INTERVAL:
    1. Set the compact model to CAI_SUPPORT_MODEL temporarily.
    2. Summarise via _ai_summarize_history (awaited in-situ, no asyncio.run).
    3. Store summary in COMPACTED_SUMMARIES so get_system_prompt picks it
       up on the next turn without needing agent reload.
    4. Clear message_history + reset CAI_CONTEXT_USAGE.
    5. Raise ContextCompactedError to abort the current runner invocation.
- cli.py catches ContextCompactedError in both streaming and non-streaming
  runner call sites:
    - Sets _post_compact_input = _last_user_input so the task is replayed.
    - Re-syncs the local agent reference via AGENT_MANAGER.
    - Continues the outer while-loop, restarting with a clean context window.

The existing outer-loop CLI check (counting assistant messages in history
after the runner finishes) is kept as a belt-and-suspenders fallback.
---
 src/cai/cli.py                                | 26 +++++++
 .../agents/models/openai_chatcompletions.py   | 69 ++++++++++++++++++-
 2 files changed, 94 insertions(+), 1 deletion(-)

diff --git a/src/cai/cli.py b/src/cai/cli.py
index 8515586c2..fc731b219 100644
--- a/src/cai/cli.py
+++ b/src/cai/cli.py
@@ -313,6 +313,7 @@ def suppress_aiohttp_warnings():
 from cai.sdk.agents.models.openai_chatcompletions import (
     get_agent_message_history,
     get_all_agent_histories,
+    ContextCompactedError,
 )
 # Import handled where needed to avoid circular imports
 from cai.sdk.agents.run_to_jsonl import get_session_recorder
@@ -1576,6 +1577,9 @@ async def process_streamed_response(agent, conversation_input):
                                 pass
                             
                             raise e
+                        except ContextCompactedError:
+                            # Propagate so the outer try block can handle the restart.
+                            raise
                         except Exception as e:
                             # Clean up on any other exception
                             if stream_iterator is not None:
@@ -1603,6 +1607,17 @@ async def process_streamed_response(agent, conversation_input):
 
                     try:
                         asyncio.run(process_streamed_response(agent, conversation_input))
+                    except ContextCompactedError:
+                        # Auto-compact fired mid-runner; restart with fresh context.
+                        _post_compact_input = _last_user_input or "Continue the current task."
+                        from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
+                        _reloaded = _AM.get_active_agent()
+                        if _reloaded is not None:
+                            agent = _reloaded
+                        console.print(
+                            "[bold green]✓ Context window reset — resuming task[/bold green]\n"
+                        )
+                        continue
                     except OutputGuardrailTripwireTriggered as e:
                         # Display a user-friendly warning instead of crashing (streaming mode)
                         guardrail_name = e.guardrail_result.guardrail.get_name()
@@ -1662,6 +1677,17 @@ async def process_streamed_response(agent, conversation_input):
                     # Use non-streamed response
                     try:
                         response = asyncio.run(Runner.run(agent, conversation_input))
+                    except ContextCompactedError:
+                        # Auto-compact fired mid-runner; restart with fresh context.
+                        _post_compact_input = _last_user_input or "Continue the current task."
+                        from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
+                        _reloaded = _AM.get_active_agent()
+                        if _reloaded is not None:
+                            agent = _reloaded
+                        console.print(
+                            "[bold green]✓ Context window reset — resuming task[/bold green]\n"
+                        )
+                        continue
                     except InputGuardrailTripwireTriggered as e:
                         # Display a user-friendly warning for input guardrails
                         reason = "Potential security threat detected in input"
diff --git a/src/cai/sdk/agents/models/openai_chatcompletions.py b/src/cai/sdk/agents/models/openai_chatcompletions.py
index 39af1e6e3..eac4cf306 100644
--- a/src/cai/sdk/agents/models/openai_chatcompletions.py
+++ b/src/cai/sdk/agents/models/openai_chatcompletions.py
@@ -363,6 +363,13 @@ def count_tokens_with_tiktoken(text_or_messages):
         return 0, 0
 
 
+class ContextCompactedError(Exception):
+    """Raised when a CAI_SUPPORT_INTERVAL auto-compact fires inside a model call.
+
+    Caught by the CLI loop, which sets _post_compact_input and restarts the runner."""
+    pass
+
+
 class OpenAIChatCompletionsModel(Model):
     """OpenAI Chat Completions Model"""
 
@@ -3475,7 +3482,67 @@ async def _auto_compact_if_needed(self, estimated_tokens: int, input: str | list
         # Check if auto-compaction is disabled
         if os.getenv("CAI_AUTO_COMPACT", "true").lower() == "false":
             return input, system_instructions, False
-            
+
+        # --- CAI_SUPPORT_INTERVAL count-based trigger ---
+        # This fires on EVERY API call (not just at the outer CLI-loop level), so it correctly
+        # handles agentic sessions where the agent makes many tool calls inside one Runner.run.
+        _support_model = os.getenv("CAI_SUPPORT_MODEL")
+        _support_interval_raw = os.getenv("CAI_SUPPORT_INTERVAL")
+        if _support_model and _support_interval_raw:
+            try:
+                _support_interval = int(_support_interval_raw)
+                if _support_interval > 0:
+                    _asst_count = sum(
+                        1 for m in self.message_history
+                        if isinstance(m, dict) and m.get("role") == "assistant"
+                    )
+                    if _asst_count >= _support_interval:
+                        from rich.console import Console as _Console
+                        _console = _Console()
+                        _console.print(
+                            f"\n[bold yellow]⟳ Auto-compact: {_asst_count} LLM responses "
+                            f"(threshold {_support_interval}) — summarising with "
+                            f"{_support_model}[/bold yellow]"
+                        )
+                        try:
+                            from cai.repl.commands.memory import (
+                                MEMORY_COMMAND_INSTANCE,
+                                COMPACTED_SUMMARIES,
+                                APPLIED_MEMORY_IDS,
+                            )
+                            from cai.repl.commands.compact import COMPACT_COMMAND_INSTANCE
+                            _orig_compact = COMPACT_COMMAND_INSTANCE.compact_model
+                            COMPACT_COMMAND_INSTANCE.compact_model = _support_model
+                            try:
+                                _summary = await MEMORY_COMMAND_INSTANCE._ai_summarize_history(
+                                    self.agent_name
+                                )
+                            finally:
+                                COMPACT_COMMAND_INSTANCE.compact_model = _orig_compact
+                            if _summary:
+                                if self.agent_name not in COMPACTED_SUMMARIES:
+                                    COMPACTED_SUMMARIES[self.agent_name] = []
+                                    APPLIED_MEMORY_IDS[self.agent_name] = []
+                                COMPACTED_SUMMARIES[self.agent_name] = [_summary]
+                                self.message_history.clear()
+                                os.environ["CAI_CONTEXT_USAGE"] = "0.0"
+                                _console.print(
+                                    "[bold green]✓ Memory summary applied — "
+                                    "context window reset — restarting task[/bold green]\n"
+                                )
+                        except Exception as _ce:
+                            _console.print(f"[red]Auto-compact error: {_ce}[/red]")
+                        # Always abort the current runner invocation so the outer loop
+                        # can restart with our freshly cleared context.
+                        raise ContextCompactedError(
+                            f"Context compacted after {_asst_count} LLM responses "
+                            f"(threshold {_support_interval})"
+                        )
+            except ContextCompactedError:
+                raise  # propagate to the outer runner / CLI loop
+            except (ValueError, Exception):
+                pass  # malformed interval — ignore silently
+
         max_tokens = self._get_model_max_tokens(str(self.model))
         threshold_percent = float(os.getenv("CAI_AUTO_COMPACT_THRESHOLD", "0.8"))
         threshold = max_tokens * threshold_percent

From 0235e9a2845f803213250367dd6325f7c084fe99 Mon Sep 17 00:00:00 2001
From: giveen <ajaxx20020@gmail.com>
Date: Thu, 2 Apr 2026 16:21:36 -0600
Subject: [PATCH 07/10] fix: assign _last_user_input before Runner.run() to
 avoid UnboundLocalError on first compact

---
 src/cai/cli.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/cai/cli.py b/src/cai/cli.py
index fc731b219..c1694c7cd 100644
--- a/src/cai/cli.py
+++ b/src/cai/cli.py
@@ -1500,6 +1500,10 @@ async def process_parallel_responses():
                             {"role": "assistant", "content": f"{result.final_output}"}
                         )
             else:
+                # Capture user_input before runner calls so ContextCompactedError
+                # handlers can reference it even on the very first iteration.
+                _last_user_input = user_input if isinstance(user_input, str) else ""
+
                 # Disable streaming by default, unless specifically enabled
                 cai_stream = os.getenv("CAI_STREAM", "false")
                 # Handle empty string or None values
@@ -1757,9 +1761,6 @@ async def process_streamed_response(agent, conversation_input):
                 agent.model.message_history[:] = fix_message_list(agent.model.message_history)
             turn_count += 1
 
-            # Capture user_input here so auto-compact can replay it after clearing history.
-            _last_user_input = user_input if isinstance(user_input, str) else ""
-
             # Auto-compact: when CAI_SUPPORT_MODEL + CAI_SUPPORT_INTERVAL are both set,
             # compact the conversation every N LLM *responses* (assistant messages in
             # history) using the support model.  Counting assistant messages rather

From 476efb399454e2605fc77c4ec2e69c2a27966479 Mon Sep 17 00:00:00 2001
From: giveen <ajaxx20020@gmail.com>
Date: Thu, 2 Apr 2026 16:57:19 -0600
Subject: [PATCH 08/10] fix: inject compacted summary back into message_history
 and sharpen compaction prompt
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

After compaction, message_history.clear() wiped all context and the stored
summary in COMPACTED_SUMMARIES was never re-injected, so the next runner turn
started completely blank and the agent would repeat already-tried approaches.

Fix: immediately after clearing, push a user+assistant exchange containing the
summary into message_history so it flows through history_context on the next
iteration as normal conversation context.

Also: add an explicit 'Exhausted Approaches — DO NOT RETRY' section (§9) and
'Recommended Next Steps' section (§10) to the compaction prompt so the support
model produces a checklist of dead ends the main agent must not revisit.
---
 src/cai/repl/commands/memory.py               |  4 +++-
 .../agents/models/openai_chatcompletions.py   | 22 +++++++++++++++++++
 2 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/src/cai/repl/commands/memory.py b/src/cai/repl/commands/memory.py
index 9f963d4e6..f2e283c8d 100644
--- a/src/cai/repl/commands/memory.py
+++ b/src/cai/repl/commands/memory.py
@@ -1221,7 +1221,8 @@ async def _ai_summarize_history(self, agent_name: Optional[str] = None) -> Optio
 6. **All User Messages**: Complete list of user messages in order
 7. **Pending Tasks**: What still needs to be done
 8. **Current Work**: What was being worked on when the conversation ended
-9. **Optional Next Step**: If there's a clear next action, mention it
+9. **Exhausted Approaches — DO NOT RETRY**: Every technique, command, path, or attack vector that was attempted and failed. Format each as a bullet starting with ❌. Be specific (include exact commands, URLs, usernames, ports). This section is CRITICAL — the agent will use it to avoid wasting time on dead ends.
+10. **Recommended Next Steps**: Concrete actions NOT yet tried, ordered by likelihood of success.
 
 ## Important Guidelines
 
@@ -1232,6 +1233,7 @@ async def _ai_summarize_history(self, agent_name: Optional[str] = None) -> Optio
 - Maintain technical accuracy - don't paraphrase technical terms
 - The summary will be used as the primary context for resuming work, so completeness is crucial
 - When the conversation is resumed, it should feel like a natural continuation
+- Section 9 (Exhausted Approaches) is the most important section for offensive/hacking tasks: list every failed attempt so the agent doesn't loop.
 
 This session is being continued from a previous conversation that ran out of context. The conversation is summarized below:"""
         
diff --git a/src/cai/sdk/agents/models/openai_chatcompletions.py b/src/cai/sdk/agents/models/openai_chatcompletions.py
index eac4cf306..4390f82d0 100644
--- a/src/cai/sdk/agents/models/openai_chatcompletions.py
+++ b/src/cai/sdk/agents/models/openai_chatcompletions.py
@@ -3525,6 +3525,28 @@ async def _auto_compact_if_needed(self, estimated_tokens: int, input: str | list
                                     APPLIED_MEMORY_IDS[self.agent_name] = []
                                 COMPACTED_SUMMARIES[self.agent_name] = [_summary]
                                 self.message_history.clear()
+                                # Re-inject the summary as the first exchange so
+                                # the next Runner turn has full context and won't
+                                # repeat work that was already attempted.
+                                self.message_history.append({
+                                    "role": "user",
+                                    "content": (
+                                        "<previous_session_memory>\n"
+                                        + _summary
+                                        + "\n</previous_session_memory>\n\n"
+                                        "This is your memory from the previous context window. "
+                                        "Use it to continue your work. "
+                                        "Do NOT retry any approach already marked as failed or exhausted."
+                                    ),
+                                })
+                                self.message_history.append({
+                                    "role": "assistant",
+                                    "content": (
+                                        "Understood. I have reviewed my previous session memory. "
+                                        "I will continue the task using only new approaches "
+                                        "and will not repeat anything already attempted."
+                                    ),
+                                })
                                 os.environ["CAI_CONTEXT_USAGE"] = "0.0"
                                 _console.print(
                                     "[bold green]✓ Memory summary applied — "

From 6546d8cbde4ae55c03d25025148b75c3583bf929 Mon Sep 17 00:00:00 2001
From: giveen <ajaxx20020@gmail.com>
Date: Thu, 2 Apr 2026 17:23:13 -0600
Subject: [PATCH 09/10] fix: prevent post-compaction repeat work
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three root causes identified and fixed:

1. _format_history_for_summary replaced every tool output of 500+ chars
   with '[Long output truncated]', so nmap/gobuster/curl results — the
   exact evidence the summary model needs to write the 'Exhausted
   Approaches' section — were silently discarded. Tool outputs now keep
   their first 2000 chars, the cap on formatted blocks is raised from 50
   to 200, and the assistant tool-call extractor now also handles the
   dict-style tool_calls stored in message_history; it previously handled
   only object-style entries, so every command ever run disappeared.

2. _post_compact_input was set to the raw original user task (e.g.
   'hack the box machine Cap'). That becomes the last message the LLM
   reads, overriding the memory acknowledgement and making the agent
   treat it as a brand-new task. Now injects an explicit anti-repetition
   instruction alongside the original task text.

3. (Previous fix) Summary prompt now includes §9 Exhausted Approaches
   and §10 Recommended Next Steps — this only works if the summary model
   actually sees the scan/tool data, which fix #1 now guarantees.
---
 src/cai/cli.py                  | 22 ++++++++++--
 src/cai/repl/commands/memory.py | 63 ++++++++++++++++++++++-----------
 2 files changed, 62 insertions(+), 23 deletions(-)

diff --git a/src/cai/cli.py b/src/cai/cli.py
index c1694c7cd..fde1c5be7 100644
--- a/src/cai/cli.py
+++ b/src/cai/cli.py
@@ -1613,7 +1613,16 @@ async def process_streamed_response(agent, conversation_input):
                         asyncio.run(process_streamed_response(agent, conversation_input))
                     except ContextCompactedError:
                         # Auto-compact fired mid-runner; restart with fresh context.
-                        _post_compact_input = _last_user_input or "Continue the current task."
+                        _base = _last_user_input or "Continue the current task."
+                        _post_compact_input = (
+                            f"{_base}\n\n"
+                            "IMPORTANT: Your context window was just compacted. "
+                            "Your session memory is already loaded above. "
+                            "Review the 'Exhausted Approaches' section in your memory and "
+                            "DO NOT repeat any technique, command, URL, port scan, or login "
+                            "attempt already listed there. "
+                            "Pick up exactly where you left off using only NEW approaches."
+                        )
                         from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
                         _reloaded = _AM.get_active_agent()
                         if _reloaded is not None:
@@ -1683,7 +1692,16 @@ async def process_streamed_response(agent, conversation_input):
                         response = asyncio.run(Runner.run(agent, conversation_input))
                     except ContextCompactedError:
                         # Auto-compact fired mid-runner; restart with fresh context.
-                        _post_compact_input = _last_user_input or "Continue the current task."
+                        _base = _last_user_input or "Continue the current task."
+                        _post_compact_input = (
+                            f"{_base}\n\n"
+                            "IMPORTANT: Your context window was just compacted. "
+                            "Your session memory is already loaded above. "
+                            "Review the 'Exhausted Approaches' section in your memory and "
+                            "DO NOT repeat any technique, command, URL, port scan, or login "
+                            "attempt already listed there. "
+                            "Pick up exactly where you left off using only NEW approaches."
+                        )
                         from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
                         _reloaded = _AM.get_active_agent()
                         if _reloaded is not None:
diff --git a/src/cai/repl/commands/memory.py b/src/cai/repl/commands/memory.py
index f2e283c8d..0e1af02bc 100644
--- a/src/cai/repl/commands/memory.py
+++ b/src/cai/repl/commands/memory.py
@@ -1267,39 +1267,60 @@ async def _ai_summarize_history(self, agent_name: Optional[str] = None) -> Optio
             return None
     
     def _format_history_for_summary(self, history: List[Dict[str, Any]]) -> str:
-        """Format message history for summarization."""
+        """Format message history for summarization.
+
+        Critical design goals:
+        - Include EVERY tool call with its exact arguments (commands run, URLs visited,
+          ports scanned) so the summary model can produce an "Exhausted Approaches" list.
+        - Include enough of each tool result to convey success/failure and key findings.
+        - Avoid blowing out the summary model's context by capping large outputs.
+        """
+        TOOL_OUTPUT_KEEP = 2000   # chars to preserve from each tool result
+        MAX_PARTS       = 200     # maximum formatted blocks to pass (covers ~100 turns)
+
         formatted_parts = []
-        
+
         for msg in history:
             role = msg.get("role", "unknown")
             content = msg.get("content", "")
-            
-            # Skip empty messages
-            if not content:
-                continue
-                
-            # Format based on role
+
             if role == "user":
-                formatted_parts.append(f"USER: {content}")
+                if content:
+                    formatted_parts.append(f"USER: {content}")
+
             elif role == "assistant":
-                # Check for tool calls
-                if "tool_calls" in msg and msg["tool_calls"]:
+                # -- tool calls: extract args from both dict-style and object-style entries --
+                tool_calls = msg.get("tool_calls") or []
+                if tool_calls:
                     tool_info = []
-                    for tc in msg["tool_calls"]:
-                        if hasattr(tc, "function"):
-                            tool_info.append(f"{tc.function.name}({tc.function.arguments})")
+                    for tc in tool_calls:
+                        if isinstance(tc, dict):
+                            fn = tc.get("function", {})
+                            name = fn.get("name", "?")
+                            args = fn.get("arguments", "")
+                            tool_info.append(f"{name}({args})")
+                        elif hasattr(tc, "function"):
+                            tool_info.append(
+                                f"{tc.function.name}({tc.function.arguments})"
+                            )
                     if tool_info:
-                        formatted_parts.append(f"ASSISTANT (tools): {', '.join(tool_info)}")
+                        formatted_parts.append(
+                            f"ASSISTANT called tools: {', '.join(tool_info)}"
+                        )
                 if content:
                     formatted_parts.append(f"ASSISTANT: {content}")
+
             elif role == "tool":
-                # Include important tool outputs
-                if len(str(content)) < 500:  # Only include short outputs
-                    formatted_parts.append(f"TOOL OUTPUT: {content}")
+                raw = str(content) if content else ""
+                if len(raw) <= TOOL_OUTPUT_KEEP:
+                    formatted_parts.append(f"TOOL OUTPUT:\n{raw}")
                 else:
-                    formatted_parts.append(f"TOOL OUTPUT: [Long output truncated]")
-                    
-        return "\n\n".join(formatted_parts[-50:])  # Limit to last 50 exchanges
+                    head = raw[:TOOL_OUTPUT_KEEP]
+                    formatted_parts.append(
+                        f"TOOL OUTPUT (truncated to {TOOL_OUTPUT_KEEP} chars):\n{head}\n[...truncated]"
+                    )
+
+        return "\n\n".join(formatted_parts[-MAX_PARTS:])
     
     def _get_current_agent_name(self) -> Optional[str]:
         """Get the name of the current active agent."""

From 380a4a708b3f88363f37c385a8eb7a79179ead2e Mon Sep 17 00:00:00 2001
From: giveen <ajaxx20020@gmail.com>
Date: Thu, 2 Apr 2026 19:04:03 -0600
Subject: [PATCH 10/10] support: release support/summarizer model after
 summarization (add cleanup() and call it)

---
 src/cai/repl/commands/memory.py               | 21 ++++++++-
 .../agents/models/openai_chatcompletions.py   | 43 +++++++++++++++++++
 2 files changed, 62 insertions(+), 2 deletions(-)

diff --git a/src/cai/repl/commands/memory.py b/src/cai/repl/commands/memory.py
index 0e1af02bc..e0523a36a 100644
--- a/src/cai/repl/commands/memory.py
+++ b/src/cai/repl/commands/memory.py
@@ -4,6 +4,7 @@
 """
 
 from typing import List, Optional, Dict, Any
+import inspect
 import os
 import asyncio
 import json
@@ -1256,15 +1257,31 @@ async def _ai_summarize_history(self, agent_name: Optional[str] = None) -> Optio
                 input=f"Please summarize the following conversation:\n\n{conversation_text}",
                 max_turns=1
             )
-            
+
             if result.final_output:
                 return str(result.final_output)
             else:
                 return None
-                
+
         except Exception as e:
             console.print(f"[red]Error generating summary: {e}[/red]")
             return None
+        finally:
+            # Best-effort: explicitly cleanup the temporary summary/support model
+            try:
+                model_inst = getattr(summary_agent, "model", None)
+                # Some Agent constructions put the Model object directly on `agent.model`
+                # and some providers expose a cleanup coroutine.
+                if model_inst is not None and hasattr(model_inst, "cleanup"):
+                    try:
+                        coro = model_inst.cleanup()
+                        if inspect.isawaitable(coro):
+                            await coro
+                    except Exception:
+                        # best-effort cleanup — swallow any errors
+                        pass
+            except Exception:
+                pass
     
     def _format_history_for_summary(self, history: List[Dict[str, Any]]) -> str:
         """Format message history for summarization.
diff --git a/src/cai/sdk/agents/models/openai_chatcompletions.py b/src/cai/sdk/agents/models/openai_chatcompletions.py
index 4390f82d0..324b0fdaa 100644
--- a/src/cai/sdk/agents/models/openai_chatcompletions.py
+++ b/src/cai/sdk/agents/models/openai_chatcompletions.py
@@ -470,6 +470,49 @@ def __del__(self):
             # Ignore any errors during cleanup
             pass
 
+    async def cleanup(self) -> None:
+        """Release the resources held by a temporary model instance.
+
+        Intended for short-lived instances (for example the summary/support
+        model) once they are no longer needed. Best-effort, never raises:
+        closes and drops ``_client`` when it exposes ``aclose``, removes
+        this instance from the legacy ``ACTIVE_MODEL_INSTANCES`` registry,
+        and clears the in-memory ``message_history``. Awaitables are
+        detected via ``__await__`` so no ``inspect`` import is required.
+        """
+        try:
+            client = getattr(self, "_client", None)
+            if client is not None:
+                aclose = getattr(client, "aclose", None)
+                if aclose:
+                    try:
+                        res = aclose()
+                        # aclose() may be sync or async; await only awaitables.
+                        if hasattr(res, "__await__"):
+                            await res
+                    except Exception:
+                        # Best-effort close
+                        pass
+                try:
+                    delattr(self, "_client")
+                except Exception:
+                    pass
+        except Exception:
+            pass
+
+        try:
+            key = (getattr(self, '_display_name', None), getattr(self, 'agent_id', None))
+            # pop() with a default: a missing registry entry is not an error.
+            ACTIVE_MODEL_INSTANCES.pop(key, None)
+        except Exception:
+            pass
+
+        try:
+            if hasattr(self, 'message_history') and isinstance(self.message_history, list):
+                self.message_history.clear()
+        except Exception:
+            pass
+
     def add_to_message_history(self, msg):
         """Add a message to this instance's history if it's not a duplicate.