Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
125 changes: 125 additions & 0 deletions src/cai/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -313,6 +313,7 @@ def suppress_aiohttp_warnings():
from cai.sdk.agents.models.openai_chatcompletions import (
get_agent_message_history,
get_all_agent_histories,
ContextCompactedError,
)
# Import handled where needed to avoid circular imports
from cai.sdk.agents.run_to_jsonl import get_session_recorder
Expand Down Expand Up @@ -442,6 +443,9 @@ def run_cai_cli(
agent = starting_agent
turn_count = 0
idle_time = 0
# Holds a user message to replay on the next iteration without prompting
# the user — set by auto-compact so the agent continues its current task.
_post_compact_input: str | None = None
console = Console()
last_model = os.getenv("CAI_MODEL", "alias1")
last_agent_type = os.getenv("CAI_AGENT_TYPE", "one_tool_agent")
Expand Down Expand Up @@ -482,6 +486,18 @@ def run_cai_cli(
print("\n")
display_quick_guide(console)

# Notify user if auto-compact is active so they can confirm the vars loaded.
_sc_model_startup = os.getenv("CAI_SUPPORT_MODEL")
_sc_interval_startup = os.getenv("CAI_SUPPORT_INTERVAL")
if _sc_model_startup and _sc_interval_startup:
try:
console.print(
f"[bold cyan]🗜 Auto-compact enabled: every {int(_sc_interval_startup)} LLM responses "
f"using {_sc_model_startup}[/bold cyan]"
)
except ValueError:
pass

# Function to get the short name of the agent for display
def get_agent_short_name(agent):
if hasattr(agent, "name"):
Expand Down Expand Up @@ -690,6 +706,11 @@ def get_agent_short_name(agent):
if use_initial_prompt:
user_input = initial_prompt
use_initial_prompt = False # Only use it once
elif _post_compact_input is not None:
# Auto-compact just ran — replay the last task so the agent
# continues working without waiting for human input.
user_input = _post_compact_input
_post_compact_input = None
else:
# Get user input with command completion and history
user_input = get_user_input(
Expand Down Expand Up @@ -1479,6 +1500,10 @@ async def process_parallel_responses():
{"role": "assistant", "content": f"{result.final_output}"}
)
else:
# Capture user_input before runner calls so ContextCompactedError
# handlers can reference it even on the very first iteration.
_last_user_input = user_input if isinstance(user_input, str) else ""

# Disable streaming by default, unless specifically enabled
cai_stream = os.getenv("CAI_STREAM", "false")
# Handle empty string or None values
Expand Down Expand Up @@ -1556,6 +1581,9 @@ async def process_streamed_response(agent, conversation_input):
pass

raise e
except ContextCompactedError:
# Propagate so the outer try block can handle the restart.
raise
except Exception as e:
# Clean up on any other exception
if stream_iterator is not None:
Expand Down Expand Up @@ -1583,6 +1611,26 @@ async def process_streamed_response(agent, conversation_input):

try:
asyncio.run(process_streamed_response(agent, conversation_input))
except ContextCompactedError:
# Auto-compact fired mid-runner; restart with fresh context.
_base = _last_user_input or "Continue the current task."
_post_compact_input = (
f"{_base}\n\n"
"IMPORTANT: Your context window was just compacted. "
"Your session memory is already loaded above. "
"Review the 'Exhausted Approaches' section in your memory and "
"DO NOT repeat any technique, command, URL, port scan, or login "
"attempt already listed there. "
"Pick up exactly where you left off using only NEW approaches."
)
from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
_reloaded = _AM.get_active_agent()
if _reloaded is not None:
agent = _reloaded
console.print(
"[bold green]✓ Context window reset — resuming task[/bold green]\n"
)
continue
except OutputGuardrailTripwireTriggered as e:
# Display a user-friendly warning instead of crashing (streaming mode)
guardrail_name = e.guardrail_result.guardrail.get_name()
Expand Down Expand Up @@ -1642,6 +1690,26 @@ async def process_streamed_response(agent, conversation_input):
# Use non-streamed response
try:
response = asyncio.run(Runner.run(agent, conversation_input))
except ContextCompactedError:
# Auto-compact fired mid-runner; restart with fresh context.
_base = _last_user_input or "Continue the current task."
_post_compact_input = (
f"{_base}\n\n"
"IMPORTANT: Your context window was just compacted. "
"Your session memory is already loaded above. "
"Review the 'Exhausted Approaches' section in your memory and "
"DO NOT repeat any technique, command, URL, port scan, or login "
"attempt already listed there. "
"Pick up exactly where you left off using only NEW approaches."
)
from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
_reloaded = _AM.get_active_agent()
if _reloaded is not None:
agent = _reloaded
console.print(
"[bold green]✓ Context window reset — resuming task[/bold green]\n"
)
continue
except InputGuardrailTripwireTriggered as e:
# Display a user-friendly warning for input guardrails
reason = "Potential security threat detected in input"
Expand Down Expand Up @@ -1711,6 +1779,63 @@ async def process_streamed_response(agent, conversation_input):
agent.model.message_history[:] = fix_message_list(agent.model.message_history)
turn_count += 1

# Auto-compact: when CAI_SUPPORT_MODEL + CAI_SUPPORT_INTERVAL are both set,
# compact the conversation every N LLM *responses* (assistant messages in
# history) using the support model. Counting assistant messages rather
# than outer-loop turns means agentic sessions — where the agent makes
# many tool-call rounds per single user input — are handled correctly.
_support_model = os.getenv("CAI_SUPPORT_MODEL")
_support_interval_raw = os.getenv("CAI_SUPPORT_INTERVAL")
if _support_model and _support_interval_raw:
Copy link

Copilot AI Apr 3, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This CLI-level CAI_SUPPORT_INTERVAL auto-compact runs whenever CAI_SUPPORT_MODEL/CAI_SUPPORT_INTERVAL are set, but it does not check CAI_AUTO_COMPACT. As a result, setting CAI_AUTO_COMPACT=false will still trigger count-based compaction from the CLI loop. Consider honoring the same disable flag here (or introducing a dedicated flag for support-interval compaction) so users can reliably turn auto-compaction off.

Suggested change
if _support_model and _support_interval_raw:
_auto_compact_raw = os.getenv("CAI_AUTO_COMPACT", "true")
_auto_compact_enabled = _auto_compact_raw.strip().lower() not in {
"0", "false", "no", "off"
}
if _auto_compact_enabled and _support_model and _support_interval_raw:

Copilot uses AI. Check for mistakes.
try:
_support_interval = int(_support_interval_raw)
if _support_interval > 0:
# Count assistant messages as a proxy for LLM API calls.
_history = getattr(getattr(agent, 'model', None), 'message_history', [])
_llm_call_count = sum(
1 for m in _history
if (m.get("role") if isinstance(m, dict) else getattr(m, "role", None))
== "assistant"
)
if _llm_call_count > 0:
_calls_until = max(0, _support_interval - _llm_call_count)
if _calls_until > 0:
console.print(
f"[dim cyan] ↻ auto-compact in {_calls_until} LLM response(s) "
f"[{_llm_call_count}/{_support_interval}][/dim cyan]"
)
if _llm_call_count >= _support_interval:
from cai.repl.commands.compact import COMPACT_COMMAND_INSTANCE
console.print(
f"\n[bold yellow]⟳ Auto-compact: {_llm_call_count} LLM responses "
f"(threshold {_support_interval}) — "
f"summarising with {_support_model}[/bold yellow]"
)
COMPACT_COMMAND_INSTANCE._perform_compaction(
model_override=_support_model
)
# Re-sync the local agent reference so the loop continues
# with the freshly reloaded agent (history cleared, memory
# summary already injected into its system prompt).
from cai.sdk.agents.simple_agent_manager import AGENT_MANAGER as _AM
_reloaded = _AM.get_active_agent()
if _reloaded is not None:
agent = _reloaded
# Queue the last user task to be replayed on the next
# iteration so the agent continues without human input.
_post_compact_input = (
_last_user_input
if _last_user_input.strip()
else "Continue the current task."
)
console.print(
"[bold green]✓ Memory summary applied to agent system prompt — "
"context window reset — continuing task[/bold green]\n"
)
except (ValueError, Exception) as _e:
# Always show auto-compact errors so they are never silently lost.
console.print(f"[red]Auto-compact error: {_e}[/red]")

# Stop measuring active time and start measuring idle time again
stop_active_timer()
start_idle_timer()
Expand Down
88 changes: 64 additions & 24 deletions src/cai/repl/commands/memory.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@
"""

from typing import List, Optional, Dict, Any
import inspect
import os
import asyncio
import json
Expand Down Expand Up @@ -1221,7 +1222,8 @@ async def _ai_summarize_history(self, agent_name: Optional[str] = None) -> Optio
6. **All User Messages**: Complete list of user messages in order
7. **Pending Tasks**: What still needs to be done
8. **Current Work**: What was being worked on when the conversation ended
9. **Optional Next Step**: If there's a clear next action, mention it
9. **Exhausted Approaches — DO NOT RETRY**: Every technique, command, path, or attack vector that was attempted and failed. Format each as a bullet starting with ❌. Be specific (include exact commands, URLs, usernames, ports). This section is CRITICAL — the agent will use it to avoid wasting time on dead ends.
10. **Recommended Next Steps**: Concrete actions NOT yet tried, ordered by likelihood of success.

## Important Guidelines

Expand All @@ -1232,6 +1234,7 @@ async def _ai_summarize_history(self, agent_name: Optional[str] = None) -> Optio
- Maintain technical accuracy - don't paraphrase technical terms
- The summary will be used as the primary context for resuming work, so completeness is crucial
- When the conversation is resumed, it should feel like a natural continuation
- Section 9 (Exhausted Approaches) is the most important section for offensive/hacking tasks: list every failed attempt so the agent doesn't loop.

This session is being continued from a previous conversation that ran out of context. The conversation is summarized below:"""

Expand All @@ -1254,50 +1257,87 @@ async def _ai_summarize_history(self, agent_name: Optional[str] = None) -> Optio
input=f"Please summarize the following conversation:\n\n{conversation_text}",
max_turns=1
)

if result.final_output:
return str(result.final_output)
else:
return None

except Exception as e:
console.print(f"[red]Error generating summary: {e}[/red]")
return None
finally:
# Best-effort: explicitly cleanup the temporary summary/support model
try:
model_inst = getattr(summary_agent, "model", None)
# Some Agent constructions put the Model object directly on `agent.model`
# and some providers expose a cleanup coroutine.
if model_inst is not None and hasattr(model_inst, "cleanup"):
try:
coro = model_inst.cleanup()
if inspect.isawaitable(coro):
await coro
except Exception:
# best-effort cleanup — swallow any errors
pass
except Exception:
pass

def _format_history_for_summary(self, history: List[Dict[str, Any]]) -> str:
"""Format message history for summarization."""
"""Format message history for summarization.

Critical design goals:
- Include EVERY tool call with its exact arguments (commands run, URLs visited,
ports scanned) so the summary model can produce an "Exhausted Approaches" list.
- Include enough of each tool result to convey success/failure and key findings.
- Avoid blowing out the summary model's context by capping large outputs.
"""
TOOL_OUTPUT_KEEP = 2000 # chars to preserve from each tool result
MAX_PARTS = 200 # maximum formatted blocks to pass (covers ~100 turns)

formatted_parts = []

for msg in history:
role = msg.get("role", "unknown")
content = msg.get("content", "")

# Skip empty messages
if not content:
continue

# Format based on role

if role == "user":
formatted_parts.append(f"USER: {content}")
if content:
formatted_parts.append(f"USER: {content}")

elif role == "assistant":
# Check for tool calls
if "tool_calls" in msg and msg["tool_calls"]:
# -- tool calls: extract args from both dict-style and object-style entries --
tool_calls = msg.get("tool_calls") or []
if tool_calls:
tool_info = []
for tc in msg["tool_calls"]:
if hasattr(tc, "function"):
tool_info.append(f"{tc.function.name}({tc.function.arguments})")
for tc in tool_calls:
if isinstance(tc, dict):
fn = tc.get("function", {})
name = fn.get("name", "?")
args = fn.get("arguments", "")
tool_info.append(f"{name}({args})")
elif hasattr(tc, "function"):
tool_info.append(
f"{tc.function.name}({tc.function.arguments})"
)
if tool_info:
formatted_parts.append(f"ASSISTANT (tools): {', '.join(tool_info)}")
formatted_parts.append(
f"ASSISTANT called tools: {', '.join(tool_info)}"
)
if content:
formatted_parts.append(f"ASSISTANT: {content}")

elif role == "tool":
# Include important tool outputs
if len(str(content)) < 500: # Only include short outputs
formatted_parts.append(f"TOOL OUTPUT: {content}")
raw = str(content) if content else ""
if len(raw) <= TOOL_OUTPUT_KEEP:
formatted_parts.append(f"TOOL OUTPUT:\n{raw}")
else:
formatted_parts.append(f"TOOL OUTPUT: [Long output truncated]")

return "\n\n".join(formatted_parts[-50:]) # Limit to last 50 exchanges
head = raw[:TOOL_OUTPUT_KEEP]
formatted_parts.append(
f"TOOL OUTPUT (truncated to {TOOL_OUTPUT_KEEP} chars):\n{head}\n[...truncated]"
)

return "\n\n".join(formatted_parts[-MAX_PARTS:])

def _get_current_agent_name(self) -> Optional[str]:
"""Get the name of the current active agent."""
Expand Down
Loading
Loading