CRITICAL FIX: SendUserMessage no longer breaks agent loop + match original behavior (5d6ab9e)
| /** | |
| * Claw Agent Runtime — the core agentic conversation loop. | |
| * Handles streaming responses, tool calls, and multi-turn conversations. | |
| */ | |
| import { ENV } from "../_core/env"; | |
| import { buildSystemPrompt, TOOL_DEFINITIONS } from "./system-prompt"; | |
| import { executeTool, getPlanMode, runPreToolHooks, runPostToolHooks, initializeMcpFromConfig, getMcpManager } from "../tools/executor"; | |
| import { compactSession, compactSessionWithLLM, shouldCompact, estimateSessionTokens, dbMessagesToSession, DEFAULT_COMPACTION_CONFIG } from "./compact"; | |
| import type { Session, ConversationMessage as CompactMessage, CompactionConfig } from "./compact"; | |
| import { UsageTracker, pricingForModel, defaultSonnetTierPricing, estimateCostUsdWithPricing, totalCostUsd, formatUsd, summaryLinesForModel } from "./usage"; | |
| import type { TokenUsage } from "./usage"; | |
| import type { Response } from "express"; | |
| import { execSync } from "child_process"; | |
| // In original claw-code, max_iterations defaults to usize::MAX (effectively unlimited). | |
| // Auto-compact is triggered on context overflow (400 error) — matches original compact() method. | |
| // Context window sizes for known models (used for proactive compaction) | |
| const MODEL_CONTEXT_WINDOWS: Record<string, number> = { | |
| // Xiaomi MiMo | |
| "XiaomiMiMo/MiMo-V2-Flash": 262144, | |
| // Qwen models (DeepInfra + HuggingFace) | |
| "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo": 262144, | |
| "Qwen/Qwen3-Coder-480B-A35B-Instruct": 262144, | |
| "Qwen/Qwen3-235B-A22B-Instruct-2507": 262144, | |
| "Qwen/Qwen3-235B-A22B-Thinking-2507": 262144, | |
| "Qwen/Qwen3.5-397B-A17B": 262144, | |
| "Qwen/Qwen3.5-122B-A10B": 262144, | |
| "Qwen/Qwen3-Coder-Next": 131072, | |
| "Qwen/Qwen3-32B": 40960, | |
| "Qwen/Qwen3-8B": 32768, | |
| "Qwen/Qwen3-Coder-30B-A3B-Instruct": 131072, | |
| // Meta Llama | |
| "meta-llama/Llama-3.3-70B-Instruct": 131072, | |
| "meta-llama/Llama-4-Maverick-17B-128E": 1048576, | |
| "meta-llama/Llama-4-Scout-17B-16E": 327680, | |
| // DeepSeek | |
| "deepseek-ai/DeepSeek-V3.2": 163840, | |
| "deepseek-ai/DeepSeek-V3.1": 163840, | |
| "deepseek-ai/DeepSeek-R1": 131072, | |
| "deepseek-ai/DeepSeek-R1-0528": 163840, | |
| // NVIDIA Nemotron | |
| "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B": 262144, | |
| // StepFun | |
| "stepfun-ai/Step-3.5-Flash": 262144, | |
| // NousResearch (uncensored) | |
| "NousResearch/Hermes-3-Llama-3.1-70B": 131072, | |
| "NousResearch/Hermes-3-Llama-3.1-405B": 131072, | |
| // Anthropic | |
| "claude-opus-4-6": 200000, | |
| "claude-sonnet-4-6": 200000, | |
| // OpenAI | |
| "gpt-5.4": 1048576, | |
| "gpt-4.1": 1048576, | |
| // xAI | |
| "grok-3": 131072, | |
| "google/gemini-2.5-flash": 1000000, | |
| "google/gemini-2.5-pro": 1000000, | |
| }; | |
| const DEFAULT_CONTEXT_WINDOW = 131072; | |
| /** | |
| * Convert agent messages to compact.ts Session format for compaction. | |
| */ | |
| function agentMessagesToSession(messages: AgentMessage[]): Session { | |
| return dbMessagesToSession( | |
| messages.map((m) => ({ | |
| role: m.role, | |
| content: m.content || "", | |
| toolName: m.name || null, | |
| toolCallId: m.tool_call_id || null, | |
| })) | |
| ); | |
| } | |
| /** | |
| * Convert compacted Session back to AgentMessage[] format. | |
| */ | |
| function sessionToAgentMessages(session: Session): AgentMessage[] { | |
| return session.messages.map((msg) => { | |
| const agentMsg: AgentMessage = { | |
| role: msg.role, | |
| content: msg.blocks | |
| .filter((b) => b.type === "text") | |
| .map((b) => b.text || "") | |
| .join("\n") || null, | |
| }; | |
| // Reconstruct tool_calls from tool_use blocks | |
| const toolUseBlocks = msg.blocks.filter((b) => b.type === "tool_use"); | |
| if (toolUseBlocks.length > 0) { | |
| agentMsg.tool_calls = toolUseBlocks.map((b, i) => ({ | |
| id: `compacted_${i}_${Date.now()}`, | |
| type: "function" as const, | |
| function: { | |
| name: b.name || "unknown", | |
| arguments: b.input || "{}", | |
| }, | |
| })); | |
| } | |
| // Reconstruct tool result fields | |
| const toolResultBlock = msg.blocks.find((b) => b.type === "tool_result"); | |
| if (toolResultBlock) { | |
| agentMsg.name = toolResultBlock.toolName || undefined; | |
| agentMsg.content = toolResultBlock.output || ""; | |
| } | |
| return agentMsg; | |
| }); | |
| } | |
| /** | |
| * Estimate total tokens in the conversation (simple heuristic: ~4 chars per token). | |
| */ | |
| function estimateConversationTokens(messages: AgentMessage[]): number { | |
| let total = 0; | |
| for (const msg of messages) { | |
| total += Math.ceil((msg.content?.length || 0) / 4) + 4; // +4 for role/overhead | |
| if (msg.tool_calls) { | |
| for (const tc of msg.tool_calls) { | |
| total += Math.ceil((tc.function.name.length + tc.function.arguments.length) / 4) + 4; | |
| } | |
| } | |
| } | |
| return total; | |
| } | |
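| // For instance, a single 400-character message estimates to ceil(400 / 4) + 4 = 104 tokens. | |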
| interface AgentMessage { | |
| role: "system" | "user" | "assistant" | "tool"; | |
| content: string | null; | |
| tool_calls?: Array<{ | |
| id: string; | |
| type: "function"; | |
| function: { name: string; arguments: string }; | |
| }>; | |
| tool_call_id?: string; | |
| name?: string; | |
| } | |
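| // Illustrative message shapes (the tool name here is hypothetical, not necessarily one of this runtime's tools): | |
| //   assistant turn requesting a tool: { role: "assistant", content: null, tool_calls: [{ id: "call_1", type: "function", function: { name: "read_file", arguments: "{\"path\":\"README.md\"}" } }] } | |
| //   matching tool result:             { role: "tool", content: "<file contents>", tool_call_id: "call_1", name: "read_file" } | |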
| interface AgentConfig { | |
| model: string; | |
| apiProvider: string; | |
| apiKey?: string | null; | |
| apiBaseUrl?: string | null; | |
| maxTokens: number; | |
| temperature: number; | |
| topP: number; | |
| systemPrompt?: string | null; | |
| memory?: string | null; | |
| workDir?: string; | |
| effortLevel?: "low" | "medium" | "high"; | |
| maxIterations?: number; | |
| } | |
| /** | |
| * TurnSummary — matches original conversation.rs TurnSummary struct. | |
| * Returned after each complete agent turn. | |
| */ | |
| export interface TurnSummary { | |
| assistantMessages: AgentMessage[]; | |
| toolResults: AgentMessage[]; | |
| iterations: number; | |
| usage: TokenUsage; | |
| } | |
| /** | |
| * Read git status (matches original read_git_status from prompt.rs) | |
| */ | |
| function readGitStatus(cwd: string): string | null { | |
| try { | |
| const output = execSync("git --no-optional-locks status --short --branch", { | |
| cwd, | |
| timeout: 5000, | |
| encoding: "utf-8", | |
| stdio: ["pipe", "pipe", "pipe"], | |
| }).trim(); | |
| return output || null; | |
| } catch { | |
| return null; | |
| } | |
| } | |
| /** | |
| * Read git diff (matches original read_git_diff from prompt.rs) | |
| */ | |
| function readGitDiff(cwd: string): string | null { | |
| try { | |
| const sections: string[] = []; | |
| try { | |
| const staged = execSync("git diff --cached", { | |
| cwd, timeout: 5000, encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"], | |
| }).trim(); | |
| if (staged) sections.push(`Staged changes:\n${staged}`); | |
| } catch {} | |
| try { | |
| const unstaged = execSync("git diff", { | |
| cwd, timeout: 5000, encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"], | |
| }).trim(); | |
| if (unstaged) sections.push(`Unstaged changes:\n${unstaged}`); | |
| } catch {} | |
| return sections.length > 0 ? sections.join("\n\n") : null; | |
| } catch { | |
| return null; | |
| } | |
| } | |
| /** | |
| * Merge hook feedback into tool output — matches original merge_hook_feedback() | |
| */ | |
| function mergeHookFeedback(hookMessages: string[], output: string, denied: boolean): string { | |
| if (hookMessages.length === 0) return output; | |
| const sections: string[] = []; | |
| if (output.trim()) sections.push(output); | |
| const label = denied ? "Hook feedback (denied)" : "Hook feedback"; | |
| sections.push(`${label}:\n${hookMessages.join("\n")}`); | |
| return sections.join("\n\n"); | |
| } | |
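| // e.g. mergeHookFeedback(["formatter ran clean"], "File written.", false) | |
| //   -> "File written.\n\nHook feedback:\nformatter ran clean" | |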
| const DEFAULT_CONFIG: AgentConfig = { | |
| model: process.env.DEFAULT_MODEL || "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", | |
| apiProvider: "deepinfra", | |
| maxTokens: 32768, // conservative default; Qwen3-Coder supports up to 64k (65536) output tokens | |
| temperature: 0.5, // Lower temp = more focused/deterministic agent behavior | |
| topP: 0.95, // Slightly restricted for more coherent tool calls | |
| workDir: process.env.WORKSPACE_DIR || "/home/ubuntu", | |
| effortLevel: "high", | |
| }; | |
| /** | |
| * Retry config for transient API errors. | |
| * - 429 (rate limit): retry INFINITELY every 2 seconds until it works. | |
| * - 500/502/503 (server errors): retry INFINITELY every 2 seconds. | |
| * - Network errors: retry INFINITELY every 2 seconds. | |
| * We NEVER give up on transient errors — just keep trying. | |
| */ | |
| const RETRY_DELAY_MS = 2000; // fixed 2 second interval — simple and reliable | |
| /** | |
| * Resolve the API URL and key based on provider config | |
| */ | |
| function resolveApiConfig(config: AgentConfig) { | |
| // ─── HARDCODED FALLBACK — always works even if settings are corrupted ─── | |
| const FALLBACK_URL = "https://api.deepinfra.com/v1/openai"; | |
| const FALLBACK_MODEL = "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo"; | |
| // Resolve model aliases (used for both default and custom paths) | |
| const aliasMap: Record<string, string> = { | |
| // Xiaomi MiMo | |
| mimo: "XiaomiMiMo/MiMo-V2-Flash", | |
| "mimo-flash": "XiaomiMiMo/MiMo-V2-Flash", | |
| "mimo-v2": "XiaomiMiMo/MiMo-V2-Flash", | |
| // Qwen models (DeepInfra) | |
| "qwen-coder": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", | |
| "qwen-coder-turbo": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", | |
| "qwen-coder-480b": "Qwen/Qwen3-Coder-480B-A35B-Instruct", | |
| "qwen3-235b": "Qwen/Qwen3-235B-A22B-Instruct-2507", | |
| "qwen3-thinking": "Qwen/Qwen3-235B-A22B-Thinking-2507", | |
| "qwen3.5": "Qwen/Qwen3.5-397B-A17B", | |
| "qwen3-32b": "Qwen/Qwen3-32B", | |
| "qwen3-8b": "Qwen/Qwen3-8B", | |
| "qwen3-coder": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", | |
| // Llama | |
| llama: "meta-llama/Llama-3.3-70B-Instruct", | |
| "llama-70b": "meta-llama/Llama-3.3-70B-Instruct", | |
| "llama-4": "meta-llama/Llama-4-Maverick-17B-128E", | |
| // DeepSeek | |
| deepseek: "deepseek-ai/DeepSeek-V3.2", | |
| "deepseek-r1": "deepseek-ai/DeepSeek-R1-0528", | |
| "deepseek-v3": "deepseek-ai/DeepSeek-V3.2", | |
| // NVIDIA | |
| nemotron: "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B", | |
| // StepFun | |
| step: "stepfun-ai/Step-3.5-Flash", | |
| "step-flash": "stepfun-ai/Step-3.5-Flash", | |
| // Uncensored | |
| hermes: "NousResearch/Hermes-3-Llama-3.1-70B", | |
| "hermes-405b": "NousResearch/Hermes-3-Llama-3.1-405B", | |
| uncensored: "NousResearch/Hermes-3-Llama-3.1-70B", | |
| // OpenAI GPT-5.x family | |
| "gpt5": "gpt-5.4", | |
| "gpt-5": "gpt-5.4", | |
| "gpt54": "gpt-5.4", | |
| // Anthropic aliases | |
| opus: "claude-opus-4-6", | |
| sonnet: "claude-sonnet-4-6", | |
| haiku: "claude-haiku-4-5-20251213", | |
| // xAI | |
| grok: "grok-3", | |
| "grok-3": "grok-3", | |
| gemini: "google/gemini-2.5-flash", | |
| "gemini-pro": "google/gemini-2.5-pro", | |
| }; | |
| // Treat empty, null, masked, or built-in providers as "use server default" | |
| const hasCustomKey = config.apiKey && config.apiKey.length > 4 && !config.apiKey.startsWith("••••"); | |
| if (config.apiProvider === "claw" || config.apiProvider === "default" || config.apiProvider === "huggingface" || config.apiProvider === "deepinfra" || !hasCustomKey) { | |
| const defaultModel = process.env.DEFAULT_MODEL || FALLBACK_MODEL; | |
| const resolvedModel = aliasMap[config.model] || config.model || defaultModel; | |
| // Use BUILT_IN_FORGE_API_URL from env (HuggingFace router, OpenAI, etc.), falling back to DeepInfra | |
| const baseUrl = (ENV.forgeApiUrl || FALLBACK_URL).replace(/\/$/, ""); | |
| const apiKey = ENV.forgeApiKey || process.env.BUILT_IN_FORGE_API_KEY || ""; | |
| console.log(`[agent] resolveApiConfig: using server default. URL=${baseUrl}, model=${resolvedModel}, hasKey=${!!apiKey}`); | |
| return { | |
| url: `${baseUrl}/chat/completions`, | |
| key: apiKey, | |
| model: resolvedModel || FALLBACK_MODEL, | |
| }; | |
| } | |
| // Custom provider path — user has their own API key | |
| let baseUrl = config.apiBaseUrl || ""; | |
| if (!baseUrl) { | |
| const providers: Record<string, string> = { | |
| deepinfra: "https://api.deepinfra.com/v1/openai", | |
| huggingface: "https://router.huggingface.co/v1", | |
| xai: "https://api.x.ai/v1", | |
| openrouter: "https://openrouter.ai/api/v1", | |
| openai: "https://api.openai.com/v1", | |
| anthropic: "https://api.anthropic.com/v1", | |
| groq: "https://api.groq.com/openai/v1", | |
| cerebras: "https://api.cerebras.ai/v1", | |
| ollama: "http://localhost:11434/v1", | |
| }; | |
| baseUrl = providers[config.apiProvider] || FALLBACK_URL; | |
| } | |
| const resolvedModel = aliasMap[config.model] || config.model || FALLBACK_MODEL; | |
| console.log(`[agent] resolveApiConfig: custom provider. URL=${baseUrl}, model=${resolvedModel}`); | |
| return { | |
| url: `${baseUrl.replace(/\/$/, "")}/chat/completions`, | |
| key: config.apiKey, | |
| model: resolvedModel, | |
| }; | |
| } | |
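| // Example (assuming no custom API key is configured): resolveApiConfig({ ...DEFAULT_CONFIG, model: "qwen-coder" }) | |
| // resolves the alias to Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo and targets the built-in endpoint, | |
| // falling back to https://api.deepinfra.com/v1/openai/chat/completions when BUILT_IN_FORGE_API_URL is unset. | |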
| /** | |
| * Send an SSE event to the client | |
| */ | |
| function sendSSE(res: Response, event: string, data: unknown) { | |
| try { | |
| res.write(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`); | |
| } catch { | |
| // Connection may be closed | |
| } | |
| } | |
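| // Wire format: sendSSE(res, "text_delta", { text: "Hi" }) writes | |
| //   event: text_delta | |
| //   data: {"text":"Hi"} | |
| // followed by a blank line, per the SSE spec. | |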
| /** | |
| * Run the agentic loop: send messages to LLM, execute tool calls, repeat. | |
| * This is the core of the agent — it loops until the LLM stops calling tools. | |
| */ | |
| export async function runAgentLoop( | |
| messages: AgentMessage[], | |
| sessionId: number, | |
| config: Partial<AgentConfig>, | |
| res: Response, | |
| signal?: AbortSignal | |
| ): Promise<{ | |
| finalMessages: AgentMessage[]; | |
| totalPromptTokens: number; | |
| totalCompletionTokens: number; | |
| totalCost: number; | |
| model: string; | |
| }> { | |
| const cfg = { ...DEFAULT_CONFIG, ...config }; | |
| const apiConfig = resolveApiConfig(cfg); | |
| const workDir = cfg.workDir || "/home/ubuntu"; | |
| // Get plan mode state | |
| const planState = getPlanMode(sessionId); | |
| // Read git status and diff (matches original ProjectContext::discover_with_git) | |
| const gitStatus = readGitStatus(workDir); | |
| const gitDiff = readGitDiff(workDir); | |
| // Build system prompt with full environment context | |
| const systemPrompt = buildSystemPrompt({ | |
| memory: cfg.memory, | |
| effortLevel: cfg.effortLevel || "high", | |
| planMode: planState.active, | |
| planSteps: planState.steps, | |
| customSystemPrompt: cfg.systemPrompt, | |
| workDir, | |
| platform: "linux", | |
| model: apiConfig.model, | |
| gitStatus, | |
| gitDiff, | |
| }); | |
| // Initialize UsageTracker (matches original conversation.rs) | |
| const usageTracker = UsageTracker.new(); | |
| // Build conversation with system message first | |
| const conversationMessages: AgentMessage[] = [ | |
| { role: "system", content: systemPrompt }, | |
| ...messages.filter((m) => m.role !== "system"), | |
| ]; | |
| let totalPromptTokens = 0; | |
| let totalCompletionTokens = 0; | |
| let totalCost = 0; | |
| let iterations = 0; | |
| let emptyResponseRetries = 0; | |
| const MAX_EMPTY_RETRIES = 3; | |
| // Safety limit: prevent infinite loops. Original claw-code uses usize::MAX but that | |
| // causes runaway loops with Qwen3 which sometimes fails to stop generating. | |
| // 200 iterations is more than enough for any real task. | |
| const MAX_ITERATIONS = cfg.maxIterations || 200; | |
| const assistantMessages: AgentMessage[] = []; | |
| const toolResultMessages: AgentMessage[] = []; | |
| // ─── Loop detection: minimal safety net ───────────────────────────── | |
| // Only detect EXACT same tool+args repeated 5+ times (true infinite loop). | |
| // Everything else is handled by MAX_ITERATIONS. | |
| const recentToolSignatures: string[] = []; | |
| const MAX_EXACT_REPEATS = 5; | |
| // ─── MCP Tools Dynamic Injection (matches original claw-code) ────────── | |
| // Initialize MCP servers from config and merge discovered tools with static TOOL_DEFINITIONS. | |
| // This is how the original claw-code dynamically builds the tool list: | |
| // 1. Load MCP server configs from .claw/settings.json | |
| // 2. Connect to each server via stdio JSON-RPC | |
| // 3. Call tools/list to discover available tools | |
| // 4. Prefix tool names as mcp__servername__toolname | |
| // 5. Merge with static tool definitions | |
| let allTools = [...TOOL_DEFINITIONS]; | |
| try { | |
| const mcpTools = await initializeMcpFromConfig(workDir); | |
| if (mcpTools.length > 0) { | |
| const mcpManager = getMcpManager(); | |
| if (mcpManager) { | |
| const mcpDefs = mcpManager.getToolDefinitions(); | |
| // Convert MCP tool format to OpenAI function calling format | |
| const mcpToolDefs = mcpDefs.map((t) => ({ | |
| type: "function" as const, | |
| function: { | |
| name: t.name, | |
| description: t.description, | |
| parameters: t.input_schema || { type: "object", properties: {} }, | |
| }, | |
| })); | |
| allTools = [...TOOL_DEFINITIONS, ...mcpToolDefs]; | |
| console.log(`[agent] MCP tools injected: ${mcpDefs.map((t) => t.name).join(", ")}`); | |
| sendSSE(res, "status", { | |
| status: "mcp_ready", | |
| message: `MCP tools loaded: ${mcpDefs.length} tools from ${mcpManager.getConnectedServers().length} servers`, | |
| }); | |
| } | |
| } | |
| } catch (err: any) { | |
| console.error(`[agent] MCP initialization error (non-fatal):`, err.message); | |
| // MCP init failure is non-fatal — agent continues with static tools only | |
| } | |
| // ─── Context-aware compaction config ────────────────────────────── | |
| // Original claw-code uses percentage-based thresholds, not fixed 10k tokens. | |
| // We compute the threshold as 70% of the model's context window. | |
| const contextWindow = MODEL_CONTEXT_WINDOWS[apiConfig.model] || DEFAULT_CONTEXT_WINDOW; | |
| const dynamicCompactionConfig: CompactionConfig = { | |
| preserveRecentMessages: DEFAULT_COMPACTION_CONFIG.preserveRecentMessages, | |
| maxEstimatedTokens: Math.floor(contextWindow * 0.7), | |
| }; | |
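| // e.g. a 262144-token context window yields a compaction threshold of floor(262144 * 0.7) = 183500 tokens. | |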
| sendSSE(res, "status", { status: "thinking", message: "Processing your request..." }); | |
| while (iterations < MAX_ITERATIONS) { | |
| iterations++; | |
| if (signal?.aborted) { | |
| sendSSE(res, "status", { status: "cancelled", message: "Request cancelled" }); | |
| break; | |
| } | |
| // Build API request | |
| // Determine max_tokens limit based on provider | |
| const isDeepInfra = apiConfig.url.includes("deepinfra.com"); | |
| const isHuggingFace = apiConfig.url.includes("huggingface.co"); | |
| const maxTokensLimit = isHuggingFace ? 32000 : 65536; | |
| // Detect if model supports thinking/reasoning mode (Qwen3 Thinking, DeepSeek-R1) | |
| const isThinkingModel = apiConfig.model.includes("Thinking") || apiConfig.model.includes("R1"); | |
| const payload: Record<string, unknown> = { | |
| model: apiConfig.model, | |
| messages: conversationMessages.map((m) => { | |
| const msg: Record<string, unknown> = { role: m.role, content: m.content }; | |
| if (m.tool_calls) msg.tool_calls = m.tool_calls; | |
| if (m.tool_call_id) msg.tool_call_id = m.tool_call_id; | |
| if (m.name) msg.name = m.name; | |
| return msg; | |
| }), | |
| tools: allTools, | |
| // Match original: always "auto" — let the model decide what tools to use | |
| tool_choice: "auto", | |
| max_tokens: Math.min(cfg.maxTokens, maxTokensLimit), | |
| temperature: isThinkingModel ? 0.6 : cfg.temperature, // thinking models need lower temp | |
| top_p: cfg.topP, | |
| stream: true, | |
| }; | |
| // Enable thinking/reasoning for supported models (Qwen3 Thinking, DeepSeek-R1) | |
| if (isThinkingModel && isDeepInfra) { | |
| (payload as any).extra_body = { enable_thinking: true }; | |
| } | |
| // Qwen3-Coder: hybrid thinking mode — model decides when to think | |
| // DO NOT disable thinking; Qwen3 automatically uses <think> blocks for complex tasks | |
| // The /no_think in system prompt is only a soft hint for simple responses, | |
| // the model will override it when it needs to reason through tool calls or code | |
| // NOTE: Prefill technique (empty assistant message) was removed because it caused | |
| // the model to "talk to itself" — seeing an empty assistant message made it think | |
| // it already responded and continue generating follow-up messages in a loop. | |
| // Anti-censorship is handled purely via system prompt directives instead. | |
| sendSSE(res, "message_start", { iteration: iterations }); | |
| try { | |
| // ─── Retry loop: transient errors retry indefinitely at a fixed 2s interval ───── | |
| // 429 = rate limit: the API is telling us to slow down, NOT to give up. | |
| // 500/502/503 and network errors are treated the same way — we keep retrying | |
| // every RETRY_DELAY_MS until the request succeeds or the abort signal fires. | |
| let response: globalThis.Response | null = null; | |
| let lastErrorText = ""; | |
| let lastStatus = 0; | |
| let serverRetryCount = 0; | |
| let rateLimitRetryCount = 0; | |
| while (true) { | |
| if (signal?.aborted) break; | |
| try { | |
| response = await fetch(apiConfig.url, { | |
| method: "POST", | |
| headers: { | |
| "Content-Type": "application/json", | |
| Authorization: `Bearer ${apiConfig.key}`, | |
| }, | |
| body: JSON.stringify(payload), | |
| signal, | |
| }); | |
| } catch (fetchErr: any) { | |
| // Network error (DNS, connection refused, etc.) — retry every 2s | |
| if (signal?.aborted) break; | |
| serverRetryCount++; | |
| console.error(`[agent] Fetch error (retry #${serverRetryCount}):`, fetchErr.message); | |
| sendSSE(res, "status", { | |
| status: "retrying", | |
| message: `Network error, retrying in 2s... (attempt #${serverRetryCount})`, | |
| }); | |
| await new Promise((r) => setTimeout(r, RETRY_DELAY_MS)); | |
| continue; | |
| } | |
| if (response.ok) break; | |
| lastStatus = response.status; | |
| lastErrorText = await response.text(); | |
| // ─── 429 Rate Limit: INFINITE retry every 2s ─── | |
| if (response.status === 429) { | |
| rateLimitRetryCount++; | |
| console.log(`[agent] Rate limited (429) — retry #${rateLimitRetryCount} in 2s`); | |
| sendSSE(res, "status", { | |
| status: "rate_limited", | |
| message: `Rate limited by API — retrying in 2s... (attempt #${rateLimitRetryCount})`, | |
| }); | |
| await new Promise((r) => setTimeout(r, RETRY_DELAY_MS)); | |
| response = null; | |
| continue; // NEVER give up on 429 | |
| } | |
| // ─── 500/502/503 Server errors: INFINITE retry every 2s ─── | |
| if ([500, 502, 503].includes(response.status)) { | |
| serverRetryCount++; | |
| console.log(`[agent] Server error ${response.status} — retry #${serverRetryCount} in 2s`); | |
| sendSSE(res, "status", { | |
| status: "retrying", | |
| message: `Server error ${response.status}, retrying in 2s... (attempt #${serverRetryCount})`, | |
| }); | |
| await new Promise((r) => setTimeout(r, RETRY_DELAY_MS)); | |
| response = null; | |
| continue; | |
| } | |
| // Any other error (400, 401, 403, 404, etc.) — don't retry | |
| break; | |
| } | |
| if (!response || !response.ok) { | |
| console.error(`[agent] API error ${lastStatus}:`, lastErrorText); | |
| console.error(`[agent] Payload model:`, apiConfig.model); | |
| console.error(`[agent] Payload messages count:`, (payload.messages as any[]).length); | |
| // ─── AUTO-COMPACT on context overflow (400 error) ───────────── | |
| if (lastStatus === 400 && (lastErrorText.includes("context_length") || lastErrorText.includes("too many tokens") || lastErrorText.includes("maximum context") || lastErrorText.includes("token limit") || lastErrorText.includes("too long"))) { | |
| console.log(`[agent] Context overflow detected — auto-compacting conversation...`); | |
| sendSSE(res, "status", { | |
| status: "compacting", | |
| message: "Context window exceeded — auto-compacting conversation...", | |
| }); | |
| try { | |
| const session = agentMessagesToSession(conversationMessages); | |
| // LLM-based summarization: use the same API to produce a real summary | |
| const llmFetch = async (msgs: Array<{role: string; content: string}>) => { | |
| const summaryResp = await fetch(apiConfig.url, { | |
| method: "POST", | |
| headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiConfig.key}` }, | |
| body: JSON.stringify({ | |
| model: apiConfig.model, | |
| messages: msgs, | |
| max_tokens: 2000, | |
| temperature: 0.3, | |
| stream: false, | |
| }), | |
| }); | |
| if (!summaryResp.ok) throw new Error(`LLM summary failed: ${summaryResp.status}`); | |
| const json = await summaryResp.json(); | |
| return json.choices?.[0]?.message?.content || ""; | |
| }; | |
| const compactResult = await compactSessionWithLLM(session, dynamicCompactionConfig, llmFetch); | |
| if (compactResult.removedMessageCount > 0) { | |
| const compactedAgentMessages = sessionToAgentMessages(compactResult.compactedSession); | |
| conversationMessages.length = 0; | |
| conversationMessages.push({ role: "system", content: systemPrompt }); | |
| conversationMessages.push(...compactedAgentMessages); | |
| console.log(`[agent] Auto-compact (LLM): removed ${compactResult.removedMessageCount} messages, kept ${conversationMessages.length}`); | |
| sendSSE(res, "auto_compact", { | |
| removedCount: compactResult.removedMessageCount, | |
| keptCount: conversationMessages.length, | |
| summary: compactResult.formattedSummary, | |
| }); | |
| continue; // retry with compacted context | |
| } else { | |
| console.error(`[agent] Auto-compact produced no reduction — breaking`); | |
| sendSSE(res, "error", { | |
| message: `Context overflow but compaction couldn't reduce further`, | |
| details: lastErrorText, | |
| }); | |
| break; | |
| } | |
| } catch (compactErr: any) { | |
| console.error(`[agent] Auto-compact failed:`, compactErr.message); | |
| sendSSE(res, "error", { | |
| message: `Context overflow — auto-compact failed: ${compactErr.message}`, | |
| details: lastErrorText, | |
| }); | |
| break; | |
| } | |
| } | |
| // Non-context 400 errors — log details for debugging | |
| if (lastStatus === 400) { | |
| console.error(`[agent] Full error body:`, lastErrorText); | |
| (payload.messages as any[]).forEach((m: any, i: number) => { | |
| console.error(`[agent] msg[${i}] role=${m.role} content_type=${typeof m.content} content_len=${String(m.content || '').length} has_tool_calls=${!!m.tool_calls} has_tool_call_id=${!!m.tool_call_id}`); | |
| }); | |
| } | |
| sendSSE(res, "error", { | |
| message: `API error: ${lastStatus}${lastStatus === 429 ? ' (rate limit)' : ''} — ${lastErrorText.substring(0, 200)}`, | |
| details: lastErrorText, | |
| }); | |
| break; | |
| } | |
| // Process streaming response | |
| let result: { content: string; toolCalls: Array<{ id: string; type: "function"; function: { name: string; arguments: string } }>; usage?: any }; | |
| try { | |
| result = await processStream(response, res, signal); | |
| } catch (streamErr: any) { | |
| // Stream processing error — treat as transient, retry | |
| console.error(`[agent] Stream processing error:`, streamErr.message); | |
| if (emptyResponseRetries++ < MAX_EMPTY_RETRIES) { | |
| sendSSE(res, "status", { status: "retrying", message: `Stream error, retrying... (${emptyResponseRetries}/${MAX_EMPTY_RETRIES})` }); | |
| await new Promise(r => setTimeout(r, 1500)); | |
| continue; | |
| } | |
| sendSSE(res, "error", { message: `Stream failed after ${MAX_EMPTY_RETRIES} retries: ${streamErr.message}` }); | |
| break; | |
| } | |
| // ─── Bug #1 fix: Handle empty LLM response with retry ───────── | |
| // Original claw-code retries on empty response instead of crashing. | |
| // Open-source models via HuggingFace often return empty streams. | |
| if (!result.content && result.toolCalls.length === 0) { | |
| if (emptyResponseRetries++ < MAX_EMPTY_RETRIES) { | |
| console.warn(`[agent] Empty response from LLM — retry ${emptyResponseRetries}/${MAX_EMPTY_RETRIES}`); | |
| sendSSE(res, "status", { status: "retrying", message: `Empty response from model, retrying... (${emptyResponseRetries}/${MAX_EMPTY_RETRIES})` }); | |
| await new Promise(r => setTimeout(r, 1500)); | |
| continue; // retry same iteration | |
| } | |
| console.error(`[agent] LLM returned empty response ${MAX_EMPTY_RETRIES} times — giving up`); | |
| sendSSE(res, "error", { message: `Model returned empty response after ${MAX_EMPTY_RETRIES} retries. Try a different model or reduce context.` }); | |
| break; | |
| } | |
| emptyResponseRetries = 0; // reset on successful response | |
| // Track usage with UsageTracker (matches original) | |
| if (result.usage) { | |
| totalPromptTokens += result.usage.prompt_tokens || 0; | |
| totalCompletionTokens += result.usage.completion_tokens || 0; | |
| usageTracker.record({ | |
| input_tokens: result.usage.prompt_tokens || 0, | |
| output_tokens: result.usage.completion_tokens || 0, | |
| cache_creation_input_tokens: result.usage.cache_creation_input_tokens || 0, | |
| cache_read_input_tokens: result.usage.cache_read_input_tokens || 0, | |
| }); | |
| // Keep a running cost estimate so message_end events report a non-zero value mid-run; | |
| // the final figure is recomputed from the full UsageTracker after the loop. | |
| totalCost = totalCostUsd(estimateCostUsdWithPricing(usageTracker.cumulativeUsage(), pricingForModel(apiConfig.model) ?? defaultSonnetTierPricing())); | |
| } | |
| // Add assistant message to conversation | |
| const assistantMessage: AgentMessage = { | |
| role: "assistant", | |
| // Match original: null when no content (even with tool_calls) | |
| content: result.content || null, | |
| }; | |
| if (result.toolCalls && result.toolCalls.length > 0) { | |
| assistantMessage.tool_calls = result.toolCalls; | |
| } | |
| conversationMessages.push(assistantMessage); | |
| assistantMessages.push(assistantMessage); | |
| // If no tool calls, we're done — the LLM has finished responding. | |
| // This matches the original claw-code behavior exactly: | |
| // the model decides when to stop by not calling tools. | |
| if (!result.toolCalls || result.toolCalls.length === 0) { | |
| sendSSE(res, "message_end", { | |
| promptTokens: totalPromptTokens, | |
| completionTokens: totalCompletionTokens, | |
| cost: totalCost, | |
| model: apiConfig.model, | |
| }); | |
| break; | |
| } | |
| // ─── Minimal loop detection: only catch TRUE infinite loops ─────── | |
| // Only break if the EXACT same tool+args is repeated 5+ times. | |
| // This is the only safety net beyond MAX_ITERATIONS. | |
| // The original claw-code has NO loop detection at all — it trusts the model. | |
| const currentToolSig = result.toolCalls.map((tc: any) => `${tc.function.name}:${tc.function.arguments}`).join("|"); | |
| recentToolSignatures.push(currentToolSig); | |
| if (recentToolSignatures.length > MAX_EXACT_REPEATS) { | |
| recentToolSignatures.shift(); | |
| } | |
| if (recentToolSignatures.length >= MAX_EXACT_REPEATS) { | |
| const allSame = recentToolSignatures.every(r => r === recentToolSignatures[0]); | |
| if (allSame) { | |
| console.warn(`[agent] Infinite loop detected: exact same tool call repeated ${MAX_EXACT_REPEATS} times — breaking`); | |
| sendSSE(res, "error", { | |
| message: `⚠️ Infinite loop detected. Try rephrasing your request.`, | |
| }); | |
| sendSSE(res, "message_end", { | |
| promptTokens: totalPromptTokens, | |
| completionTokens: totalCompletionTokens, | |
| cost: totalCost, | |
| model: apiConfig.model, | |
| }); | |
| break; | |
| } | |
| } | |
| // ─── Execute tool calls ────────────────────────────────────────── | |
| // Bug #2+#3 fix: Each tool call is wrapped in its own try-catch. | |
| // Original claw-code sends tool errors back to LLM as tool results, | |
| // letting the model decide how to handle them. We NEVER break the | |
| // loop on a tool error — only on fatal API/stream errors. | |
| for (const toolCall of result.toolCalls) { | |
| const toolName = toolCall.function.name; | |
| let toolArgs: Record<string, unknown> = {}; | |
| let argParseError = false; | |
| try { | |
| toolArgs = JSON.parse(toolCall.function.arguments || "{}"); | |
| } catch (parseErr: any) { | |
| // Try JSON repair before giving up | |
| try { | |
| const { jsonrepair } = await import("jsonrepair"); | |
| const repaired = jsonrepair(toolCall.function.arguments || "{}"); | |
| toolArgs = JSON.parse(repaired); | |
| console.info(`[agent] Repaired malformed JSON for ${toolName}`); | |
| } catch (repairErr: any) { | |
| argParseError = true; | |
| console.warn(`[agent] Malformed tool args for ${toolName} (repair failed):`, toolCall.function.arguments?.substring(0, 200)); | |
| } | |
| } | |
| sendSSE(res, "tool_call_start", { | |
| id: toolCall.id, | |
| name: toolName, | |
| arguments: toolCall.function.arguments, | |
| }); | |
| let toolOutput: string; | |
| let isError = false; | |
| // If JSON args were malformed, skip execution and tell LLM to fix | |
| if (argParseError) { | |
| toolOutput = `Error: Your tool call arguments for '${toolName}' contained malformed JSON. The raw arguments were: ${(toolCall.function.arguments || "").substring(0, 500)}. Please fix the JSON and try again.`; | |
| isError = true; | |
| } else try { | |
| // ─── Pre-tool hooks (matches original HookRunner.run_pre_tool_use) ── | |
| const preHookResult = await runPreToolHooks(toolName, sessionId, toolArgs, workDir); | |
| if (!preHookResult.allowed) { | |
| // Hook denied the tool execution (exit code 2 = deny) | |
| toolOutput = preHookResult.message || `Tool '${toolName}' was denied by pre-tool hook`; | |
| isError = true; | |
| sendSSE(res, "permission_denied", { | |
| toolName, | |
| toolCallId: toolCall.id, | |
| reason: toolOutput, | |
| needsPrompt: false, | |
| }); | |
| } else { | |
| // Execute the tool with the correct working directory | |
| const toolResult = await executeTool(toolName, toolArgs, sessionId, workDir); | |
| if (toolResult.isError && toolResult.output.includes("needs one-time approval")) { | |
| sendSSE(res, "permission_prompt", { | |
| toolName, | |
| toolCallId: toolCall.id, | |
| reason: toolResult.output, | |
| }); | |
| } | |
| toolOutput = toolResult.output; | |
| isError = toolResult.isError || false; | |
| // Merge pre-hook feedback (matches original merge_hook_feedback) | |
| if (preHookResult.message) { | |
| toolOutput = mergeHookFeedback([preHookResult.message], toolOutput, false); | |
| } | |
| // ─── Post-tool hooks (matches original HookRunner.run_post_tool_use) ── | |
| const postHookResult = await runPostToolHooks(toolName, sessionId, toolResult, workDir); | |
| toolOutput = postHookResult.output; | |
| isError = postHookResult.isError || false; | |
| } | |
| } catch (toolExecError: any) { | |
| // ─── Bug #3 fix: Tool exception → error result for LLM ────── | |
| // Original claw-code: tool errors become tool results, NOT loop breaks. | |
| // The LLM sees the error and can try a different approach. | |
| console.error(`[agent] Tool '${toolName}' threw exception:`, toolExecError.message); | |
| toolOutput = `Tool execution error: ${toolExecError.message}`; | |
| isError = true; | |
| } | |
| // No error classification or guidance injection. | |
| // The model receives raw error output and decides how to handle it. | |
| // This matches the original claw-code behavior. | |
| sendSSE(res, "tool_result", { | |
| toolCallId: toolCall.id, | |
| toolName, | |
| output: toolOutput, | |
| isError, | |
| durationMs: 0, // per-call execution time is not measured here | |
| }); | |
| // ─── Special SSE events for interactive tools ──────────────── | |
| // SendUserMessage / Brief: emit SSE for frontend display but DO NOT break the loop. | |
| // Original claw-code does NOT stop on SendUserMessage — the model can send | |
| // progress updates ("checking...", "found vulnerability...") AND continue working. | |
| // Breaking here was the #1 cause of the agent stopping mid-task. | |
| if ((toolName === "SendUserMessage" || toolName === "Brief" || toolName === "ask_user") && !isError) { | |
| sendSSE(res, "assistant_message", { | |
| message: toolArgs.message || toolArgs.question || "", | |
| attachments: toolArgs.attachments || [], | |
| }); | |
| } | |
| // Plan/Todo tools: emit plan state updates | |
| if (["TodoWrite", "plan_create", "plan_update", "enter_plan_mode", "exit_plan_mode"].includes(toolName)) { | |
| const updatedPlan = getPlanMode(sessionId); | |
| sendSSE(res, "plan_update", { | |
| active: updatedPlan.active, | |
| steps: updatedPlan.steps, | |
| }); | |
| } | |
| // Add tool result to conversation for the LLM to process | |
| const toolResultMsg: AgentMessage = { | |
| role: "tool", | |
| content: toolOutput, | |
| tool_call_id: toolCall.id, | |
| name: toolName, | |
| }; | |
| conversationMessages.push(toolResultMsg); | |
| toolResultMessages.push(toolResultMsg); | |
| } | |
| // No consecutive error detection — the model handles errors naturally. | |
| // MAX_ITERATIONS (200) is the ultimate safety net. | |
| // SendUserMessage does NOT break the loop (matches original). | |
| // ─── Proactive auto-compact check ───────────────────────────── | |
| // Check if conversation is approaching context window limit and compact proactively | |
| const estimatedTokens = estimateConversationTokens(conversationMessages); | |
| // contextWindow was already computed above from MODEL_CONTEXT_WINDOWS | |
| const contextUsagePercent = Math.round((estimatedTokens / contextWindow) * 100); | |
| // Emit context usage SSE for frontend tracking | |
| sendSSE(res, "context_usage", { | |
| estimatedTokens, | |
| contextWindow, | |
| usagePercent: contextUsagePercent, | |
| messageCount: conversationMessages.length, | |
| }); | |
| // Proactive compaction at 80% context usage | |
| if (contextUsagePercent >= 80) { | |
| console.log(`[agent] Context at ${contextUsagePercent}% — proactive auto-compact`); | |
| sendSSE(res, "status", { | |
| status: "compacting", | |
| message: `Context at ${contextUsagePercent}% — auto-compacting to free space...`, | |
| }); | |
| try { | |
| const session = agentMessagesToSession(conversationMessages); | |
| // LLM-based summarization for proactive compaction | |
| const llmFetchProactive = async (msgs: Array<{role: string; content: string}>) => { | |
| const summaryResp = await fetch(apiConfig.url, { | |
| method: "POST", | |
| headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiConfig.key}` }, | |
| body: JSON.stringify({ | |
| model: apiConfig.model, | |
| messages: msgs, | |
| max_tokens: 2000, | |
| temperature: 0.3, | |
| stream: false, | |
| }), | |
| }); | |
| if (!summaryResp.ok) throw new Error(`LLM summary failed: ${summaryResp.status}`); | |
| const json = await summaryResp.json(); | |
| return json.choices?.[0]?.message?.content || ""; | |
| }; | |
| const compactResult = await compactSessionWithLLM(session, dynamicCompactionConfig, llmFetchProactive); | |
| if (compactResult.removedMessageCount > 0) { | |
| const compactedAgentMessages = sessionToAgentMessages(compactResult.compactedSession); | |
| conversationMessages.length = 0; | |
| // CRITICAL: Re-prepend original system prompt before compacted summary. | |
| conversationMessages.push({ role: "system", content: systemPrompt }); | |
| conversationMessages.push(...compactedAgentMessages); | |
| // Inject current todo/plan state so the agent doesn't lose its plan after compaction | |
| const todoState = (() => { | |
| try { | |
| const executor = require("../tools/executor"); | |
| const plan = executor.getPlanMode(sessionId); | |
| const todos = executor.todoLists?.get?.(sessionId) || []; | |
| let state = ""; | |
| if (todos.length > 0) { | |
| state += "\n\n[PRESERVED TODO LIST]\n" + todos.map((t: any, i: number) => { | |
| const icon = t.status === "completed" ? "\u2713" : t.status === "in_progress" ? "\u25cf" : "\u25cb"; | |
| return ` ${icon} ${i + 1}. ${t.content} [${t.status}]`; | |
| }).join("\n"); | |
| } | |
| if (plan?.active && plan.steps?.length > 0) { | |
| state += "\n\n[PRESERVED PLAN]\n" + plan.steps.map((s: any) => { | |
| const icon = s.status === "done" ? "\u2713" : s.status === "in_progress" ? "\u25cf" : "\u25a1"; | |
| return ` ${icon} ${s.id}. ${s.text} [${s.status}]`; | |
| }).join("\n"); | |
| } | |
| return state; | |
| } catch { return ""; } | |
| })(); | |
| if (todoState) { | |
| // Append todo state to the last message in the compacted history so the model sees it | |
| const lastMsg = conversationMessages[conversationMessages.length - 1]; | |
| if (lastMsg && typeof lastMsg.content === "string") { | |
| lastMsg.content += todoState; | |
| } | |
| } | |
| sendSSE(res, "auto_compact", { | |
| removedCount: compactResult.removedMessageCount, | |
| keptCount: conversationMessages.length, | |
| summary: compactResult.formattedSummary, | |
| }); | |
| console.log(`[agent] Proactive compact: removed ${compactResult.removedMessageCount} messages`); | |
| } | |
| } catch (compactErr: any) { | |
| console.error(`[agent] Proactive compact failed (non-fatal):`, compactErr.message); | |
| } | |
| } | |
| // ─── Buddy events SSE ──────────────────────────────────────────── | |
| // Emit buddy_event for each tool call so frontend can award XP | |
| for (const toolCall of result.toolCalls) { | |
| const tn = toolCall.function.name; | |
| sendSSE(res, "buddy_event", { | |
| type: "tool_call", | |
| toolName: tn, | |
| iteration: iterations, | |
| }); | |
| // Special buddy events for file creation | |
| if (tn === "write_file" || tn === "create_file") { | |
| sendSSE(res, "buddy_event", { | |
| type: "file_created", | |
| toolName: tn, | |
| iteration: iterations, | |
| }); | |
| } | |
| } | |
| // Continue the loop — LLM will see tool results and decide next action | |
| sendSSE(res, "status", { | |
| status: "thinking", | |
| message: `Processing tool results (iteration ${iterations}, context: ${contextUsagePercent}%)...`, | |
| }); | |
| } catch (error: any) { | |
| // ─── Bug #2 fix: Distinguish fatal vs transient errors ──────── | |
| // Only AbortError and unrecoverable errors should break the loop. | |
| // Stream/fetch errors are already handled above with retry logic. | |
| if (error.name === "AbortError" || signal?.aborted) { | |
| sendSSE(res, "status", { status: "cancelled", message: "Request cancelled" }); | |
| break; | |
| } | |
| // For other errors, log and break (these are truly unexpected) | |
| console.error(`[agent] Unexpected error in agent loop:`, error.message, error.stack); | |
| sendSSE(res, "error", { message: error.message || "Unknown error" }); | |
| break; | |
| } | |
| } | |
| if (iterations >= MAX_ITERATIONS) { | |
| sendSSE(res, "error", { message: `Maximum iterations (${MAX_ITERATIONS}) reached. Use /compact to reduce context and continue.` }); | |
| } | |
| // ─── Buddy: session_completed event ───────────────────────────────── | |
| // Emit session_completed so Buddy can award XP for finishing a turn | |
| sendSSE(res, "buddy_event", { | |
| type: "session_completed", | |
| iterations, | |
| toolCallCount: toolResultMessages.length, | |
| }); | |
| // Calculate cost using UsageTracker (matches original) | |
| const cumulativeUsage = usageTracker.cumulativeUsage(); | |
| const modelPricing = pricingForModel(apiConfig.model) ?? defaultSonnetTierPricing(); | |
| const costEstimate = estimateCostUsdWithPricing(cumulativeUsage, modelPricing); | |
| totalCost = totalCostUsd(costEstimate); | |
| // Emit usage summary lines (matches original summary_lines_for_model) | |
| const usageSummary = summaryLinesForModel(cumulativeUsage, "session", apiConfig.model); | |
| sendSSE(res, "usage", { | |
| promptTokens: totalPromptTokens, | |
| completionTokens: totalCompletionTokens, | |
| totalTokens: totalPromptTokens + totalCompletionTokens, | |
| cost: totalCost, | |
| cacheCreationTokens: cumulativeUsage.cache_creation_input_tokens, | |
| cacheReadTokens: cumulativeUsage.cache_read_input_tokens, | |
| usageSummary, | |
| turns: usageTracker.turns(), | |
| formattedCost: formatUsd(totalCost), | |
| }); | |
| return { | |
| finalMessages: conversationMessages.filter((m) => m.role !== "system"), | |
| totalPromptTokens, | |
| totalCompletionTokens, | |
| totalCost, | |
| model: apiConfig.model, | |
| }; | |
| } | |
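| // Usage sketch (illustrative only — the route path, request shape, and header handling below are | |
| // assumptions about the calling Express handler, not part of this module): | |
| // | |
| //   app.post("/api/sessions/:id/chat", async (req, res) => { | |
| //     res.setHeader("Content-Type", "text/event-stream"); | |
| //     res.setHeader("Cache-Control", "no-cache"); | |
| //     const abort = new AbortController(); | |
| //     req.on("close", () => abort.abort()); | |
| //     const result = await runAgentLoop(req.body.messages, Number(req.params.id), req.body.config ?? {}, res, abort.signal); | |
| //     res.write(`event: done\ndata: ${JSON.stringify({ model: result.model, cost: result.totalCost })}\n\n`); | |
| //     res.end(); | |
| //   }); | |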
| /** | |
| * Process a streaming response from the LLM API (OpenAI-compatible SSE format) | |
| */ | |
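| // Incoming chunks follow the OpenAI streaming format (illustrative; tool name hypothetical): | |
| //   data: {"choices":[{"delta":{"content":"Hello"}}]} | |
| //   data: {"choices":[{"delta":{"tool_calls":[{"index":0,"id":"call_0","function":{"name":"bash","arguments":"{\"cmd\""}}]}}]} | |
| //   data: [DONE] | |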
| async function processStream( | |
| response: globalThis.Response, | |
| res: Response, | |
| signal?: AbortSignal | |
| ): Promise<{ | |
| content: string; | |
| toolCalls: Array<{ | |
| id: string; | |
| type: "function"; | |
| function: { name: string; arguments: string }; | |
| }>; | |
| usage?: { prompt_tokens: number; completion_tokens: number; cache_creation_input_tokens?: number; cache_read_input_tokens?: number }; | |
| }> { | |
| const reader = response.body?.getReader(); | |
| if (!reader) throw new Error("No response body"); | |
| const decoder = new TextDecoder(); | |
| let content = ""; | |
| const toolCalls: Map< | |
| number, | |
| { id: string; type: "function"; function: { name: string; arguments: string } } | |
| > = new Map(); | |
| let usage: { prompt_tokens: number; completion_tokens: number; cache_creation_input_tokens?: number; cache_read_input_tokens?: number } | undefined; | |
| let buffer = ""; | |
| try { | |
| while (true) { | |
| if (signal?.aborted) break; | |
| const { done, value } = await reader.read(); | |
| if (done) break; | |
| buffer += decoder.decode(value, { stream: true }); | |
| // Process complete SSE lines | |
| const lines = buffer.split("\n"); | |
| buffer = lines.pop() || ""; | |
| for (const line of lines) { | |
| if (!line.startsWith("data: ")) continue; | |
| const data = line.slice(6).trim(); | |
| if (data === "[DONE]") continue; | |
| try { | |
| const chunk = JSON.parse(data); | |
| const delta = chunk.choices?.[0]?.delta; | |
| // Detect API errors returned inside the SSE stream (e.g. DeepInfra "Operation not allowed") | |
| if (chunk.error) { | |
| const errMsg = chunk.error.message || chunk.error.type || JSON.stringify(chunk.error); | |
| console.error(`[agent] API error in stream: ${errMsg}`); | |
| throw new Error(`API error in stream: ${errMsg}`); | |
| } | |
| if (!delta) { | |
| if (chunk.usage) { | |
| usage = { | |
| prompt_tokens: chunk.usage.prompt_tokens || 0, | |
| completion_tokens: chunk.usage.completion_tokens || 0, | |
| cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0, | |
| cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0, | |
| }; | |
| } | |
| continue; | |
| } | |
| // Reasoning/thinking content (Qwen3 Thinking, DeepSeek-R1) | |
| // These models return reasoning in delta.reasoning_content before the actual response | |
| if (delta.reasoning_content) { | |
| sendSSE(res, "thinking_delta", { text: delta.reasoning_content }); | |
| } | |
| // Text content streaming | |
| if (delta.content) { | |
| content += delta.content; | |
| sendSSE(res, "text_delta", { text: delta.content }); | |
| } | |
| // Tool call streaming | |
| if (delta.tool_calls) { | |
| for (const tc of delta.tool_calls) { | |
| const idx = tc.index ?? 0; | |
| if (!toolCalls.has(idx)) { | |
| toolCalls.set(idx, { | |
| id: tc.id || `call_${idx}_${Date.now()}`, | |
| type: "function", | |
| function: { name: tc.function?.name || "", arguments: "" }, | |
| }); | |
| } | |
| const existing = toolCalls.get(idx)!; | |
| if (tc.id) existing.id = tc.id; | |
| if (tc.function?.name) existing.function.name = tc.function.name; | |
| if (tc.function?.arguments) { | |
| existing.function.arguments += tc.function.arguments; | |
| sendSSE(res, "tool_call_delta", { | |
| id: existing.id, | |
| name: existing.function.name, | |
| arguments: tc.function.arguments, | |
| }); | |
| } | |
| } | |
| } | |
| // Usage info | |
| if (chunk.usage) { | |
| usage = { | |
| prompt_tokens: chunk.usage.prompt_tokens || 0, | |
| completion_tokens: chunk.usage.completion_tokens || 0, | |
| cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0, | |
| cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0, | |
| }; | |
| } | |
| } catch (parseErr: any) { | |
| // Re-throw API errors (these are NOT malformed JSON — they're real errors) | |
| if (parseErr?.message?.startsWith('API error in stream:')) { | |
| throw parseErr; | |
| } | |
| // Skip genuinely malformed JSON chunks (partial SSE data, etc.) | |
| } | |
| } | |
| } | |
| } finally { | |
| reader.releaseLock(); | |
| } | |
| // Process remaining buffer (last line without trailing \n) | |
| if (buffer.trim() && buffer.startsWith("data: ")) { | |
| const data = buffer.slice(6).trim(); | |
| if (data !== "[DONE]") { | |
| try { | |
| const chunk = JSON.parse(data); | |
| const delta = chunk.choices?.[0]?.delta; | |
| if (delta?.content) { | |
| content += delta.content; | |
| sendSSE(res, "text_delta", { text: delta.content }); | |
| } | |
| if (delta?.tool_calls) { | |
| for (const tc of delta.tool_calls) { | |
| const idx = tc.index ?? 0; | |
| if (!toolCalls.has(idx)) { | |
| toolCalls.set(idx, { | |
| id: tc.id || `call_${idx}_${Date.now()}`, | |
| type: "function", | |
| function: { name: tc.function?.name || "", arguments: "" }, | |
| }); | |
| } | |
| const existing = toolCalls.get(idx)!; | |
| if (tc.id) existing.id = tc.id; | |
| if (tc.function?.name) existing.function.name = tc.function.name; | |
| if (tc.function?.arguments) existing.function.arguments += tc.function.arguments; | |
| } | |
| } | |
| if (chunk.usage) { | |
| usage = { | |
| prompt_tokens: chunk.usage.prompt_tokens || 0, | |
| completion_tokens: chunk.usage.completion_tokens || 0, | |
| cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0, | |
| cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0, | |
| }; | |
| } | |
| // Check finish_reason for truncation | |
| const finishReason = chunk.choices?.[0]?.finish_reason; | |
| if (finishReason === "length") { | |
| console.warn("[agent] Response truncated (finish_reason=length) — tool call args may be incomplete"); | |
| } | |
| } catch { /* skip malformed */ } | |
| } | |
| } | |
| // Original claw-code retries on empty response instead of throwing. | |
| if (content.length === 0 && toolCalls.size === 0) { | |
| console.warn("[agent] LLM returned empty response — will be retried by agent loop"); | |
| } | |
| return { | |
| content, | |
| toolCalls: Array.from(toolCalls.values()), | |
| usage, | |
| }; | |
| } | |
| /** | |
| * Estimate cost based on model and token counts | |
| */ | |
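| // Note: not called within this module — the agent loop derives cost via UsageTracker + pricingForModel above. | |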
| function estimateCost(model: string, promptTokens: number, completionTokens: number): number { | |
| // Pricing per 1M tokens — aligned with original claw-code model registry | |
| const pricing: Record<string, { input: number; output: number }> = { | |
| // Claw API / Anthropic | |
| "claude-opus-4-6": { input: 15.00, output: 75.00 }, | |
| "claude-sonnet-4-6": { input: 3.00, output: 15.00 }, | |
| "claude-haiku-4-5-20251213": { input: 0.80, output: 4.00 }, | |
| // xAI Grok | |
| "grok-3": { input: 3.00, output: 15.00 }, | |
| "grok-3-mini": { input: 0.30, output: 0.50 }, | |
| "grok-2": { input: 2.00, output: 10.00 }, | |
| // OpenAI | |
| "gpt-5.4": { input: 2.50, output: 15.00 }, | |
| "gpt-5.4-mini": { input: 0.40, output: 1.60 }, | |
| "gpt-5.3-codex": { input: 2.50, output: 10.00 }, | |
| "gpt-4.1": { input: 2.00, output: 8.00 }, | |
| "gpt-4.1-mini": { input: 0.40, output: 1.60 }, | |
| "o3": { input: 10.00, output: 40.00 }, | |
| "o4-mini": { input: 1.10, output: 4.40 }, | |
| // HuggingFace Inference API (free tier = $0, Pro tier = included in subscription) | |
| "XiaomiMiMo/MiMo-V2-Flash": { input: 0.00, output: 0.00 }, | |
| "Qwen/Qwen3-Coder-Next": { input: 0.00, output: 0.00 }, | |
| "Qwen/Qwen3-8B": { input: 0.00, output: 0.00 }, | |
| "Qwen/Qwen3-Coder-30B-A3B-Instruct": { input: 0.00, output: 0.00 }, | |
| "meta-llama/Llama-3.3-70B-Instruct": { input: 0.00, output: 0.00 }, | |
| "deepseek-ai/DeepSeek-V3.2": { input: 0.00, output: 0.00 }, | |
| "deepseek-ai/DeepSeek-R1": { input: 0.00, output: 0.00 }, | |
| // OpenRouter variants | |
| "anthropic/claude-opus-4-6": { input: 15.00, output: 75.00 }, | |
| "anthropic/claude-sonnet-4-6": { input: 3.00, output: 15.00 }, | |
| "google/gemini-2.5-pro": { input: 1.25, output: 10.00 }, | |
| "google/gemini-2.5-flash": { input: 0.15, output: 0.60 }, | |
| }; | |
| const rates = pricing[model] || { input: 1.00, output: 3.00 }; | |
| return (promptTokens * rates.input + completionTokens * rates.output) / 1_000_000; | |
| } | |
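| // e.g. estimateCost("claude-sonnet-4-6", 10_000, 2_000) = (10000 * 3.00 + 2000 * 15.00) / 1_000_000 = $0.06 | |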