CRITICAL FIX: SendUserMessage no longer breaks agent loop + match original behavior (5d6ab9e)
| /** | |
| * Claw Agent Runtime — the core agentic conversation loop. | |
| * Handles streaming responses, tool calls, and multi-turn conversations. | |
| */ | |
| import { ENV } from "../_core/env"; | |
| import { buildSystemPrompt, TOOL_DEFINITIONS } from "./system-prompt"; | |
| import { executeTool, getPlanMode, runPreToolHooks, runPostToolHooks, initializeMcpFromConfig, getMcpManager } from "../tools/executor"; | |
| import { compactSession, compactSessionWithLLM, shouldCompact, estimateSessionTokens, dbMessagesToSession, DEFAULT_COMPACTION_CONFIG } from "./compact"; | |
| import type { Session, ConversationMessage as CompactMessage, CompactionConfig } from "./compact"; | |
| import { UsageTracker, pricingForModel, defaultSonnetTierPricing, estimateCostUsdWithPricing, totalCostUsd, formatUsd, summaryLinesForModel } from "./usage"; | |
| import type { TokenUsage } from "./usage"; | |
| import type { Response } from "express"; | |
| import { execSync } from "child_process"; | |
| // In original claw-code, max_iterations defaults to usize::MAX (effectively unlimited). | |
| // Auto-compact is triggered on context overflow (400 error) — matches original compact() method. | |
| // Context window sizes for known models (used for proactive compaction) | |
| const MODEL_CONTEXT_WINDOWS: Record<string, number> = { | |
| // Xiaomi MiMo | |
| "XiaomiMiMo/MiMo-V2-Flash": 262144, | |
| // Qwen models (DeepInfra + HuggingFace) | |
| "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo": 262144, | |
| "Qwen/Qwen3-Coder-480B-A35B-Instruct": 262144, | |
| "Qwen/Qwen3-235B-A22B-Instruct-2507": 262144, | |
| "Qwen/Qwen3-235B-A22B-Thinking-2507": 262144, | |
| "Qwen/Qwen3.5-397B-A17B": 262144, | |
| "Qwen/Qwen3.5-122B-A10B": 262144, | |
| "Qwen/Qwen3-Coder-Next": 131072, | |
| "Qwen/Qwen3-32B": 40960, | |
| "Qwen/Qwen3-8B": 32768, | |
| "Qwen/Qwen3-Coder-30B-A3B-Instruct": 131072, | |
| // Meta Llama | |
| "meta-llama/Llama-3.3-70B-Instruct": 131072, | |
| "meta-llama/Llama-4-Maverick-17B-128E": 1048576, | |
| "meta-llama/Llama-4-Scout-17B-16E": 327680, | |
| // DeepSeek | |
| "deepseek-ai/DeepSeek-V3.2": 163840, | |
| "deepseek-ai/DeepSeek-V3.1": 163840, | |
| "deepseek-ai/DeepSeek-R1": 131072, | |
| "deepseek-ai/DeepSeek-R1-0528": 163840, | |
| // NVIDIA Nemotron | |
| "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B": 262144, | |
| // StepFun | |
| "stepfun-ai/Step-3.5-Flash": 262144, | |
| // NousResearch (uncensored) | |
| "NousResearch/Hermes-3-Llama-3.1-70B": 131072, | |
| "NousResearch/Hermes-3-Llama-3.1-405B": 131072, | |
| // Anthropic | |
| "claude-opus-4-6": 200000, | |
| "claude-sonnet-4-6": 200000, | |
| // OpenAI | |
| "gpt-5.4": 1048576, | |
| "gpt-4.1": 1048576, | |
| // xAI | |
| "grok-3": 131072, | |
| "google/gemini-2.5-flash": 1000000, | |
| "google/gemini-2.5-pro": 1000000, | |
| }; | |
| const DEFAULT_CONTEXT_WINDOW = 131072; | |
| /** | |
| * Convert agent messages to compact.ts Session format for compaction. | |
| */ | |
| function agentMessagesToSession(messages: AgentMessage[]): Session { | |
| return dbMessagesToSession( | |
| messages.map((m) => ({ | |
| role: m.role, | |
| content: m.content || "", | |
| toolName: m.name || null, | |
| toolCallId: m.tool_call_id || null, | |
| })) | |
| ); | |
| } | |
| /** | |
| * Convert compacted Session back to AgentMessage[] format. | |
| */ | |
| function sessionToAgentMessages(session: Session): AgentMessage[] { | |
| return session.messages.map((msg) => { | |
| const agentMsg: AgentMessage = { | |
| role: msg.role, | |
| content: msg.blocks | |
| .filter((b) => b.type === "text") | |
| .map((b) => b.text || "") | |
| .join("\n") || null, | |
| }; | |
| // Reconstruct tool_calls from tool_use blocks | |
| const toolUseBlocks = msg.blocks.filter((b) => b.type === "tool_use"); | |
| if (toolUseBlocks.length > 0) { | |
| agentMsg.tool_calls = toolUseBlocks.map((b, i) => ({ | |
| id: `compacted_${i}_${Date.now()}`, | |
| type: "function" as const, | |
| function: { | |
| name: b.name || "unknown", | |
| arguments: b.input || "{}", | |
| }, | |
| })); | |
| } | |
| // Reconstruct tool result fields | |
| const toolResultBlock = msg.blocks.find((b) => b.type === "tool_result"); | |
| if (toolResultBlock) { | |
| agentMsg.name = toolResultBlock.toolName || undefined; | |
| agentMsg.content = toolResultBlock.output || ""; | |
| } | |
| return agentMsg; | |
| }); | |
| } | |
| /** | |
| * Estimate total tokens in the conversation (simple heuristic: ~4 chars per token). | |
| */ | |
| function estimateConversationTokens(messages: AgentMessage[]): number { | |
| let total = 0; | |
| for (const msg of messages) { | |
| total += Math.ceil((msg.content?.length || 0) / 4) + 4; // +4 for role/overhead | |
| if (msg.tool_calls) { | |
| for (const tc of msg.tool_calls) { | |
| total += Math.ceil((tc.function.name.length + tc.function.arguments.length) / 4) + 4; | |
| } | |
| } | |
| } | |
| return total; | |
| } | |
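| // For instance, a single 400-character message estimates to ceil(400 / 4) + 4 = 104 tokens. | |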
| interface AgentMessage { | |
| role: "system" | "user" | "assistant" | "tool"; | |
| content: string | null; | |
| tool_calls?: Array<{ | |
| id: string; | |
| type: "function"; | |
| function: { name: string; arguments: string }; | |
| }>; | |
| tool_call_id?: string; | |
| name?: string; | |
| } | |
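| // Illustrative message shapes (the tool name here is hypothetical, not necessarily one of this runtime's tools): | |
| //   assistant turn requesting a tool: { role: "assistant", content: null, tool_calls: [{ id: "call_1", type: "function", function: { name: "read_file", arguments: "{\"path\":\"README.md\"}" } }] } | |
| //   matching tool result:             { role: "tool", content: "<file contents>", tool_call_id: "call_1", name: "read_file" } | |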
| interface AgentConfig { | |
| model: string; | |
| apiProvider: string; | |
| apiKey?: string | null; | |
| apiBaseUrl?: string | null; | |
| maxTokens: number; | |
| temperature: number; | |
| topP: number; | |
| systemPrompt?: string | null; | |
| memory?: string | null; | |
| workDir?: string; | |
| effortLevel?: "low" | "medium" | "high"; | |
| maxIterations?: number; | |
| } | |
| /** | |
| * TurnSummary — matches original conversation.rs TurnSummary struct. | |
| * Returned after each complete agent turn. | |
| */ | |
| export interface TurnSummary { | |
| assistantMessages: AgentMessage[]; | |
| toolResults: AgentMessage[]; | |
| iterations: number; | |
| usage: TokenUsage; | |
| } | |
| /** | |
| * Read git status (matches original read_git_status from prompt.rs) | |
| */ | |
| function readGitStatus(cwd: string): string | null { | |
| try { | |
| const output = execSync("git --no-optional-locks status --short --branch", { | |
| cwd, | |
| timeout: 5000, | |
| encoding: "utf-8", | |
| stdio: ["pipe", "pipe", "pipe"], | |
| }).trim(); | |
| return output || null; | |
| } catch { | |
| return null; | |
| } | |
| } | |
| /** | |
| * Read git diff (matches original read_git_diff from prompt.rs) | |
| */ | |
| function readGitDiff(cwd: string): string | null { | |
| try { | |
| const sections: string[] = []; | |
| try { | |
| const staged = execSync("git diff --cached", { | |
| cwd, timeout: 5000, encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"], | |
| }).trim(); | |
| if (staged) sections.push(`Staged changes:\n${staged}`); | |
| } catch {} | |
| try { | |
| const unstaged = execSync("git diff", { | |
| cwd, timeout: 5000, encoding: "utf-8", stdio: ["pipe", "pipe", "pipe"], | |
| }).trim(); | |
| if (unstaged) sections.push(`Unstaged changes:\n${unstaged}`); | |
| } catch {} | |
| return sections.length > 0 ? sections.join("\n\n") : null; | |
| } catch { | |
| return null; | |
| } | |
| } | |
| /** | |
| * Merge hook feedback into tool output — matches original merge_hook_feedback() | |
| */ | |
| function mergeHookFeedback(hookMessages: string[], output: string, denied: boolean): string { | |
| if (hookMessages.length === 0) return output; | |
| const sections: string[] = []; | |
| if (output.trim()) sections.push(output); | |
| const label = denied ? "Hook feedback (denied)" : "Hook feedback"; | |
| sections.push(`${label}:\n${hookMessages.join("\n")}`); | |
| return sections.join("\n\n"); | |
| } | |
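| // e.g. mergeHookFeedback(["formatter ran clean"], "File written.", false) | |
| //   -> "File written.\n\nHook feedback:\nformatter ran clean" | |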
| const DEFAULT_CONFIG: AgentConfig = { | |
| model: process.env.DEFAULT_MODEL || "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", | |
| apiProvider: "deepinfra", | |
| maxTokens: 32768, // conservative default; Qwen3-Coder supports up to 64k (65536) output tokens | |
| temperature: 0.5, // Lower temp = more focused/deterministic agent behavior | |
| topP: 0.95, // Slightly restricted for more coherent tool calls | |
| workDir: process.env.WORKSPACE_DIR || "/home/ubuntu", | |
| effortLevel: "high", | |
| }; | |
| /** | |
| * Retry config for transient API errors. | |
| * - 429 (rate limit): retry INFINITELY every 2 seconds until it works. | |
| * - 500/502/503 (server errors): retry INFINITELY every 2 seconds. | |
| * - Network errors: retry INFINITELY every 2 seconds. | |
| * We NEVER give up on transient errors — just keep trying. | |
| */ | |
| const RETRY_DELAY_MS = 2000; // fixed 2 second interval — simple and reliable | |
| /** | |
| * Resolve the API URL and key based on provider config | |
| */ | |
| function resolveApiConfig(config: AgentConfig) { | |
| // ─── HARDCODED FALLBACK — always works even if settings are corrupted ─── | |
| const FALLBACK_URL = "https://api.deepinfra.com/v1/openai"; | |
| const FALLBACK_MODEL = "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo"; | |
| // Resolve model aliases (used for both default and custom paths) | |
| const aliasMap: Record<string, string> = { | |
| // Xiaomi MiMo | |
| mimo: "XiaomiMiMo/MiMo-V2-Flash", | |
| "mimo-flash": "XiaomiMiMo/MiMo-V2-Flash", | |
| "mimo-v2": "XiaomiMiMo/MiMo-V2-Flash", | |
| // Qwen models (DeepInfra) | |
| "qwen-coder": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", | |
| "qwen-coder-turbo": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", | |
| "qwen-coder-480b": "Qwen/Qwen3-Coder-480B-A35B-Instruct", | |
| "qwen3-235b": "Qwen/Qwen3-235B-A22B-Instruct-2507", | |
| "qwen3-thinking": "Qwen/Qwen3-235B-A22B-Thinking-2507", | |
| "qwen3.5": "Qwen/Qwen3.5-397B-A17B", | |
| "qwen3-32b": "Qwen/Qwen3-32B", | |
| "qwen3-8b": "Qwen/Qwen3-8B", | |
| "qwen3-coder": "Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo", | |
| // Llama | |
| llama: "meta-llama/Llama-3.3-70B-Instruct", | |
| "llama-70b": "meta-llama/Llama-3.3-70B-Instruct", | |
| "llama-4": "meta-llama/Llama-4-Maverick-17B-128E", | |
| // DeepSeek | |
| deepseek: "deepseek-ai/DeepSeek-V3.2", | |
| "deepseek-r1": "deepseek-ai/DeepSeek-R1-0528", | |
| "deepseek-v3": "deepseek-ai/DeepSeek-V3.2", | |
| // NVIDIA | |
| nemotron: "nvidia/NVIDIA-Nemotron-3-Super-120B-A12B", | |
| // StepFun | |
| step: "stepfun-ai/Step-3.5-Flash", | |
| "step-flash": "stepfun-ai/Step-3.5-Flash", | |
| // Uncensored | |
| hermes: "NousResearch/Hermes-3-Llama-3.1-70B", | |
| "hermes-405b": "NousResearch/Hermes-3-Llama-3.1-405B", | |
| uncensored: "NousResearch/Hermes-3-Llama-3.1-70B", | |
| // OpenAI GPT-5.x family | |
| "gpt5": "gpt-5.4", | |
| "gpt-5": "gpt-5.4", | |
| "gpt54": "gpt-5.4", | |
| // Anthropic aliases | |
| opus: "claude-opus-4-6", | |
| sonnet: "claude-sonnet-4-6", | |
| haiku: "claude-haiku-4-5-20251213", | |
| // xAI | |
| grok: "grok-3", | |
| "grok-3": "grok-3", | |
| gemini: "google/gemini-2.5-flash", | |
| "gemini-pro": "google/gemini-2.5-pro", | |
| }; | |
| // Treat empty, null, masked, or built-in providers as "use server default" | |
| const hasCustomKey = config.apiKey && config.apiKey.length > 4 && !config.apiKey.startsWith("••••"); | |
| if (config.apiProvider === "claw" || config.apiProvider === "default" || config.apiProvider === "huggingface" || config.apiProvider === "deepinfra" || !hasCustomKey) { | |
| const defaultModel = process.env.DEFAULT_MODEL || FALLBACK_MODEL; | |
| const resolvedModel = aliasMap[config.model] || config.model || defaultModel; | |
| // Use BUILT_IN_FORGE_API_URL from env (HuggingFace router, OpenAI, etc.), falling back to DeepInfra | |
| const baseUrl = (ENV.forgeApiUrl || FALLBACK_URL).replace(/\/$/, ""); | |
| const apiKey = ENV.forgeApiKey || process.env.BUILT_IN_FORGE_API_KEY || ""; | |
| console.log(`[agent] resolveApiConfig: using server default. URL=${baseUrl}, model=${resolvedModel}, hasKey=${!!apiKey}`); | |
| return { | |
| url: `${baseUrl}/chat/completions`, | |
| key: apiKey, | |
| model: resolvedModel || FALLBACK_MODEL, | |
| }; | |
| } | |
| // Custom provider path — user has their own API key | |
| let baseUrl = config.apiBaseUrl || ""; | |
| if (!baseUrl) { | |
| const providers: Record<string, string> = { | |
| deepinfra: "https://api.deepinfra.com/v1/openai", | |
| huggingface: "https://router.huggingface.co/v1", | |
| xai: "https://api.x.ai/v1", | |
| openrouter: "https://openrouter.ai/api/v1", | |
| openai: "https://api.openai.com/v1", | |
| anthropic: "https://api.anthropic.com/v1", | |
| groq: "https://api.groq.com/openai/v1", | |
| cerebras: "https://api.cerebras.ai/v1", | |
| ollama: "http://localhost:11434/v1", | |
| }; | |
| baseUrl = providers[config.apiProvider] || FALLBACK_URL; | |
| } | |
| const resolvedModel = aliasMap[config.model] || config.model || FALLBACK_MODEL; | |
| console.log(`[agent] resolveApiConfig: custom provider. URL=${baseUrl}, model=${resolvedModel}`); | |
| return { | |
| url: `${baseUrl.replace(/\/$/, "")}/chat/completions`, | |
| key: config.apiKey, | |
| model: resolvedModel, | |
| }; | |
| } | |
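| // Example (assuming no custom API key is configured): resolveApiConfig({ ...DEFAULT_CONFIG, model: "qwen-coder" }) | |
| // resolves the alias to Qwen/Qwen3-Coder-480B-A35B-Instruct-Turbo and targets the built-in endpoint, | |
| // falling back to https://api.deepinfra.com/v1/openai/chat/completions when BUILT_IN_FORGE_API_URL is unset. | |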
| /** | |
| * Send an SSE event to the client | |
| */ | |
| function sendSSE(res: Response, event: string, data: unknown) { | |
| try { | |
| res.write(`event: ${event}\ndata: ${JSON.stringify(data)}\n\n`); | |
| } catch { | |
| // Connection may be closed | |
| } | |
| } | |
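| // Wire format: sendSSE(res, "text_delta", { text: "Hi" }) writes | |
| //   event: text_delta | |
| //   data: {"text":"Hi"} | |
| // followed by a blank line, per the SSE spec. | |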
| /** | |
| * Run the agentic loop: send messages to LLM, execute tool calls, repeat. | |
| * This is the core of the agent — it loops until the LLM stops calling tools. | |
| */ | |
| export async function runAgentLoop( | |
| messages: AgentMessage[], | |
| sessionId: number, | |
| config: Partial<AgentConfig>, | |
| res: Response, | |
| signal?: AbortSignal | |
| ): Promise<{ | |
| finalMessages: AgentMessage[]; | |
| totalPromptTokens: number; | |
| totalCompletionTokens: number; | |
| totalCost: number; | |
| model: string; | |
| }> { | |
| const cfg = { ...DEFAULT_CONFIG, ...config }; | |
| const apiConfig = resolveApiConfig(cfg); | |
| const workDir = cfg.workDir || "/home/ubuntu"; | |
| // Get plan mode state | |
| const planState = getPlanMode(sessionId); | |
| // Read git status and diff (matches original ProjectContext::discover_with_git) | |
| const gitStatus = readGitStatus(workDir); | |
| const gitDiff = readGitDiff(workDir); | |
| // Build system prompt with full environment context | |
| const systemPrompt = buildSystemPrompt({ | |
| memory: cfg.memory, | |
| effortLevel: cfg.effortLevel || "high", | |
| planMode: planState.active, | |
| planSteps: planState.steps, | |
| customSystemPrompt: cfg.systemPrompt, | |
| workDir, | |
| platform: "linux", | |
| model: apiConfig.model, | |
| gitStatus, | |
| gitDiff, | |
| }); | |
| // Initialize UsageTracker (matches original conversation.rs) | |
| const usageTracker = UsageTracker.new(); | |
| // Build conversation with system message first | |
| const conversationMessages: AgentMessage[] = [ | |
| { role: "system", content: systemPrompt }, | |
| ...messages.filter((m) => m.role !== "system"), | |
| ]; | |
| let totalPromptTokens = 0; | |
| let totalCompletionTokens = 0; | |
| let totalCost = 0; | |
| let iterations = 0; | |
| let emptyResponseRetries = 0; | |
| const MAX_EMPTY_RETRIES = 3; | |
| // Safety limit: prevent infinite loops. Original claw-code uses usize::MAX but that | |
| // causes runaway loops with Qwen3 which sometimes fails to stop generating. | |
| // 200 iterations is more than enough for any real task. | |
| const MAX_ITERATIONS = cfg.maxIterations || 200; | |
| const assistantMessages: AgentMessage[] = []; | |
| const toolResultMessages: AgentMessage[] = []; | |
| // ─── Loop detection: minimal safety net ───────────────────────────── | |
| // Only detect EXACT same tool+args repeated 5+ times (true infinite loop). | |
| // Everything else is handled by MAX_ITERATIONS. | |
| const recentToolSignatures: string[] = []; | |
| const MAX_EXACT_REPEATS = 5; | |
| // ─── MCP Tools Dynamic Injection (matches original claw-code) ────────── | |
| // Initialize MCP servers from config and merge discovered tools with static TOOL_DEFINITIONS. | |
| // This is how the original claw-code dynamically builds the tool list: | |
| // 1. Load MCP server configs from .claw/settings.json | |
| // 2. Connect to each server via stdio JSON-RPC | |
| // 3. Call tools/list to discover available tools | |
| // 4. Prefix tool names as mcp__servername__toolname | |
| // 5. Merge with static tool definitions | |
| let allTools = [...TOOL_DEFINITIONS]; | |
| try { | |
| const mcpTools = await initializeMcpFromConfig(workDir); | |
| if (mcpTools.length > 0) { | |
| const mcpManager = getMcpManager(); | |
| if (mcpManager) { | |
| const mcpDefs = mcpManager.getToolDefinitions(); | |
| // Convert MCP tool format to OpenAI function calling format | |
| const mcpToolDefs = mcpDefs.map((t) => ({ | |
| type: "function" as const, | |
| function: { | |
| name: t.name, | |
| description: t.description, | |
| parameters: t.input_schema || { type: "object", properties: {} }, | |
| }, | |
| })); | |
| allTools = [...TOOL_DEFINITIONS, ...mcpToolDefs]; | |
| console.log(`[agent] MCP tools injected: ${mcpDefs.map((t) => t.name).join(", ")}`); | |
| sendSSE(res, "status", { | |
| status: "mcp_ready", | |
| message: `MCP tools loaded: ${mcpDefs.length} tools from ${mcpManager.getConnectedServers().length} servers`, | |
| }); | |
| } | |
| } | |
| } catch (err: any) { | |
| console.error(`[agent] MCP initialization error (non-fatal):`, err.message); | |
| // MCP init failure is non-fatal — agent continues with static tools only | |
| } | |
| // ─── Context-aware compaction config ────────────────────────────── | |
| // Original claw-code uses percentage-based thresholds, not fixed 10k tokens. | |
| // We compute the threshold as 70% of the model's context window. | |
| const contextWindow = MODEL_CONTEXT_WINDOWS[apiConfig.model] || DEFAULT_CONTEXT_WINDOW; | |
| const dynamicCompactionConfig: CompactionConfig = { | |
| preserveRecentMessages: DEFAULT_COMPACTION_CONFIG.preserveRecentMessages, | |
| maxEstimatedTokens: Math.floor(contextWindow * 0.7), | |
| }; | |
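| // e.g. a 262144-token context window yields a compaction threshold of floor(262144 * 0.7) = 183500 tokens. | |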
| sendSSE(res, "status", { status: "thinking", message: "Processing your request..." }); | |
| while (iterations < MAX_ITERATIONS) { | |
| iterations++; | |
| if (signal?.aborted) { | |
| sendSSE(res, "status", { status: "cancelled", message: "Request cancelled" }); | |
| break; | |
| } | |
| // Build API request | |
| // Determine max_tokens limit based on provider | |
| const isDeepInfra = apiConfig.url.includes("deepinfra.com"); | |
| const isHuggingFace = apiConfig.url.includes("huggingface.co"); | |
| const maxTokensLimit = isHuggingFace ? 32000 : 65536; | |
| // Detect if model supports thinking/reasoning mode (Qwen3 Thinking, DeepSeek-R1) | |
| const isThinkingModel = apiConfig.model.includes("Thinking") || apiConfig.model.includes("R1"); | |
| const payload: Record<string, unknown> = { | |
| model: apiConfig.model, | |
| messages: conversationMessages.map((m) => { | |
| const msg: Record<string, unknown> = { role: m.role, content: m.content }; | |
| if (m.tool_calls) msg.tool_calls = m.tool_calls; | |
| if (m.tool_call_id) msg.tool_call_id = m.tool_call_id; | |
| if (m.name) msg.name = m.name; | |
| return msg; | |
| }), | |
| tools: allTools, | |
| // Match original: always "auto" — let the model decide what tools to use | |
| tool_choice: "auto", | |
| max_tokens: Math.min(cfg.maxTokens, maxTokensLimit), | |
| temperature: isThinkingModel ? 0.6 : cfg.temperature, // thinking models need lower temp | |
| top_p: cfg.topP, | |
| stream: true, | |
| }; | |
| // Enable thinking/reasoning for supported models (Qwen3 Thinking, DeepSeek-R1) | |
| if (isThinkingModel && isDeepInfra) { | |
| (payload as any).extra_body = { enable_thinking: true }; | |
| } | |
| // Qwen3-Coder: hybrid thinking mode — model decides when to think | |
| // DO NOT disable thinking; Qwen3 automatically uses <think> blocks for complex tasks | |
| // The /no_think in system prompt is only a soft hint for simple responses, | |
| // the model will override it when it needs to reason through tool calls or code | |
| // NOTE: Prefill technique (empty assistant message) was removed because it caused | |
| // the model to "talk to itself" — seeing an empty assistant message made it think | |
| // it already responded and continue generating follow-up messages in a loop. | |
| // Anti-censorship is handled purely via system prompt directives instead. | |
| sendSSE(res, "message_start", { iteration: iterations }); | |
| try { | |
| // ─── Retry loop: transient errors retry indefinitely at a fixed 2s interval ───── | |
| // 429 = rate limit: the API is telling us to slow down, NOT to give up. | |
| // 500/502/503 and network errors are treated the same way — we keep retrying | |
| // every RETRY_DELAY_MS until the request succeeds or the abort signal fires. | |
| let response: globalThis.Response | null = null; | |
| let lastErrorText = ""; | |
| let lastStatus = 0; | |
| let serverRetryCount = 0; | |
| let rateLimitRetryCount = 0; | |
| while (true) { | |
| if (signal?.aborted) break; | |
| try { | |
| response = await fetch(apiConfig.url, { | |
| method: "POST", | |
| headers: { | |
| "Content-Type": "application/json", | |
| Authorization: `Bearer ${apiConfig.key}`, | |
| }, | |
| body: JSON.stringify(payload), | |
| signal, | |
| }); | |
| } catch (fetchErr: any) { | |
| // Network error (DNS, connection refused, etc.) — retry every 2s | |
| if (signal?.aborted) break; | |
| serverRetryCount++; | |
| console.error(`[agent] Fetch error (retry #${serverRetryCount}):`, fetchErr.message); | |
| sendSSE(res, "status", { | |
| status: "retrying", | |
| message: `Network error, retrying in 2s... (attempt #${serverRetryCount})`, | |
| }); | |
| await new Promise((r) => setTimeout(r, RETRY_DELAY_MS)); | |
| continue; | |
| } | |
| if (response.ok) break; | |
| lastStatus = response.status; | |
| lastErrorText = await response.text(); | |
| // ─── 429 Rate Limit: INFINITE retry every 2s ─── | |
| if (response.status === 429) { | |
| rateLimitRetryCount++; | |
| console.log(`[agent] Rate limited (429) — retry #${rateLimitRetryCount} in 2s`); | |
| sendSSE(res, "status", { | |
| status: "rate_limited", | |
| message: `Rate limited by API — retrying in 2s... (attempt #${rateLimitRetryCount})`, | |
| }); | |
| await new Promise((r) => setTimeout(r, RETRY_DELAY_MS)); | |
| response = null; | |
| continue; // NEVER give up on 429 | |
| } | |
| // ─── 500/502/503 Server errors: INFINITE retry every 2s ─── | |
| if ([500, 502, 503].includes(response.status)) { | |
| serverRetryCount++; | |
| console.log(`[agent] Server error ${response.status} — retry #${serverRetryCount} in 2s`); | |
| sendSSE(res, "status", { | |
| status: "retrying", | |
| message: `Server error ${response.status}, retrying in 2s... (attempt #${serverRetryCount})`, | |
| }); | |
| await new Promise((r) => setTimeout(r, RETRY_DELAY_MS)); | |
| response = null; | |
| continue; | |
| } | |
| // Any other error (400, 401, 403, 404, etc.) — don't retry | |
| break; | |
| } | |
| if (!response || !response.ok) { | |
| console.error(`[agent] API error ${lastStatus}:`, lastErrorText); | |
| console.error(`[agent] Payload model:`, apiConfig.model); | |
| console.error(`[agent] Payload messages count:`, (payload.messages as any[]).length); | |
| // ─── AUTO-COMPACT on context overflow (400 error) ───────────── | |
| if (lastStatus === 400 && (lastErrorText.includes("context_length") || lastErrorText.includes("too many tokens") || lastErrorText.includes("maximum context") || lastErrorText.includes("token limit") || lastErrorText.includes("too long"))) { | |
| console.log(`[agent] Context overflow detected — auto-compacting conversation...`); | |
| sendSSE(res, "status", { | |
| status: "compacting", | |
| message: "Context window exceeded — auto-compacting conversation...", | |
| }); | |
| try { | |
| const session = agentMessagesToSession(conversationMessages); | |
| // LLM-based summarization: use the same API to produce a real summary | |
| const llmFetch = async (msgs: Array<{role: string; content: string}>) => { | |
| const summaryResp = await fetch(apiConfig.url, { | |
| method: "POST", | |
| headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiConfig.key}` }, | |
| body: JSON.stringify({ | |
| model: apiConfig.model, | |
| messages: msgs, | |
| max_tokens: 2000, | |
| temperature: 0.3, | |
| stream: false, | |
| }), | |
| }); | |
| if (!summaryResp.ok) throw new Error(`LLM summary failed: ${summaryResp.status}`); | |
| const json = await summaryResp.json(); | |
| return json.choices?.[0]?.message?.content || ""; | |
| }; | |
| const compactResult = await compactSessionWithLLM(session, dynamicCompactionConfig, llmFetch); | |
| if (compactResult.removedMessageCount > 0) { | |
| const compactedAgentMessages = sessionToAgentMessages(compactResult.compactedSession); | |
| conversationMessages.length = 0; | |
| conversationMessages.push({ role: "system", content: systemPrompt }); | |
| conversationMessages.push(...compactedAgentMessages); | |
| console.log(`[agent] Auto-compact (LLM): removed ${compactResult.removedMessageCount} messages, kept ${conversationMessages.length}`); | |
| sendSSE(res, "auto_compact", { | |
| removedCount: compactResult.removedMessageCount, | |
| keptCount: conversationMessages.length, | |
| summary: compactResult.formattedSummary, | |
| }); | |
| continue; // retry with compacted context | |
| } else { | |
| console.error(`[agent] Auto-compact produced no reduction — breaking`); | |
| sendSSE(res, "error", { | |
| message: `Context overflow but compaction couldn't reduce further`, | |
| details: lastErrorText, | |
| }); | |
| break; | |
| } | |
| } catch (compactErr: any) { | |
| console.error(`[agent] Auto-compact failed:`, compactErr.message); | |
| sendSSE(res, "error", { | |
| message: `Context overflow — auto-compact failed: ${compactErr.message}`, | |
| details: lastErrorText, | |
| }); | |
| break; | |
| } | |
| } | |
| // Non-context 400 errors — log details for debugging | |
| if (lastStatus === 400) { | |
| console.error(`[agent] Full error body:`, lastErrorText); | |
| (payload.messages as any[]).forEach((m: any, i: number) => { | |
| console.error(`[agent] msg[${i}] role=${m.role} content_type=${typeof m.content} content_len=${String(m.content || '').length} has_tool_calls=${!!m.tool_calls} has_tool_call_id=${!!m.tool_call_id}`); | |
| }); | |
| } | |
| sendSSE(res, "error", { | |
| message: `API error: ${lastStatus}${lastStatus === 429 ? ' (rate limit)' : ''} — ${lastErrorText.substring(0, 200)}`, | |
| details: lastErrorText, | |
| }); | |
| break; | |
| } | |
| // Process streaming response | |
| let result: { content: string; toolCalls: Array<{ id: string; type: "function"; function: { name: string; arguments: string } }>; usage?: any }; | |
| try { | |
| result = await processStream(response, res, signal); | |
| } catch (streamErr: any) { | |
| // Stream processing error — treat as transient, retry | |
| console.error(`[agent] Stream processing error:`, streamErr.message); | |
| if (emptyResponseRetries++ < MAX_EMPTY_RETRIES) { | |
| sendSSE(res, "status", { status: "retrying", message: `Stream error, retrying... (${emptyResponseRetries}/${MAX_EMPTY_RETRIES})` }); | |
| await new Promise(r => setTimeout(r, 1500)); | |
| continue; | |
| } | |
| sendSSE(res, "error", { message: `Stream failed after ${MAX_EMPTY_RETRIES} retries: ${streamErr.message}` }); | |
| break; | |
| } | |
| // ─── Bug #1 fix: Handle empty LLM response with retry ───────── | |
| // Original claw-code retries on empty response instead of crashing. | |
| // Open-source models via HuggingFace often return empty streams. | |
| if (!result.content && result.toolCalls.length === 0) { | |
| if (emptyResponseRetries++ < MAX_EMPTY_RETRIES) { | |
| console.warn(`[agent] Empty response from LLM — retry ${emptyResponseRetries}/${MAX_EMPTY_RETRIES}`); | |
| sendSSE(res, "status", { status: "retrying", message: `Empty response from model, retrying... (${emptyResponseRetries}/${MAX_EMPTY_RETRIES})` }); | |
| await new Promise(r => setTimeout(r, 1500)); | |
| continue; // retry same iteration | |
| } | |
| console.error(`[agent] LLM returned empty response ${MAX_EMPTY_RETRIES} times — giving up`); | |
| sendSSE(res, "error", { message: `Model returned empty response after ${MAX_EMPTY_RETRIES} retries. Try a different model or reduce context.` }); | |
| break; | |
| } | |
| emptyResponseRetries = 0; // reset on successful response | |
| // Track usage with UsageTracker (matches original) | |
| if (result.usage) { | |
| totalPromptTokens += result.usage.prompt_tokens || 0; | |
| totalCompletionTokens += result.usage.completion_tokens || 0; | |
| usageTracker.record({ | |
| input_tokens: result.usage.prompt_tokens || 0, | |
| output_tokens: result.usage.completion_tokens || 0, | |
| cache_creation_input_tokens: result.usage.cache_creation_input_tokens || 0, | |
| cache_read_input_tokens: result.usage.cache_read_input_tokens || 0, | |
| }); | |
| // Keep a running cost estimate so message_end events report a non-zero value mid-run; | |
| // the final figure is recomputed from the full UsageTracker after the loop. | |
| totalCost = totalCostUsd(estimateCostUsdWithPricing(usageTracker.cumulativeUsage(), pricingForModel(apiConfig.model) ?? defaultSonnetTierPricing())); | |
| } | |
| // Add assistant message to conversation | |
| const assistantMessage: AgentMessage = { | |
| role: "assistant", | |
| // Match original: null when no content (even with tool_calls) | |
| content: result.content || null, | |
| }; | |
| if (result.toolCalls && result.toolCalls.length > 0) { | |
| assistantMessage.tool_calls = result.toolCalls; | |
| } | |
| conversationMessages.push(assistantMessage); | |
| assistantMessages.push(assistantMessage); | |
| // If no tool calls, we're done — the LLM has finished responding. | |
| // This matches the original claw-code behavior exactly: | |
| // the model decides when to stop by not calling tools. | |
| if (!result.toolCalls || result.toolCalls.length === 0) { | |
| sendSSE(res, "message_end", { | |
| promptTokens: totalPromptTokens, | |
| completionTokens: totalCompletionTokens, | |
| cost: totalCost, | |
| model: apiConfig.model, | |
| }); | |
| break; | |
| } | |
| // ─── Minimal loop detection: only catch TRUE infinite loops ─────── | |
| // Only break if the EXACT same tool+args is repeated 5+ times. | |
| // This is the only safety net beyond MAX_ITERATIONS. | |
| // The original claw-code has NO loop detection at all — it trusts the model. | |
| const currentToolSig = result.toolCalls.map((tc: any) => `${tc.function.name}:${tc.function.arguments}`).join("|"); | |
| recentToolSignatures.push(currentToolSig); | |
| if (recentToolSignatures.length > MAX_EXACT_REPEATS) { | |
| recentToolSignatures.shift(); | |
| } | |
| if (recentToolSignatures.length >= MAX_EXACT_REPEATS) { | |
| const allSame = recentToolSignatures.every(r => r === recentToolSignatures[0]); | |
| if (allSame) { | |
| console.warn(`[agent] Infinite loop detected: exact same tool call repeated ${MAX_EXACT_REPEATS} times — breaking`); | |
| sendSSE(res, "error", { | |
| message: `⚠️ Infinite loop detected. Try rephrasing your request.`, | |
| }); | |
| sendSSE(res, "message_end", { | |
| promptTokens: totalPromptTokens, | |
| completionTokens: totalCompletionTokens, | |
| cost: totalCost, | |
| model: apiConfig.model, | |
| }); | |
| break; | |
| } | |
| } | |
| // ─── Execute tool calls ────────────────────────────────────────── | |
| // Bug #2+#3 fix: Each tool call is wrapped in its own try-catch. | |
| // Original claw-code sends tool errors back to LLM as tool results, | |
| // letting the model decide how to handle them. We NEVER break the | |
| // loop on a tool error — only on fatal API/stream errors. | |
| for (const toolCall of result.toolCalls) { | |
| const toolName = toolCall.function.name; | |
| let toolArgs: Record<string, unknown> = {}; | |
| let argParseError = false; | |
| try { | |
| toolArgs = JSON.parse(toolCall.function.arguments || "{}"); | |
| } catch (parseErr: any) { | |
| // Try JSON repair before giving up | |
| try { | |
| const { jsonrepair } = await import("jsonrepair"); | |
| const repaired = jsonrepair(toolCall.function.arguments || "{}"); | |
| toolArgs = JSON.parse(repaired); | |
| console.info(`[agent] Repaired malformed JSON for ${toolName}`); | |
| } catch (repairErr: any) { | |
| argParseError = true; | |
| console.warn(`[agent] Malformed tool args for ${toolName} (repair failed):`, toolCall.function.arguments?.substring(0, 200)); | |
| } | |
| } | |
| sendSSE(res, "tool_call_start", { | |
| id: toolCall.id, | |
| name: toolName, | |
| arguments: toolCall.function.arguments, | |
| }); | |
| let toolOutput: string; | |
| let isError = false; | |
| // If JSON args were malformed, skip execution and tell LLM to fix | |
| if (argParseError) { | |
| toolOutput = `Error: Your tool call arguments for '${toolName}' contained malformed JSON. The raw arguments were: ${(toolCall.function.arguments || "").substring(0, 500)}. Please fix the JSON and try again.`; | |
| isError = true; | |
| } else try { | |
| // ─── Pre-tool hooks (matches original HookRunner.run_pre_tool_use) ── | |
| const preHookResult = await runPreToolHooks(toolName, sessionId, toolArgs, workDir); | |
| if (!preHookResult.allowed) { | |
| // Hook denied the tool execution (exit code 2 = deny) | |
| toolOutput = preHookResult.message || `Tool '${toolName}' was denied by pre-tool hook`; | |
| isError = true; | |
| sendSSE(res, "permission_denied", { | |
| toolName, | |
| toolCallId: toolCall.id, | |
| reason: toolOutput, | |
| needsPrompt: false, | |
| }); | |
| } else { | |
| // Execute the tool with the correct working directory | |
| const toolResult = await executeTool(toolName, toolArgs, sessionId, workDir); | |
| if (toolResult.isError && toolResult.output.includes("needs one-time approval")) { | |
| sendSSE(res, "permission_prompt", { | |
| toolName, | |
| toolCallId: toolCall.id, | |
| reason: toolResult.output, | |
| }); | |
| } | |
| toolOutput = toolResult.output; | |
| isError = toolResult.isError || false; | |
| // Merge pre-hook feedback (matches original merge_hook_feedback) | |
| if (preHookResult.message) { | |
| toolOutput = mergeHookFeedback([preHookResult.message], toolOutput, false); | |
| } | |
| // ─── Post-tool hooks (matches original HookRunner.run_post_tool_use) ── | |
| const postHookResult = await runPostToolHooks(toolName, sessionId, toolResult, workDir); | |
| toolOutput = postHookResult.output; | |
| isError = postHookResult.isError || false; | |
| } | |
| } catch (toolExecError: any) { | |
| // ─── Bug #3 fix: Tool exception → error result for LLM ────── | |
| // Original claw-code: tool errors become tool results, NOT loop breaks. | |
| // The LLM sees the error and can try a different approach. | |
| console.error(`[agent] Tool '${toolName}' threw exception:`, toolExecError.message); | |
| toolOutput = `Tool execution error: ${toolExecError.message}`; | |
| isError = true; | |
| } | |
| // No error classification or guidance injection. | |
| // The model receives raw error output and decides how to handle it. | |
| // This matches the original claw-code behavior. | |
| sendSSE(res, "tool_result", { | |
| toolCallId: toolCall.id, | |
| toolName, | |
| output: toolOutput, | |
| isError, | |
| durationMs: 0, // per-call execution time is not measured here | |
| }); | |
| // ─── Special SSE events for interactive tools ──────────────── | |
| // SendUserMessage / Brief: emit SSE for frontend display but DO NOT break the loop. | |
| // Original claw-code does NOT stop on SendUserMessage — the model can send | |
| // progress updates ("checking...", "found vulnerability...") AND continue working. | |
| // Breaking here was the #1 cause of the agent stopping mid-task. | |
| if ((toolName === "SendUserMessage" || toolName === "Brief" || toolName === "ask_user") && !isError) { | |
| sendSSE(res, "assistant_message", { | |
| message: toolArgs.message || toolArgs.question || "", | |
| attachments: toolArgs.attachments || [], | |
| }); | |
| } | |
| // Plan/Todo tools: emit plan state updates | |
| if (["TodoWrite", "plan_create", "plan_update", "enter_plan_mode", "exit_plan_mode"].includes(toolName)) { | |
| const updatedPlan = getPlanMode(sessionId); | |
| sendSSE(res, "plan_update", { | |
| active: updatedPlan.active, | |
| steps: updatedPlan.steps, | |
| }); | |
| } | |
| // Add tool result to conversation for the LLM to process | |
| const toolResultMsg: AgentMessage = { | |
| role: "tool", | |
| content: toolOutput, | |
| tool_call_id: toolCall.id, | |
| name: toolName, | |
| }; | |
| conversationMessages.push(toolResultMsg); | |
| toolResultMessages.push(toolResultMsg); | |
| } | |
| // No consecutive error detection — the model handles errors naturally. | |
| // MAX_ITERATIONS (200) is the ultimate safety net. | |
| // SendUserMessage does NOT break the loop (matches original). | |
| // ─── Proactive auto-compact check ───────────────────────────── | |
| // Check if conversation is approaching context window limit and compact proactively | |
| const estimatedTokens = estimateConversationTokens(conversationMessages); | |
| // contextWindow was already computed above from MODEL_CONTEXT_WINDOWS | |
| const contextUsagePercent = Math.round((estimatedTokens / contextWindow) * 100); | |
| // Emit context usage SSE for frontend tracking | |
| sendSSE(res, "context_usage", { | |
| estimatedTokens, | |
| contextWindow, | |
| usagePercent: contextUsagePercent, | |
| messageCount: conversationMessages.length, | |
| }); | |
| // Proactive compaction at 80% context usage | |
| if (contextUsagePercent >= 80) { | |
| console.log(`[agent] Context at ${contextUsagePercent}% — proactive auto-compact`); | |
| sendSSE(res, "status", { | |
| status: "compacting", | |
| message: `Context at ${contextUsagePercent}% — auto-compacting to free space...`, | |
| }); | |
| try { | |
| const session = agentMessagesToSession(conversationMessages); | |
| // LLM-based summarization for proactive compaction | |
| const llmFetchProactive = async (msgs: Array<{role: string; content: string}>) => { | |
| const summaryResp = await fetch(apiConfig.url, { | |
| method: "POST", | |
| headers: { "Content-Type": "application/json", Authorization: `Bearer ${apiConfig.key}` }, | |
| body: JSON.stringify({ | |
| model: apiConfig.model, | |
| messages: msgs, | |
| max_tokens: 2000, | |
| temperature: 0.3, | |
| stream: false, | |
| }), | |
| }); | |
| if (!summaryResp.ok) throw new Error(`LLM summary failed: ${summaryResp.status}`); | |
| const json = await summaryResp.json(); | |
| return json.choices?.[0]?.message?.content || ""; | |
| }; | |
| const compactResult = await compactSessionWithLLM(session, dynamicCompactionConfig, llmFetchProactive); | |
| if (compactResult.removedMessageCount > 0) { | |
| const compactedAgentMessages = sessionToAgentMessages(compactResult.compactedSession); | |
| conversationMessages.length = 0; | |
| // CRITICAL: Re-prepend original system prompt before compacted summary. | |
| conversationMessages.push({ role: "system", content: systemPrompt }); | |
| conversationMessages.push(...compactedAgentMessages); | |
| // Inject current todo/plan state so the agent doesn't lose its plan after compaction | |
| const todoState = (() => { | |
| try { | |
| const executor = require("../tools/executor"); | |
| const plan = executor.getPlanMode(sessionId); | |
| const todos = executor.todoLists?.get?.(sessionId) || []; | |
| let state = ""; | |
| if (todos.length > 0) { | |
| state += "\n\n[PRESERVED TODO LIST]\n" + todos.map((t: any, i: number) => { | |
| const icon = t.status === "completed" ? "\u2713" : t.status === "in_progress" ? "\u25cf" : "\u25cb"; | |
| return ` ${icon} ${i + 1}. ${t.content} [${t.status}]`; | |
| }).join("\n"); | |
| } | |
| if (plan?.active && plan.steps?.length > 0) { | |
| state += "\n\n[PRESERVED PLAN]\n" + plan.steps.map((s: any) => { | |
| const icon = s.status === "done" ? "\u2713" : s.status === "in_progress" ? "\u25cf" : "\u25a1"; | |
| return ` ${icon} ${s.id}. ${s.text} [${s.status}]`; | |
| }).join("\n"); | |
| } | |
| return state; | |
| } catch { return ""; } | |
| })(); | |
| if (todoState) { | |
| // Append todo state to the last message in the compacted history so the model sees it | |
| const lastMsg = conversationMessages[conversationMessages.length - 1]; | |
| if (lastMsg && typeof lastMsg.content === "string") { | |
| lastMsg.content += todoState; | |
| } | |
| } | |
| sendSSE(res, "auto_compact", { | |
| removedCount: compactResult.removedMessageCount, | |
| keptCount: conversationMessages.length, | |
| summary: compactResult.formattedSummary, | |
| }); | |
| console.log(`[agent] Proactive compact: removed ${compactResult.removedMessageCount} messages`); | |
| } | |
| } catch (compactErr: any) { | |
| console.error(`[agent] Proactive compact failed (non-fatal):`, compactErr.message); | |
| } | |
| } | |
| // ─── Buddy events SSE ──────────────────────────────────────────── | |
| // Emit buddy_event for each tool call so frontend can award XP | |
| for (const toolCall of result.toolCalls) { | |
| const tn = toolCall.function.name; | |
| sendSSE(res, "buddy_event", { | |
| type: "tool_call", | |
| toolName: tn, | |
| iteration: iterations, | |
| }); | |
| // Special buddy events for file creation | |
| if (tn === "write_file" || tn === "create_file") { | |
| sendSSE(res, "buddy_event", { | |
| type: "file_created", | |
| toolName: tn, | |
| iteration: iterations, | |
| }); | |
| } | |
| } | |
| // Continue the loop — LLM will see tool results and decide next action | |
| sendSSE(res, "status", { | |
| status: "thinking", | |
| message: `Processing tool results (iteration ${iterations}, context: ${contextUsagePercent}%)...`, | |
| }); | |
| } catch (error: any) { | |
| // ─── Bug #2 fix: Distinguish fatal vs transient errors ──────── | |
| // Only AbortError and unrecoverable errors should break the loop. | |
| // Stream/fetch errors are already handled above with retry logic. | |
| if (error.name === "AbortError" || signal?.aborted) { | |
| sendSSE(res, "status", { status: "cancelled", message: "Request cancelled" }); | |
| break; | |
| } | |
| // For other errors, log and break (these are truly unexpected) | |
| console.error(`[agent] Unexpected error in agent loop:`, error.message, error.stack); | |
| sendSSE(res, "error", { message: error.message || "Unknown error" }); | |
| break; | |
| } | |
| } | |
| if (iterations >= MAX_ITERATIONS) { | |
| sendSSE(res, "error", { message: `Maximum iterations (${MAX_ITERATIONS}) reached. Use /compact to reduce context and continue.` }); | |
| } | |
| // ─── Buddy: session_completed event ───────────────────────────────── | |
| // Emit session_completed so Buddy can award XP for finishing a turn | |
| sendSSE(res, "buddy_event", { | |
| type: "session_completed", | |
| iterations, | |
| toolCallCount: toolResultMessages.length, | |
| }); | |
| // Calculate cost using UsageTracker (matches original) | |
| const cumulativeUsage = usageTracker.cumulativeUsage(); | |
| const modelPricing = pricingForModel(apiConfig.model) ?? defaultSonnetTierPricing(); | |
| const costEstimate = estimateCostUsdWithPricing(cumulativeUsage, modelPricing); | |
| totalCost = totalCostUsd(costEstimate); | |
| // Emit usage summary lines (matches original summary_lines_for_model) | |
| const usageSummary = summaryLinesForModel(cumulativeUsage, "session", apiConfig.model); | |
| sendSSE(res, "usage", { | |
| promptTokens: totalPromptTokens, | |
| completionTokens: totalCompletionTokens, | |
| totalTokens: totalPromptTokens + totalCompletionTokens, | |
| cost: totalCost, | |
| cacheCreationTokens: cumulativeUsage.cache_creation_input_tokens, | |
| cacheReadTokens: cumulativeUsage.cache_read_input_tokens, | |
| usageSummary, | |
| turns: usageTracker.turns(), | |
| formattedCost: formatUsd(totalCost), | |
| }); | |
| return { | |
| finalMessages: conversationMessages.filter((m) => m.role !== "system"), | |
| totalPromptTokens, | |
| totalCompletionTokens, | |
| totalCost, | |
| model: apiConfig.model, | |
| }; | |
| } | |
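| // Usage sketch (illustrative only — the route path, request shape, and header handling below are | |
| // assumptions about the calling Express handler, not part of this module): | |
| // | |
| //   app.post("/api/sessions/:id/chat", async (req, res) => { | |
| //     res.setHeader("Content-Type", "text/event-stream"); | |
| //     res.setHeader("Cache-Control", "no-cache"); | |
| //     const abort = new AbortController(); | |
| //     req.on("close", () => abort.abort()); | |
| //     const result = await runAgentLoop(req.body.messages, Number(req.params.id), req.body.config ?? {}, res, abort.signal); | |
| //     res.write(`event: done\ndata: ${JSON.stringify({ model: result.model, cost: result.totalCost })}\n\n`); | |
| //     res.end(); | |
| //   }); | |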
| /** | |
| * Process a streaming response from the LLM API (OpenAI-compatible SSE format) | |
| */ | |
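| // Incoming chunks follow the OpenAI streaming format (illustrative; tool name hypothetical): | |
| //   data: {"choices":[{"delta":{"content":"Hello"}}]} | |
| //   data: {"choices":[{"delta":{"tool_calls":[{"index":0,"id":"call_0","function":{"name":"bash","arguments":"{\"cmd\""}}]}}]} | |
| //   data: [DONE] | |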
| async function processStream( | |
| response: globalThis.Response, | |
| res: Response, | |
| signal?: AbortSignal | |
| ): Promise<{ | |
| content: string; | |
| toolCalls: Array<{ | |
| id: string; | |
| type: "function"; | |
| function: { name: string; arguments: string }; | |
| }>; | |
| usage?: { prompt_tokens: number; completion_tokens: number; cache_creation_input_tokens?: number; cache_read_input_tokens?: number }; | |
| }> { | |
| const reader = response.body?.getReader(); | |
| if (!reader) throw new Error("No response body"); | |
| const decoder = new TextDecoder(); | |
| let content = ""; | |
| const toolCalls: Map< | |
| number, | |
| { id: string; type: "function"; function: { name: string; arguments: string } } | |
| > = new Map(); | |
| let usage: { prompt_tokens: number; completion_tokens: number; cache_creation_input_tokens?: number; cache_read_input_tokens?: number } | undefined; | |
| let buffer = ""; | |
| try { | |
| while (true) { | |
| if (signal?.aborted) break; | |
| const { done, value } = await reader.read(); | |
| if (done) break; | |
| buffer += decoder.decode(value, { stream: true }); | |
| // Process complete SSE lines | |
| const lines = buffer.split("\n"); | |
| buffer = lines.pop() || ""; | |
| for (const line of lines) { | |
| if (!line.startsWith("data: ")) continue; | |
| const data = line.slice(6).trim(); | |
| if (data === "[DONE]") continue; | |
| try { | |
| const chunk = JSON.parse(data); | |
| const delta = chunk.choices?.[0]?.delta; | |
| // Detect API errors returned inside the SSE stream (e.g. DeepInfra "Operation not allowed") | |
| if (chunk.error) { | |
| const errMsg = chunk.error.message || chunk.error.type || JSON.stringify(chunk.error); | |
| console.error(`[agent] API error in stream: ${errMsg}`); | |
| throw new Error(`API error in stream: ${errMsg}`); | |
| } | |
| if (!delta) { | |
| if (chunk.usage) { | |
| usage = { | |
| prompt_tokens: chunk.usage.prompt_tokens || 0, | |
| completion_tokens: chunk.usage.completion_tokens || 0, | |
| cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0, | |
| cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0, | |
| }; | |
| } | |
| continue; | |
| } | |
| // Reasoning/thinking content (Qwen3 Thinking, DeepSeek-R1) | |
| // These models return reasoning in delta.reasoning_content before the actual response | |
| if (delta.reasoning_content) { | |
| sendSSE(res, "thinking_delta", { text: delta.reasoning_content }); | |
| } | |
| // Text content streaming | |
| if (delta.content) { | |
| content += delta.content; | |
| sendSSE(res, "text_delta", { text: delta.content }); | |
| } | |
| // Tool call streaming | |
| if (delta.tool_calls) { | |
| for (const tc of delta.tool_calls) { | |
| const idx = tc.index ?? 0; | |
| if (!toolCalls.has(idx)) { | |
| toolCalls.set(idx, { | |
| id: tc.id || `call_${idx}_${Date.now()}`, | |
| type: "function", | |
| function: { name: tc.function?.name || "", arguments: "" }, | |
| }); | |
| } | |
| const existing = toolCalls.get(idx)!; | |
| if (tc.id) existing.id = tc.id; | |
| if (tc.function?.name) existing.function.name = tc.function.name; | |
| if (tc.function?.arguments) { | |
| existing.function.arguments += tc.function.arguments; | |
| sendSSE(res, "tool_call_delta", { | |
| id: existing.id, | |
| name: existing.function.name, | |
| arguments: tc.function.arguments, | |
| }); | |
| } | |
| } | |
| } | |
| // Usage info | |
| if (chunk.usage) { | |
| usage = { | |
| prompt_tokens: chunk.usage.prompt_tokens || 0, | |
| completion_tokens: chunk.usage.completion_tokens || 0, | |
| cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0, | |
| cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0, | |
| }; | |
| } | |
| } catch (parseErr: any) { | |
| // Re-throw API errors (these are NOT malformed JSON — they're real errors) | |
| if (parseErr?.message?.startsWith('API error in stream:')) { | |
| throw parseErr; | |
| } | |
| // Skip genuinely malformed JSON chunks (partial SSE data, etc.) | |
| } | |
| } | |
| } | |
| } finally { | |
| reader.releaseLock(); | |
| } | |
| // Process remaining buffer (last line without trailing \n) | |
| if (buffer.trim() && buffer.startsWith("data: ")) { | |
| const data = buffer.slice(6).trim(); | |
| if (data !== "[DONE]") { | |
| try { | |
| const chunk = JSON.parse(data); | |
| const delta = chunk.choices?.[0]?.delta; | |
| if (delta?.content) { | |
| content += delta.content; | |
| sendSSE(res, "text_delta", { text: delta.content }); | |
| } | |
| if (delta?.tool_calls) { | |
| for (const tc of delta.tool_calls) { | |
| const idx = tc.index ?? 0; | |
| if (!toolCalls.has(idx)) { | |
| toolCalls.set(idx, { | |
| id: tc.id || `call_${idx}_${Date.now()}`, | |
| type: "function", | |
| function: { name: tc.function?.name || "", arguments: "" }, | |
| }); | |
| } | |
| const existing = toolCalls.get(idx)!; | |
| if (tc.id) existing.id = tc.id; | |
| if (tc.function?.name) existing.function.name = tc.function.name; | |
| if (tc.function?.arguments) existing.function.arguments += tc.function.arguments; | |
| } | |
| } | |
| if (chunk.usage) { | |
| usage = { | |
| prompt_tokens: chunk.usage.prompt_tokens || 0, | |
| completion_tokens: chunk.usage.completion_tokens || 0, | |
| cache_creation_input_tokens: chunk.usage.cache_creation_input_tokens || 0, | |
| cache_read_input_tokens: chunk.usage.cache_read_input_tokens || 0, | |
| }; | |
| } | |
| // Check finish_reason for truncation | |
| const finishReason = chunk.choices?.[0]?.finish_reason; | |
| if (finishReason === "length") { | |
| console.warn("[agent] Response truncated (finish_reason=length) — tool call args may be incomplete"); | |
| } | |
| } catch { /* skip malformed */ } | |
| } | |
| } | |
| // Original claw-code retries on empty response instead of throwing. | |
| if (content.length === 0 && toolCalls.size === 0) { | |
| console.warn("[agent] LLM returned empty response — will be retried by agent loop"); | |
| } | |
| return { | |
| content, | |
| toolCalls: Array.from(toolCalls.values()), | |
| usage, | |
| }; | |
| } | |
| /** | |
| * Estimate cost based on model and token counts | |
| */ | |
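| // Note: not called within this module — the agent loop derives cost via UsageTracker + pricingForModel above. | |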
| function estimateCost(model: string, promptTokens: number, completionTokens: number): number { | |
| // Pricing per 1M tokens — aligned with original claw-code model registry | |
| const pricing: Record<string, { input: number; output: number }> = { | |
| // Claw API / Anthropic | |
| "claude-opus-4-6": { input: 15.00, output: 75.00 }, | |
| "claude-sonnet-4-6": { input: 3.00, output: 15.00 }, | |
| "claude-haiku-4-5-20251213": { input: 0.80, output: 4.00 }, | |
| // xAI Grok | |
| "grok-3": { input: 3.00, output: 15.00 }, | |
| "grok-3-mini": { input: 0.30, output: 0.50 }, | |
| "grok-2": { input: 2.00, output: 10.00 }, | |
| // OpenAI | |
| "gpt-5.4": { input: 2.50, output: 15.00 }, | |
| "gpt-5.4-mini": { input: 0.40, output: 1.60 }, | |
| "gpt-5.3-codex": { input: 2.50, output: 10.00 }, | |
| "gpt-4.1": { input: 2.00, output: 8.00 }, | |
| "gpt-4.1-mini": { input: 0.40, output: 1.60 }, | |
| "o3": { input: 10.00, output: 40.00 }, | |
| "o4-mini": { input: 1.10, output: 4.40 }, | |
| // HuggingFace Inference API (free tier = $0, Pro tier = included in subscription) | |
| "XiaomiMiMo/MiMo-V2-Flash": { input: 0.00, output: 0.00 }, | |
| "Qwen/Qwen3-Coder-Next": { input: 0.00, output: 0.00 }, | |
| "Qwen/Qwen3-8B": { input: 0.00, output: 0.00 }, | |
| "Qwen/Qwen3-Coder-30B-A3B-Instruct": { input: 0.00, output: 0.00 }, | |
| "meta-llama/Llama-3.3-70B-Instruct": { input: 0.00, output: 0.00 }, | |
| "deepseek-ai/DeepSeek-V3.2": { input: 0.00, output: 0.00 }, | |
| "deepseek-ai/DeepSeek-R1": { input: 0.00, output: 0.00 }, | |
| // OpenRouter variants | |
| "anthropic/claude-opus-4-6": { input: 15.00, output: 75.00 }, | |
| "anthropic/claude-sonnet-4-6": { input: 3.00, output: 15.00 }, | |
| "google/gemini-2.5-pro": { input: 1.25, output: 10.00 }, | |
| "google/gemini-2.5-flash": { input: 0.15, output: 0.60 }, | |
| }; | |
| const rates = pricing[model] || { input: 1.00, output: 3.00 }; | |
| return (promptTokens * rates.input + completionTokens * rates.output) / 1_000_000; | |
| } | |
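| // e.g. estimateCost("claude-sonnet-4-6", 10_000, 2_000) = (10000 * 3.00 + 2000 * 15.00) / 1_000_000 = $0.06 | |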