From bd11ee58268cfcbe83d8b1bdb84f819037425a67 Mon Sep 17 00:00:00 2001 From: suharvest Date: Sat, 18 Apr 2026 07:15:00 +0800 Subject: [PATCH 1/7] feat: auto-start Monitor on rescue dispatch via PostToolUse hook When Claude dispatches an opencode rescue task (via Agent tool or direct companion Bash call), this hook detects the new task-xxx id in the tool response and injects a system-reminder instructing Claude to start a persistent Monitor covering that id. On terminal states the Monitor emits a READY line pointing to the companion result command so Claude fetches the full payload and summarizes it for the user without needing to be asked. - New plugins/opencode/scripts/post-tool-use-monitor-hook.mjs - hooks.json: register PostToolUse (matcher: Agent|Bash, timeout 5s) Gracefully no-ops on non-matching tool output or missing companion markers. --- plugins/opencode/hooks/hooks.json | 12 ++ .../scripts/post-tool-use-monitor-hook.mjs | 159 ++++++++++++++++++ 2 files changed, 171 insertions(+) create mode 100644 plugins/opencode/scripts/post-tool-use-monitor-hook.mjs diff --git a/plugins/opencode/hooks/hooks.json b/plugins/opencode/hooks/hooks.json index c76f993..8c193dd 100644 --- a/plugins/opencode/hooks/hooks.json +++ b/plugins/opencode/hooks/hooks.json @@ -33,6 +33,18 @@ } ] } + ], + "PostToolUse": [ + { + "matcher": "Agent|Bash", + "hooks": [ + { + "type": "command", + "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/post-tool-use-monitor-hook.mjs\"", + "timeout": 5 + } + ] + } ] } } diff --git a/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs new file mode 100644 index 0000000..95d856d --- /dev/null +++ b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs @@ -0,0 +1,159 @@ +#!/usr/bin/env node + +// PostToolUse hook: watches for rescue-task dispatch in tool responses and +// injects a reminder that tells Claude to (a) start/refresh a Monitor +// covering the new task id(s), and (b) fetch + 
summarize the companion +// `result` payload when Monitor reports a terminal state. +// +// Why in a hook: the main Claude thread has no built-in way to observe +// background codex/opencode tasks. Without this, dispatching a rescue is +// fire-and-forget — the user has to ask for progress manually. The hook +// makes every rescue dispatch automatically get monitored and reported +// on, matching the UX of in-process subagents. + +import fs from "node:fs"; +import path from "node:path"; +import process from "node:process"; +import { fileURLToPath } from "node:url"; + +function readHookInput() { + try { + const raw = fs.readFileSync(0, "utf8").trim(); + if (!raw) return {}; + return JSON.parse(raw); + } catch { + return {}; + } +} + +// Companion task ids look like `task-moNNNNNN-NNNNNN`. +const TASK_ID_RE = /\btask-[a-z0-9]{6,}-[a-z0-9]{4,}\b/g; + +// Only react to responses that are unambiguously from the opencode companion, +// to avoid false positives on arbitrary text containing a task-like token. 
+const OPENCODE_MARKERS = [ + /OpenCode task started/i, + /opencode-companion\.mjs/, + /opencode:opencode-rescue/, + /opencode rescue/i, +]; + +function extractResponseText(response) { + if (response == null) return ""; + if (typeof response === "string") return response; + if (typeof response === "object") { + if (typeof response.result === "string") return response.result; + if (typeof response.content === "string") return response.content; + return JSON.stringify(response); + } + return String(response); +} + +function resolveCompanionPath() { + const here = fileURLToPath(import.meta.url); + return path.join(path.dirname(here), "opencode-companion.mjs"); +} + +function buildMonitorScript(ids, companionPath) { + const quoted = ids.map((id) => `"${id}"`).join(" "); + // The poll loop: + // - reads companion status JSON per id every 30s + // - emits a single line whenever status/phase changes + // - exits the loop as soon as every tracked id is terminal so the + // Monitor process ends cleanly; the main thread's Monitor tool sees + // exit and stops spawning events. + // + // stdout is the event stream — stay selective. On terminal states we + // emit a `READY: ` line so Claude knows the exact command to run + // to fetch the full result payload. 
+ return [ + "set -u", + `COMP=${JSON.stringify(companionPath)}`, + `IDS=(${quoted})`, + "declare -A prev", + 'for id in "${IDS[@]}"; do prev[$id]=""; done', + "while true; do", + " all_done=1", + ' for id in "${IDS[@]}"; do', + ' json=$(node "$COMP" status "$id" --json 2>/dev/null || printf "{}")', + " st=$(printf '%s' \"$json\" | node -e 'let s=\"\";process.stdin.on(\"data\",d=>s+=d).on(\"end\",()=>{try{const j=JSON.parse(s);console.log((j.job?.status||\"unknown\")+\"/\"+(j.job?.phase||\"\"))}catch(e){console.log(\"parse-err/\")}}))')", + ' if [ "$st" != "${prev[$id]}" ]; then', + ' ts=$(date +%H:%M:%S)', + ' echo "[$ts] opencode $id: ${prev[$id]:-none} → $st"', + ' case "$st" in', + " completed/*|failed/*|cancelled/*)", + ` echo "[$ts] READY opencode $id: run \\\"node $COMP result $id\\\" to fetch the full report"`, + " ;;", + " esac", + ' prev[$id]="$st"', + " fi", + ' case "$st" in completed/*|failed/*|cancelled/*) ;; *) all_done=0 ;; esac', + " done", + " if [ $all_done -eq 1 ]; then", + " echo \"[$(date +%H:%M:%S)] opencode: all tracked tasks terminal — exiting monitor\"", + " break", + " fi", + " sleep 30", + "done", + ].join("\n"); +} + +function buildReminder(ids, script) { + const idList = ids.join(", "); + return [ + "", + `You just dispatched ${ids.length} opencode rescue task(s): ${idList}.`, + "", + "Before you respond to the user, start a persistent Monitor so this session can observe task completion automatically. Use the `Monitor` tool:", + "", + '- description: "opencode rescue task state"', + "- persistent: true", + "- timeout_ms: 3600000", + "- command: the shell loop below, verbatim", + "", + "```bash", + script, + "```", + "", + "Behavior contract — follow this without being asked:", + "1. If a Monitor is already running in this session covering other task ids, stop it (`TaskStop`) and start a new one whose IDS array is the union of all currently-running rescue task ids. One Monitor per session.", + "2. 
When a Monitor event reports a task transitioning to `completed/…`, `failed/…`, or `cancelled/…`, immediately run the `READY` line's command (`node result `) in Bash, read the output, and surface a short summary of what the rescue task did (or failed at) to the user. Do not wait for the user to ask.", + "3. If the user explicitly asked you to skip monitoring, do nothing here.", + "", + ].join("\n"); +} + +function main() { + const input = readHookInput(); + const toolName = input.tool_name || ""; + // Rescue dispatch happens through the Agent tool wrapper; main thread may + // also call companion directly via Bash. Ignore other tools. + if (toolName !== "Agent" && toolName !== "Bash") return; + + const response = extractResponseText(input.tool_response); + if (!response) return; + if (!OPENCODE_MARKERS.some((r) => r.test(response))) return; + + const ids = [...new Set(response.match(TASK_ID_RE) || [])]; + if (ids.length === 0) return; + + const companionPath = resolveCompanionPath(); + const script = buildMonitorScript(ids, companionPath); + const additionalContext = buildReminder(ids, script); + + process.stdout.write( + JSON.stringify({ + hookSpecificOutput: { + hookEventName: "PostToolUse", + additionalContext, + }, + }), + ); +} + +try { + main(); +} catch { + // Best-effort — never block tool use on hook failure. + process.exit(0); +} From 02160b84477928e3fa4327432ec997b5c5468283 Mon Sep 17 00:00:00 2001 From: suharvest Date: Sat, 18 Apr 2026 07:26:58 +0800 Subject: [PATCH 2/7] fix(monitor): inline result fetch + fix parse-status JS syntax On terminal state the Monitor script now calls companion result and emits the truncated summary inline (bounded by OPENCODE_MONITOR_RESULT_CHARS, default 1500). Claude sees the result summary directly in the Monitor event and no longer needs a follow-up Bash call. Also fixes an extra trailing ) in the inline node -e expression that would have caused the status parser to syntax-error at runtime. 
--- .../scripts/post-tool-use-monitor-hook.mjs | 32 ++++++++++++------- 1 file changed, 20 insertions(+), 12 deletions(-) diff --git a/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs index 95d856d..18a1766 100644 --- a/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs +++ b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs @@ -56,33 +56,41 @@ function resolveCompanionPath() { function buildMonitorScript(ids, companionPath) { const quoted = ids.map((id) => `"${id}"`).join(" "); - // The poll loop: - // - reads companion status JSON per id every 30s + // The poll loop runs inside a Monitor child process: + // - polls companion status JSON per id every 30s // - emits a single line whenever status/phase changes - // - exits the loop as soon as every tracked id is terminal so the - // Monitor process ends cleanly; the main thread's Monitor tool sees - // exit and stops spawning events. - // - // stdout is the event stream — stay selective. On terminal states we - // emit a `READY: ` line so Claude knows the exact command to run - // to fetch the full result payload. 
+ // - on terminal state, fetches `companion result `, truncates to + // a bounded size, and prints it as multi-line output; Monitor batches + // lines within ~200ms into one notification, so the main thread + // sees a single event carrying "task done + full summary" without + // needing a follow-up tool call to fetch the result + // - exits when every tracked id is terminal (Monitor process ends + // cleanly, no runaway background poller) return [ "set -u", `COMP=${JSON.stringify(companionPath)}`, `IDS=(${quoted})`, + "RESULT_MAX_CHARS=${OPENCODE_MONITOR_RESULT_CHARS:-1500}", "declare -A prev", 'for id in "${IDS[@]}"; do prev[$id]=""; done', "while true; do", " all_done=1", ' for id in "${IDS[@]}"; do', ' json=$(node "$COMP" status "$id" --json 2>/dev/null || printf "{}")', - " st=$(printf '%s' \"$json\" | node -e 'let s=\"\";process.stdin.on(\"data\",d=>s+=d).on(\"end\",()=>{try{const j=JSON.parse(s);console.log((j.job?.status||\"unknown\")+\"/\"+(j.job?.phase||\"\"))}catch(e){console.log(\"parse-err/\")}}))')", + " st=$(printf '%s' \"$json\" | node -e 'let s=\"\";process.stdin.on(\"data\",d=>s+=d).on(\"end\",()=>{try{const j=JSON.parse(s);console.log((j.job?.status||\"unknown\")+\"/\"+(j.job?.phase||\"\"))}catch(e){console.log(\"parse-err/\")}})')", ' if [ "$st" != "${prev[$id]}" ]; then', ' ts=$(date +%H:%M:%S)', ' echo "[$ts] opencode $id: ${prev[$id]:-none} → $st"', ' case "$st" in', " completed/*|failed/*|cancelled/*)", - ` echo "[$ts] READY opencode $id: run \\\"node $COMP result $id\\\" to fetch the full report"`, + ' result=$(node "$COMP" result "$id" 2>/dev/null || true)', + " # Truncate defensively so Monitor output stays bounded.", + ' summary=$(printf "%s" "$result" | head -c "$RESULT_MAX_CHARS")', + ' echo "[$ts] opencode $id TERMINAL=$st — result summary:"', + ' echo "--- result-begin $id ---"', + ' printf "%s" "$summary"', + ' echo ""', + ' echo "--- result-end $id ---"', " ;;", " esac", ' prev[$id]="$st"', @@ -117,7 +125,7 @@ function 
buildReminder(ids, script) { "", "Behavior contract — follow this without being asked:", "1. If a Monitor is already running in this session covering other task ids, stop it (`TaskStop`) and start a new one whose IDS array is the union of all currently-running rescue task ids. One Monitor per session.", - "2. When a Monitor event reports a task transitioning to `completed/…`, `failed/…`, or `cancelled/…`, immediately run the `READY` line's command (`node result `) in Bash, read the output, and surface a short summary of what the rescue task did (or failed at) to the user. Do not wait for the user to ask.", + "2. The Monitor script above already fetches `companion result ` and emits the summary inline on terminal state — each terminal event carries the full result block between `--- result-begin ---` and `--- result-end ---` markers. You do NOT need to run a follow-up Bash call to get the result; just read the Monitor event and surface a short summary of what the rescue task did (or failed at) to the user.", "3. If the user explicitly asked you to skip monitoring, do nothing here.", "", ].join("\n"); From 31febd0b042ee20e50cc7b90d55c185323936c29 Mon Sep 17 00:00:00 2001 From: suharvest Date: Sat, 18 Apr 2026 07:31:23 +0800 Subject: [PATCH 3/7] fix(server): race sendPrompt against completion watcher + env-tunable timeouts OpenCode server's POST /session/:id/message occasionally fails to close its HTTP response after the session emits the terminal assistant message (observed with glm-5 backend, opencode 1.4.x). Without this fix, sendPrompt hangs until AbortSignal fires, leaving the companion job stuck in 'investigating' status until the (previously 5 min) timeout. Changes: - Race the POST fetch against a /session/:id/message polling watcher; whichever returns first aborts the other. Watcher only accepts a completion whose info.time.completed >= prompt startedAt. 
- Bump generic request() timeout and sendPrompt timeout to 30 min, configurable via OPENCODE_REQUEST_TIMEOUT_MS / OPENCODE_PROMPT_TIMEOUT_MS env vars. - Completion poll interval configurable via OPENCODE_COMPLETION_POLL_MS (default 5s). --- .../opencode/scripts/lib/opencode-server.mjs | 96 ++++++++++++++++--- 1 file changed, 84 insertions(+), 12 deletions(-) diff --git a/plugins/opencode/scripts/lib/opencode-server.mjs b/plugins/opencode/scripts/lib/opencode-server.mjs index f4192a8..1d32943 100644 --- a/plugins/opencode/scripts/lib/opencode-server.mjs +++ b/plugins/opencode/scripts/lib/opencode-server.mjs @@ -8,6 +8,12 @@ const DEFAULT_PORT = 4096; const DEFAULT_HOST = "127.0.0.1"; const SERVER_START_TIMEOUT = 30_000; +// Long-running tasks (e.g. engine builds, large refactors) can easily exceed +// the old 5-10 min caps, causing `fetch failed` at a fixed deadline. Default +// to 30 min; override via env for even longer workloads. +const REQUEST_TIMEOUT_MS = Number(process.env.OPENCODE_REQUEST_TIMEOUT_MS) || 1_800_000; +const PROMPT_TIMEOUT_MS = Number(process.env.OPENCODE_PROMPT_TIMEOUT_MS) || 1_800_000; + /** * Check if an OpenCode server is already running on the given port. * @param {string} host @@ -87,7 +93,7 @@ export function createClient(baseUrl, opts = {}) { method, headers, body: body != null ? JSON.stringify(body) : undefined, - signal: AbortSignal.timeout(300_000), + signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS), }); if (!res.ok) { const text = await res.text().catch(() => ""); @@ -127,6 +133,17 @@ export function createClient(baseUrl, opts = {}) { /** * Send a prompt (synchronous / streaming). * Returns the full response text from SSE stream. + * + * NOTE: OpenCode's POST /session/:id/message occasionally fails to close + * its HTTP response body after the session emits its terminal assistant + * message (observed against glm-5 backend, opencode 1.4.x). 
Relying on + * res.json() alone means the caller hangs until AbortSignal fires, which + * breaks downstream job-completion detection in the companion. + * + * Workaround: race the fetch against a session-completion watcher that + * polls GET /session/:id/message. When the latest assistant message has + * info.time.completed set AND finish !== undefined, the session is done; + * we abort the hanging fetch and synthesize the response from the poll. */ sendPrompt: async (sessionId, promptText, opts = {}) => { const body = { @@ -136,19 +153,74 @@ export function createClient(baseUrl, opts = {}) { if (opts.model) body.model = opts.model; if (opts.system) body.system = opts.system; - const res = await fetch(`${baseUrl}/session/${sessionId}/message`, { - method: "POST", - headers, - body: JSON.stringify(body), - signal: AbortSignal.timeout(600_000), // 10 min for long tasks - }); + const ac = new AbortController(); + const timeoutId = setTimeout(() => ac.abort(new Error("prompt timeout")), PROMPT_TIMEOUT_MS); + const startedAt = Date.now(); + // Grace period so we don't mistake "session had no prior activity" for + // completion before the new prompt has even begun generating. 
+ const MIN_POLL_DELAY_MS = 5_000; + const POLL_INTERVAL_MS = Number(process.env.OPENCODE_COMPLETION_POLL_MS) || 5_000; - if (!res.ok) { - const text = await res.text().catch(() => ""); - throw new Error(`OpenCode prompt failed ${res.status}: ${text}`); - } + const fetchPromise = (async () => { + const res = await fetch(`${baseUrl}/session/${sessionId}/message`, { + method: "POST", + headers, + body: JSON.stringify(body), + signal: ac.signal, + }); + if (!res.ok) { + const text = await res.text().catch(() => ""); + throw new Error(`OpenCode prompt failed ${res.status}: ${text}`); + } + return { source: "fetch", data: await res.json() }; + })(); - return res.json(); + const watcherPromise = (async () => { + // Wait briefly so the new generation has a chance to start and we + // don't latch onto a stale completed message from before this prompt. + await new Promise((r) => setTimeout(r, MIN_POLL_DELAY_MS)); + while (!ac.signal.aborted) { + try { + const params = new URLSearchParams({ limit: "1" }); + const r = await fetch( + `${baseUrl}/session/${sessionId}/message?${params.toString()}`, + { headers, signal: AbortSignal.timeout(10_000) }, + ); + if (r.ok) { + const arr = await r.json(); + const last = Array.isArray(arr) ? arr[arr.length - 1] : null; + const info = last?.info; + // Only treat assistant messages created *after* this prompt + // started as a completion signal for this call. + if ( + info && + info.role === "assistant" && + typeof info.time?.completed === "number" && + info.time.completed >= startedAt && + typeof info.finish === "string" + ) { + return { source: "watcher", data: last }; + } + } + } catch { + // Ignore transient poll errors; keep waiting. + } + await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS)); + } + throw new Error("watcher aborted"); + })(); + + try { + const winner = await Promise.race([fetchPromise, watcherPromise]); + // Whichever arrived first, cancel the other. 
+ ac.abort(); + // Swallow the loser's rejection to avoid unhandled rejection noise. + fetchPromise.catch(() => {}); + watcherPromise.catch(() => {}); + return winner.data; + } finally { + clearTimeout(timeoutId); + } }, /** From 6d48b0fb274dbeca1f6a963a2a66a163187196b6 Mon Sep 17 00:00:00 2001 From: suharvest Date: Sat, 18 Apr 2026 08:34:17 +0800 Subject: [PATCH 4/7] fix(status): honor --json flag and single-task lookup MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `status` handler was ignoring argv entirely — `--json` was silently dropped and positional task ids were never matched. Tooling that piped status through jq would choke on the markdown fallback with "parse error: Invalid numeric literal". Now: - `status --json` emits a workspace snapshot as JSON ({workspaceRoot, running, latestFinished, recent}) - `status <id> [--json]` looks up a single job by id/prefix. JSON form is {workspaceRoot, job: <job|null>} so callers can always read .job.status safely. - `status --all` widens from session-scoped to all-sessions (useful for cross-session observers like monitor scripts) - Markdown output unchanged for the no-flag case. 
--- .../opencode/scripts/opencode-companion.mjs | 53 ++++++++++++++++++- 1 file changed, 51 insertions(+), 2 deletions(-) diff --git a/plugins/opencode/scripts/opencode-companion.mjs b/plugins/opencode/scripts/opencode-companion.mjs index 48e526b..7d57a19 100644 --- a/plugins/opencode/scripts/opencode-companion.mjs +++ b/plugins/opencode/scripts/opencode-companion.mjs @@ -13,7 +13,7 @@ import { isOpencodeInstalled, getOpencodeVersion, spawnDetached } from "./lib/pr import { isServerRunning, ensureServer, createClient, connect } from "./lib/opencode-server.mjs"; import { resolveWorkspace } from "./lib/workspace.mjs"; import { loadState, updateState, upsertJob, generateJobId, jobDataPath } from "./lib/state.mjs"; -import { buildStatusSnapshot, resolveResultJob, resolveCancelableJob, enrichJob } from "./lib/job-control.mjs"; +import { buildStatusSnapshot, resolveResultJob, resolveCancelableJob, enrichJob, matchJobReference } from "./lib/job-control.mjs"; import { createJobRecord, runTrackedJob, getClaudeSessionId } from "./lib/tracked-jobs.mjs"; import { renderStatus, renderResult, renderReview, renderSetup } from "./lib/render.mjs"; import { buildReviewPrompt, buildTaskPrompt } from "./lib/prompts.mjs"; @@ -424,11 +424,60 @@ async function handleTaskResumeCandidate(argv) { // ------------------------------------------------------------------ async function handleStatus(argv) { + const { options, positional } = parseArgs(argv ?? [], { + booleanOptions: ["json", "all"], + }); + const workspace = await resolveWorkspace(); const state = loadState(workspace); const sessionId = getClaudeSessionId(); + const jobs = state.jobs ?? []; + const wantJson = !!options.json; + // --all widens the snapshot filter to every session's jobs; without --all we + // still filter to the current Claude session for the existing markdown UX. + const sessionFilter = options.all ? undefined : sessionId; + const ref = positional?.[0]; + + // Single-task query — `status [--json]`. 
+ if (ref) { + const { job, ambiguous } = matchJobReference(jobs, ref); + if (ambiguous) { + if (wantJson) { + console.log(JSON.stringify({ workspaceRoot: workspace, job: null, error: "ambiguous" })); + } else { + console.error(`Ambiguous job reference "${ref}". Please provide a more specific ID prefix.`); + } + process.exit(ambiguous ? 2 : 0); + return; + } + if (wantJson) { + const enriched = job ? enrichJob(job, workspace) : null; + console.log(JSON.stringify({ workspaceRoot: workspace, job: enriched })); + return; + } + if (!job) { + console.log(`No job found for "${ref}" in workspace ${workspace}.`); + return; + } + console.log(renderStatus({ running: [], latestFinished: null, recent: [enrichJob(job, workspace)] })); + return; + } + + const snapshot = buildStatusSnapshot(jobs, workspace, { sessionId: sessionFilter }); + + if (wantJson) { + // Machine-readable shape mirrors the single-task case so callers can treat + // both uniformly: a `.job` field is present for single-task, otherwise + // `.running`/`.recent` arrays describe the whole workspace snapshot. + console.log(JSON.stringify({ + workspaceRoot: workspace, + running: snapshot.running, + latestFinished: snapshot.latestFinished, + recent: snapshot.recent, + })); + return; + } - const snapshot = buildStatusSnapshot(state.jobs ?? [], workspace, { sessionId }); console.log(renderStatus(snapshot)); } From c378dfdfc4d6e0abf2a514fd16f3616994f74b77 Mon Sep 17 00:00:00 2001 From: suharvest Date: Sat, 18 Apr 2026 09:27:33 +0800 Subject: [PATCH 5/7] feat(monitor): surface progress activity and heartbeat MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously the Monitor script only emitted on status/phase transitions. For long-running tasks that sit in 'running/investigating' for many minutes, the user saw one initial event and then nothing — no way to tell if the task was still alive. 
Now: - Include the last line of progressPreview in the state signature so any new log activity inside the task triggers an event (with elapsed time + latest log snippet) - Emit a heartbeat every HEARTBEAT_POLLS ticks (default 10 = ~5min) with current status/phase/elapsed even when nothing has changed - Both tunable via OPENCODE_MONITOR_HEARTBEAT_POLLS env var --- .../scripts/post-tool-use-monitor-hook.mjs | 69 ++++++++++++------- 1 file changed, 43 insertions(+), 26 deletions(-) diff --git a/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs index 18a1766..77900a1 100644 --- a/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs +++ b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs @@ -58,44 +58,61 @@ function buildMonitorScript(ids, companionPath) { const quoted = ids.map((id) => `"${id}"`).join(" "); // The poll loop runs inside a Monitor child process: // - polls companion status JSON per id every 30s - // - emits a single line whenever status/phase changes - // - on terminal state, fetches `companion result `, truncates to - // a bounded size, and prints it as multi-line output; Monitor batches - // lines within ~200ms into one notification, so the main thread - // sees a single event carrying "task done + full summary" without - // needing a follow-up tool call to fetch the result - // - exits when every tracked id is terminal (Monitor process ends - // cleanly, no runaway background poller) + // - emits an event when status/phase OR the latest progressPreview log + // line changes, so long-running tasks surface intermediate activity + // - emits a heartbeat every HEARTBEAT_POLLS ticks (default 10 = ~5min) + // so the user sees signs of life even when nothing has changed + // - on terminal state, fetches `companion result `, truncates, and + // prints a multi-line summary so the main thread gets a single batched + // event carrying the full report + // - exits when every tracked id is 
terminal return [ "set -u", `COMP=${JSON.stringify(companionPath)}`, `IDS=(${quoted})`, "RESULT_MAX_CHARS=${OPENCODE_MONITOR_RESULT_CHARS:-1500}", + "HEARTBEAT_POLLS=${OPENCODE_MONITOR_HEARTBEAT_POLLS:-10}", "declare -A prev", - 'for id in "${IDS[@]}"; do prev[$id]=""; done', + "declare -A hb", + 'for id in "${IDS[@]}"; do prev[$id]=""; hb[$id]=0; done', "while true; do", " all_done=1", ' for id in "${IDS[@]}"; do', ' json=$(node "$COMP" status "$id" --json 2>/dev/null || printf "{}")', - " st=$(printf '%s' \"$json\" | node -e 'let s=\"\";process.stdin.on(\"data\",d=>s+=d).on(\"end\",()=>{try{const j=JSON.parse(s);console.log((j.job?.status||\"unknown\")+\"/\"+(j.job?.phase||\"\"))}catch(e){console.log(\"parse-err/\")}})')", - ' if [ "$st" != "${prev[$id]}" ]; then', + " fields=$(printf '%s' \"$json\" | node -e 'let s=\"\";process.stdin.on(\"data\",d=>s+=d).on(\"end\",()=>{try{const j=JSON.parse(s);const jb=j.job||{};const prog=String(jb.progressPreview||\"\").split(\"\\n\").filter(Boolean);const last=(prog[prog.length-1]||\"\").replace(/[|\\r\\n]/g,\" \").slice(0,200);process.stdout.write([jb.status||\"unknown\",jb.phase||\"\",jb.elapsed||\"\",last].join(\"|\"))}catch(e){process.stdout.write(\"parse-err|||\")}})')", + " IFS='|' read -r st phase elapsed last <<< \"$fields\"", + ' sig="${st}/${phase}|${last}"', + ' if [ "$sig" != "${prev[$id]}" ]; then', ' ts=$(date +%H:%M:%S)', - ' echo "[$ts] opencode $id: ${prev[$id]:-none} → $st"', - ' case "$st" in', - " completed/*|failed/*|cancelled/*)", - ' result=$(node "$COMP" result "$id" 2>/dev/null || true)', - " # Truncate defensively so Monitor output stays bounded.", - ' summary=$(printf "%s" "$result" | head -c "$RESULT_MAX_CHARS")', - ' echo "[$ts] opencode $id TERMINAL=$st — result summary:"', - ' echo "--- result-begin $id ---"', - ' printf "%s" "$summary"', - ' echo ""', - ' echo "--- result-end $id ---"', - " ;;", - " esac", - ' prev[$id]="$st"', + ' if [ -n "$last" ]; then', + ' echo "[$ts] opencode $id: 
$st/$phase (elapsed $elapsed) — $last"', + " else", + ' echo "[$ts] opencode $id: $st/$phase (elapsed $elapsed)"', + " fi", + ' prev[$id]="$sig"', + " hb[$id]=0", + " else", + ' hb[$id]=$(( ${hb[$id]} + 1 ))', + ' if [ "${hb[$id]}" -ge "$HEARTBEAT_POLLS" ]; then', + ' ts=$(date +%H:%M:%S)', + ' echo "[$ts] opencode $id: heartbeat — still $st/$phase (elapsed $elapsed)"', + " hb[$id]=0", + " fi", " fi", - ' case "$st" in completed/*|failed/*|cancelled/*) ;; *) all_done=0 ;; esac', + ' case "$st" in', + " completed|failed|cancelled)", + ' result=$(node "$COMP" result "$id" 2>/dev/null || true)', + " # Truncate defensively so Monitor output stays bounded.", + ' summary=$(printf "%s" "$result" | head -c "$RESULT_MAX_CHARS")', + ' ts=$(date +%H:%M:%S)', + ' echo "[$ts] opencode $id TERMINAL=$st — result summary:"', + ' echo "--- result-begin $id ---"', + ' printf "%s" "$summary"', + ' echo ""', + ' echo "--- result-end $id ---"', + " ;;", + ' *) all_done=0 ;;', + " esac", " done", " if [ $all_done -eq 1 ]; then", " echo \"[$(date +%H:%M:%S)] opencode: all tracked tasks terminal — exiting monitor\"", From ec2757353e7e3a9a7b9f9e79a123b88013112566 Mon Sep 17 00:00:00 2001 From: suharvest Date: Sat, 18 Apr 2026 14:37:20 +0800 Subject: [PATCH 6/7] feat(auto-heal): reconcile stuck jobs via session terminal probe Long-running background tasks occasionally get stuck in investigating status after the OpenCode session has finished server-side (POST body never closes, watcher misses the terminal signal, or task-worker dies). - New lib/auto-heal.mjs probes GET /session/:id/message?limit=1 and transitions the local job to completed when the last assistant message has info.finish set and info.time.completed >= job.startedAt. If the task-worker PID is dead and the session is silent >60s, the job is marked failed with a clear reason. 
- status, result, and task-resume-candidate run a silent heal pass before reading state so they never report a false "running" for a session that is actually complete. - New `companion.mjs heal` subcommand scans and reconciles in bulk, with --dry-run / --json / --all flags. - Heal is a no-op when the server is unreachable, so offline use of status/result keeps working. --- README.md | 27 ++ plugins/opencode/scripts/lib/auto-heal.mjs | 283 ++++++++++++++++++ .../opencode/scripts/opencode-companion.mjs | 90 +++++- 3 files changed, 397 insertions(+), 3 deletions(-) create mode 100644 plugins/opencode/scripts/lib/auto-heal.mjs diff --git a/README.md b/README.md index 8c14ad3..a18b686 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,33 @@ To check your configured providers: When enabled via `/opencode:setup --enable-review-gate`, a Stop hook runs a targeted OpenCode review on Claude's response. If issues are found, the stop is blocked so Claude can address them first. Warning: can create long-running loops and drain usage limits. +## Job Auto-Heal + +Long-running tasks spawned via `/opencode:task --background` occasionally get +stuck in `investigating` status even after the OpenCode session has finished +server-side — typically because `POST /session/:id/message` fails to close its +HTTP body, the task-worker is killed, or the companion's watcher misses the +terminal signal. + +The companion now reconciles this automatically: + +- `companion.mjs status` and `companion.mjs result` run a silent auto-heal + pass before they read state, so they never report a false "running" state + for a session that is actually complete. +- `companion.mjs heal` scans for stuck jobs and reconciles them in bulk. Pass + `--dry-run` to preview, `--json` for machine-readable output, and `--all` + to include jobs from other Claude sessions. + +Each heal check queries `GET /session/:id/message?limit=1`. 
If the last +assistant message has `info.finish` set and `info.time.completed >= job.startedAt`, +the job is transitioned to `completed` and the message text is persisted to +the job data file. If the task-worker PID is dead and the session has been +silent for >60 s, the job is transitioned to `failed` with a clear reason. + +If the OpenCode server is unreachable, auto-heal is a no-op — status/result +commands still work, they just can't move stuck jobs forward until the server +comes back. + ## Troubleshooting
diff --git a/plugins/opencode/scripts/lib/auto-heal.mjs b/plugins/opencode/scripts/lib/auto-heal.mjs new file mode 100644 index 0000000..88a555b --- /dev/null +++ b/plugins/opencode/scripts/lib/auto-heal.mjs @@ -0,0 +1,283 @@ +// Session-level auto-heal for tracked jobs. +// +// Background: task-worker subprocesses wrap `client.sendPrompt(sid, ...)` in +// runTrackedJob so that on successful return the job flips status→completed +// and the response text is persisted to jobDataPath. But sendPrompt can hang +// or the worker can be killed before that return happens — even though the +// OpenCode session itself completed cleanly server-side. The job then stays +// in a non-terminal state ("investigating"/"running") forever and downstream +// Monitor scripts never see the true finish. +// +// This module provides a best-effort reconciliation pass: given a job with +// an `opencodeSessionId`, query the OpenCode server for the last assistant +// message in that session. If it looks terminal (info.finish set and +// completed >= job.startedAt), upsert the job as completed and persist the +// text. If the worker process is gone and the session has been idle long +// enough, mark as failed with a clear error message. +// +// All functions are no-ops (or log to stderr and return the original job) +// when the server is unreachable, so callers can sprinkle autoHealJob at +// the top of status-reading paths without wrapping in try/catch themselves. + +import fs from "node:fs"; +import path from "node:path"; + +import { ensureDir } from "./fs.mjs"; +import { upsertJob, jobDataPath } from "./state.mjs"; + +const DEFAULT_BASE_URL = "http://127.0.0.1:4096"; +// A worker/session can be legitimately silent for a while (big model thinking, +// slow tool) — only declare it dead after >60s of no session activity AND no +// live task-worker process. 60s matches the spec. 
+const STALE_IDLE_MS = 60_000; + +function buildHeaders() { + const headers = { "Content-Type": "application/json" }; + if (process.env.OPENCODE_SERVER_PASSWORD) { + const user = process.env.OPENCODE_SERVER_USERNAME ?? "opencode"; + const cred = Buffer.from(`${user}:${process.env.OPENCODE_SERVER_PASSWORD}`).toString("base64"); + headers["Authorization"] = `Basic ${cred}`; + } + return headers; +} + +/** + * True if the given PID is currently alive. Treats missing/invalid PID as dead. + * @param {number|undefined|null} pid + * @returns {boolean} + */ +export function isProcessAlive(pid) { + if (!pid || !Number.isInteger(pid) || pid <= 0) return false; + try { + // Signal 0 is a permission/existence probe — no signal delivered. + process.kill(pid, 0); + return true; + } catch (err) { + // ESRCH = no such process. EPERM = process exists but we can't signal it + // (still alive from our perspective). + return err.code === "EPERM"; + } +} + +/** + * Extract visible text from an OpenCode message `parts` array. + * @param {Array|undefined} parts + * @returns {string} + */ +function extractPartsText(parts) { + if (!Array.isArray(parts)) return ""; + return parts + .filter((p) => p?.type === "text" && typeof p.text === "string") + .map((p) => p.text) + .join("\n"); +} + +/** + * Query the opencode server for the terminal state of a session. + * + * Returns: + * { terminal: true, finish, completed, text, info } when the last assistant + * message has info.time.completed >= startedAt AND typeof info.finish === 'string'. + * { terminal: false, reachable: true, lastUpdatedAt, lastInfo } when session exists but no terminal marker. + * { terminal: false, reachable: false, error } when the request itself fails (network error, timeout, unparsable body); a non-2xx HTTP response instead yields { terminal: false, reachable: true, error }.
+ * + * @param {string} baseUrl + * @param {string} sessionId + * @param {number} startedAtMs - epoch ms; only treat completions >= this as ours + * @param {object} [headers] + */ +export async function probeSessionTerminal(baseUrl, sessionId, startedAtMs, headers) { + const h = headers ?? buildHeaders(); + try { + // limit=1 → last message only. On glm-5 / opencode 1.4.x this returns + // an array of { info, parts } objects. + const res = await fetch(`${baseUrl}/session/${sessionId}/message?limit=1`, { + method: "GET", + headers: h, + signal: AbortSignal.timeout(10_000), + }); + if (!res.ok) { + return { terminal: false, reachable: true, error: `HTTP ${res.status}` }; + } + const arr = await res.json(); + const last = Array.isArray(arr) ? arr[arr.length - 1] : null; + const info = last?.info; + if (!info) { + return { terminal: false, reachable: true, lastUpdatedAt: 0, lastInfo: null }; + } + + const completed = typeof info.time?.completed === "number" ? info.time.completed : 0; + const created = typeof info.time?.created === "number" ? info.time.created : 0; + const lastUpdatedAt = Math.max(completed, created); + + const looksTerminal = + info.role === "assistant" && + typeof info.finish === "string" && + completed >= (startedAtMs || 0); + + if (looksTerminal) { + return { + terminal: true, + finish: info.finish, + completed, + text: extractPartsText(last.parts), + info, + }; + } + return { terminal: false, reachable: true, lastUpdatedAt, lastInfo: info }; + } catch (err) { + return { terminal: false, reachable: false, error: err.message }; + } +} + +/** + * Parse an ISO-ish timestamp that might be a number or string. Returns epoch ms, or 0. + */ +function toEpochMs(v) { + if (v == null) return 0; + if (typeof v === "number") return v < 1e12 ? v * 1000 : v; // tolerate seconds + const t = new Date(v).getTime(); + return Number.isFinite(t) ? t : 0; +} + +/** + * Attempt to auto-heal a single job. Mutates persistent state via upsertJob + * on transitions. 
Returns the up-to-date job record (healed or not). + * + * @param {string} workspace + * @param {object} job + * @param {object} [opts] + * @param {string} [opts.baseUrl] + * @param {boolean} [opts.dryRun] - when true, do not write state; return `{job, action, details}` + */ +export async function autoHealJob(workspace, job, opts = {}) { + const baseUrl = opts.baseUrl ?? DEFAULT_BASE_URL; + const dryRun = !!opts.dryRun; + const HEALABLE = new Set(["starting", "investigating", "running", "finalizing"]); + + if (!job || !job.opencodeSessionId) { + return { job, action: "skip", reason: "no opencodeSessionId" }; + } + if (!HEALABLE.has(job.status)) { + return { job, action: "skip", reason: `status=${job.status} not healable` }; + } + + const startedAtMs = + toEpochMs(job.startedAt) || + toEpochMs(job.createdAt) || + toEpochMs(job.updatedAt) || + 0; + + const probe = await probeSessionTerminal(baseUrl, job.opencodeSessionId, startedAtMs); + + if (probe.terminal) { + const completedIso = new Date(probe.completed).toISOString(); + const summary = (probe.text || "").slice(0, 500); + if (dryRun) { + return { + job, + action: "would-complete", + details: { + finish: probe.finish, + completedAt: completedIso, + textLen: (probe.text || "").length, + }, + }; + } + + // Persist the result payload to disk so handleResult can surface it. + try { + const dataFile = jobDataPath(workspace, job.id); + ensureDir(path.dirname(dataFile)); + const payload = { + rendered: probe.text, + summary, + healed: true, + finish: probe.finish, + }; + fs.writeFileSync(dataFile, JSON.stringify(payload, null, 2), "utf8"); + } catch (err) { + // Non-fatal: the status transition below is still useful. 
+ process.stderr.write(`auto-heal: failed to write data file for ${job.id}: ${err.message}\n`); + } + + upsertJob(workspace, { + id: job.id, + status: "completed", + completedAt: completedIso, + phase: "completed", + result: summary || job.result || null, + healed: true, + finish: probe.finish, + }); + return { + job: { ...job, status: "completed", completedAt: completedIso, result: summary, healed: true, finish: probe.finish }, + action: "healed-completed", + details: { finish: probe.finish, textLen: (probe.text || "").length }, + }; + } + + // Not terminal. Can we at least declare it dead? + if (!probe.reachable) { + return { job, action: "skip", reason: `server unreachable: ${probe.error}` }; + } + + const workerAlive = isProcessAlive(job.pid); + if (workerAlive) { + return { job, action: "skip", reason: "worker still alive" }; + } + + const lastUpdateMs = probe.lastUpdatedAt || toEpochMs(job.updatedAt); + const idleMs = lastUpdateMs ? Date.now() - lastUpdateMs : Infinity; + if (idleMs < STALE_IDLE_MS) { + return { job, action: "skip", reason: `idle ${Math.floor(idleMs / 1000)}s < ${STALE_IDLE_MS / 1000}s threshold` }; + } + + const idleSec = Number.isFinite(idleMs) ? Math.floor(idleMs / 1000) : -1; + const errMsg = `task-worker exited without completion; session last updated ${idleSec}s ago`; + + if (dryRun) { + return { job, action: "would-fail", details: { errorMessage: errMsg } }; + } + + upsertJob(workspace, { + id: job.id, + status: "failed", + completedAt: new Date().toISOString(), + errorMessage: errMsg, + healed: true, + }); + return { + job: { ...job, status: "failed", errorMessage: errMsg, healed: true }, + action: "healed-failed", + details: { errorMessage: errMsg }, + }; +} + +/** + * Auto-heal a list of jobs, returning the (possibly updated) jobs in the same + * order, plus a list of heal actions for reporting. 
+ * + * @param {string} workspace + * @param {object[]} jobs + * @param {object} [opts] + * @returns {Promise<{ jobs: object[], actions: object[] }>} + */ +export async function autoHealJobs(workspace, jobs, opts = {}) { + const actions = []; + const out = []; + for (const j of jobs ?? []) { + try { + const r = await autoHealJob(workspace, j, opts); + out.push(r.job ?? j); + if (r.action && r.action !== "skip") { + actions.push({ id: j.id, action: r.action, details: r.details }); + } + } catch (err) { + // Auto-heal must never crash the caller. + process.stderr.write(`auto-heal: ${j.id} errored: ${err.message}\n`); + out.push(j); + } + } + return { jobs: out, actions }; +} diff --git a/plugins/opencode/scripts/opencode-companion.mjs b/plugins/opencode/scripts/opencode-companion.mjs index 7d57a19..eff3b51 100644 --- a/plugins/opencode/scripts/opencode-companion.mjs +++ b/plugins/opencode/scripts/opencode-companion.mjs @@ -19,6 +19,7 @@ import { renderStatus, renderResult, renderReview, renderSetup } from "./lib/ren import { buildReviewPrompt, buildTaskPrompt } from "./lib/prompts.mjs"; import { getDiff, getStatus as getGitStatus } from "./lib/git.mjs"; import { readJson } from "./lib/fs.mjs"; +import { autoHealJob, autoHealJobs } from "./lib/auto-heal.mjs"; const PLUGIN_ROOT = process.env.CLAUDE_PLUGIN_ROOT || path.resolve(import.meta.dirname, ".."); @@ -38,6 +39,7 @@ const handlers = { status: handleStatus, result: handleResult, cancel: handleCancel, + heal: handleHeal, }; const handler = handlers[subcommand]; @@ -397,9 +399,20 @@ async function handleTaskResumeCandidate(argv) { const { options } = parseArgs(argv, { booleanOptions: ["json"] }); const workspace = await resolveWorkspace(); - const state = loadState(workspace); + let state = loadState(workspace); const sessionId = getClaudeSessionId(); + // Heal first so "latest completed" reflects session reality, not a stale + // "running" flag from a dead worker. + const healable = (state.jobs ?? 
[]).filter( + (j) => j.type === "task" && j.opencodeSessionId && + ["starting", "investigating", "running", "finalizing"].includes(j.status), + ); + if (healable.length > 0) { + await autoHealJobs(workspace, healable); + state = loadState(workspace); + } + const lastTask = state.jobs ?.filter((j) => j.type === "task" && j.opencodeSessionId) ?.filter((j) => j.status === "completed" || j.status === "running") @@ -429,8 +442,18 @@ async function handleStatus(argv) { }); const workspace = await resolveWorkspace(); - const state = loadState(workspace); + let state = loadState(workspace); const sessionId = getClaudeSessionId(); + // Auto-heal stuck jobs before building the snapshot so `status` never lies + // about completion. Safe on ECONNREFUSED (probe returns reachable:false). + const healable = (state.jobs ?? []).filter( + (j) => j.opencodeSessionId && + ["starting", "investigating", "running", "finalizing"].includes(j.status), + ); + if (healable.length > 0) { + await autoHealJobs(workspace, healable); + state = loadState(workspace); + } const jobs = state.jobs ?? []; const wantJson = !!options.json; // --all widens the snapshot filter to every session's jobs; without --all we @@ -486,7 +509,18 @@ async function handleResult(argv) { const ref = positional[0]; const workspace = await resolveWorkspace(); - const state = loadState(workspace); + let state = loadState(workspace); + // Auto-heal before resolving so that if the caller asks for the latest + // result, we don't return "no finished job" while a silently-completed + // session is waiting to be reconciled. + const healable = (state.jobs ?? []).filter( + (j) => j.opencodeSessionId && + ["starting", "investigating", "running", "finalizing"].includes(j.status), + ); + if (healable.length > 0) { + await autoHealJobs(workspace, healable); + state = loadState(workspace); + } const { job, ambiguous } = resolveResultJob(state.jobs ?? 
[], ref); @@ -557,6 +591,56 @@ async function handleCancel(argv) { console.log(`Canceled job: ${job.id}`); } +// ------------------------------------------------------------------ +// Heal (batch auto-reconcile stuck jobs) +// ------------------------------------------------------------------ + +async function handleHeal(argv) { + const { options } = parseArgs(argv ?? [], { + booleanOptions: ["json", "dry-run", "all"], + }); + + const workspace = await resolveWorkspace(); + const state = loadState(workspace); + const sessionId = getClaudeSessionId(); + const dryRun = !!options["dry-run"]; + + let jobs = state.jobs ?? []; + if (!options.all && sessionId) { + jobs = jobs.filter((j) => !j.sessionId || j.sessionId === sessionId); + } + + const healable = jobs.filter( + (j) => j.opencodeSessionId && + ["starting", "investigating", "running", "finalizing"].includes(j.status), + ); + + const { actions } = await autoHealJobs(workspace, healable, { dryRun }); + + if (options.json) { + console.log(JSON.stringify({ + workspaceRoot: workspace, + dryRun, + scanned: healable.length, + actions, + }, null, 2)); + return; + } + + console.log(`## Auto-Heal ${dryRun ? "(dry-run)" : ""}\n`); + console.log(`- Workspace: ${workspace}`); + console.log(`- Scanned stuck jobs: ${healable.length}`); + if (actions.length === 0) { + console.log(`- No actions needed.`); + return; + } + console.log(`- Actions: ${actions.length}\n`); + for (const a of actions) { + const det = a.details ? 
` — ${JSON.stringify(a.details)}` : ""; + console.log(`- **${a.id}**: ${a.action}${det}`); + } +} + // ------------------------------------------------------------------ // Helpers // ------------------------------------------------------------------ From deb34193c9573ce0df5d7f3dfa74e939c904596c Mon Sep 17 00:00:00 2001 From: suharvest Date: Sat, 18 Apr 2026 14:37:34 +0800 Subject: [PATCH 7/7] feat(server): idle + bash-stuck detectors in sendPrompt watcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Raise the absolute prompt timeout to 4h as a pure safety cap and move real stall detection into the watcher so long-but-alive tasks aren't killed by a fixed deadline. - Idle timeout (OPENCODE_IDLE_TIMEOUT_MS, default 15min): abort when the session shows no message/part/tool-output change for too long. - Bash-tool stuck detector: when the latest tool is a bash in status running but `opencode serve` has zero child processes for N consecutive polls (default 3 × 5s), abort. This catches the ask-permission deadlock (sst/opencode#14473) where the shell process already exited cleanly but tool state never flipped to completed. Gracefully degrades on Windows or when lsof/pgrep is unavailable. - Restructure fetch-vs-watcher race so a rejection from one side no longer cancels the other. The server's 5-min POST cap used to kill sendPrompt before the watcher could observe completion; now both settle independently and we prefer whichever succeeded. --- .../opencode/scripts/lib/opencode-server.mjs | 189 +++++++++++++++++- 1 file changed, 178 insertions(+), 11 deletions(-) diff --git a/plugins/opencode/scripts/lib/opencode-server.mjs b/plugins/opencode/scripts/lib/opencode-server.mjs index 1d32943..f0438f0 100644 --- a/plugins/opencode/scripts/lib/opencode-server.mjs +++ b/plugins/opencode/scripts/lib/opencode-server.mjs @@ -2,7 +2,12 @@ // Unlike codex-plugin-cc which uses JSON-RPC over stdin/stdout, // OpenCode exposes a REST API + SSE. 
This module wraps that API. -import { spawn } from "node:child_process"; +import { spawn, spawnSync } from "node:child_process"; + +// Re-export for spec-compliance / discoverability: probeSessionTerminal lives +// in auto-heal.mjs because it is tightly coupled to heal-decision logic, but +// conceptually it is a server probe. +export { probeSessionTerminal } from "./auto-heal.mjs"; const DEFAULT_PORT = 4096; const DEFAULT_HOST = "127.0.0.1"; @@ -10,9 +15,69 @@ const SERVER_START_TIMEOUT = 30_000; // Long-running tasks (e.g. engine builds, large refactors) can easily exceed // the old 5-10 min caps, causing `fetch failed` at a fixed deadline. Default -// to 30 min; override via env for even longer workloads. +// PROMPT_TIMEOUT_MS to 4 hours — absolute safety cap. Real stall detection +// lives in the watcher via IDLE_TIMEOUT_MS + pgrep child-process check. const REQUEST_TIMEOUT_MS = Number(process.env.OPENCODE_REQUEST_TIMEOUT_MS) || 1_800_000; -const PROMPT_TIMEOUT_MS = Number(process.env.OPENCODE_PROMPT_TIMEOUT_MS) || 1_800_000; +const PROMPT_TIMEOUT_MS = Number(process.env.OPENCODE_PROMPT_TIMEOUT_MS) || 14_400_000; +// How long a session may go without ANY activity signal before we assume it +// is stuck. Activity = new message, new parts, tool output growth, status +// change. Default 15 min — long enough for most silent-but-alive tasks. +const IDLE_TIMEOUT_MS = Number(process.env.OPENCODE_IDLE_TIMEOUT_MS) || 900_000; +// Bash-tool "no child process" consecutive-miss threshold. If the latest +// tool is a bash in status=running but opencode serve has zero child +// processes for N polls in a row, declare stuck. 3 × 5s = 15s grace. +const PGREP_MISS_THRESHOLD = Number(process.env.OPENCODE_PGREP_MISS_THRESHOLD) || 3; + +const IS_WINDOWS = process.platform === "win32"; + +/** + * Find the PID of `opencode serve` listening on `port`, if we can. + * Returns null on Windows or any detection failure (caller degrades gracefully). 
+ */ +function resolveServePid(port) { + if (IS_WINDOWS) return null; + try { + // macOS + Linux: lsof works the same way. Short timeout so we never block + // the watcher loop if the tool is slow/missing. + const r = spawnSync("lsof", ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN"], { + encoding: "utf8", + timeout: 2000, + }); + if (r.status !== 0 || !r.stdout) return null; + const lines = r.stdout.split("\n").slice(1).filter(Boolean); + for (const line of lines) { + const cols = line.trim().split(/\s+/); + const pid = Number(cols[1]); + if (Number.isInteger(pid) && pid > 0) return pid; + } + } catch { + // lsof missing or errored — degrade to no pgrep checks + } + return null; +} + +/** + * Count direct child processes of `pid`. Returns: + * -1 — feature unavailable (Windows, pgrep missing, etc.) — caller should skip check + * 0 — no children + * >0 — that many children + */ +function countChildren(pid) { + if (!pid || IS_WINDOWS) return -1; + try { + const r = spawnSync("pgrep", ["-P", String(pid)], { + encoding: "utf8", + timeout: 2000, + }); + if (r.error) return -1; + // pgrep exits 1 when no matches (empty stdout) — that's a real "zero", not a failure + const out = (r.stdout || "").trim(); + if (!out) return 0; + return out.split("\n").filter(Boolean).length; + } catch { + return -1; + } +} /** * Check if an OpenCode server is already running on the given port. @@ -179,6 +244,21 @@ export function createClient(baseUrl, opts = {}) { // Wait briefly so the new generation has a chance to start and we // don't latch onto a stale completed message from before this prompt. await new Promise((r) => setTimeout(r, MIN_POLL_DELAY_MS)); + + // Resolve the opencode serve PID once so we can check for child + // processes later. If this fails (Windows, no lsof, permissions) + // we silently skip the pgrep-based stuck detector — idle timeout + // still covers most cases. 
+ const urlObj = (() => { + try { return new URL(baseUrl); } catch { return null; } + })(); + const port = Number(urlObj?.port) || DEFAULT_PORT; + const opencodePid = resolveServePid(port); + + let prevSig = ""; + let lastActivityMs = Date.now(); + let pgrepMissCount = 0; + while (!ac.signal.aborted) { try { const params = new URLSearchParams({ limit: "1" }); @@ -190,8 +270,30 @@ export function createClient(baseUrl, opts = {}) { const arr = await r.json(); const last = Array.isArray(arr) ? arr[arr.length - 1] : null; const info = last?.info; - // Only treat assistant messages created *after* this prompt - // started as a completion signal for this call. + const parts = Array.isArray(last?.parts) ? last.parts : []; + // Most recent tool part — the one actually "running" if any. + let lastTool = null; + for (let i = parts.length - 1; i >= 0; i--) { + if (parts[i]?.type === "tool") { lastTool = parts[i]; break; } + } + + // Activity signature: any change here = progress was made. + const sig = JSON.stringify({ + mid: info?.id, + created: info?.time?.created, + completed: info?.time?.completed, + parts: parts.length, + tStatus: lastTool?.state?.status, + tOutLen: (lastTool?.state?.output || "").length, + }); + if (sig !== prevSig) { + lastActivityMs = Date.now(); + prevSig = sig; + pgrepMissCount = 0; + } + + // Completion signal: assistant message created after our prompt + // started, with a terminal `finish` field populated. if ( info && info.role === "assistant" && @@ -201,23 +303,88 @@ export function createClient(baseUrl, opts = {}) { ) { return { source: "watcher", data: last }; } + + // Bash-tool stuck detector: latest tool is bash in status=running + // but opencode serve has zero children for N consecutive polls. + // This is the signature of the "ask permission deadlock" bug + // (sst/opencode#14473): the shell process already exited cleanly + // but tool state never flipped to completed. 
+ if ( + opencodePid && + lastTool?.tool === "bash" && + lastTool?.state?.status === "running" + ) { + const n = countChildren(opencodePid); + if (n === 0) { + pgrepMissCount += 1; + if (pgrepMissCount >= PGREP_MISS_THRESHOLD) { + ac.abort( + new Error( + `bash tool stuck — opencode serve (pid ${opencodePid}) has no child for ${pgrepMissCount} polls while tool.status=running`, + ), + ); + throw new Error("bash tool stuck (no child)"); + } + } else if (n > 0) { + pgrepMissCount = 0; + } + // n === -1 → feature unavailable, don't count either way + } + + // Idle timeout: nothing happened in the session for too long. + // Covers all tool types (not just bash), including non-pgrep + // platforms (Windows). + const idleMs = Date.now() - lastActivityMs; + if (idleMs > IDLE_TIMEOUT_MS) { + ac.abort( + new Error( + `session idle ${Math.floor(idleMs / 1000)}s > ${IDLE_TIMEOUT_MS / 1000}s`, + ), + ); + throw new Error("session idle timeout"); + } } - } catch { - // Ignore transient poll errors; keep waiting. + } catch (err) { + // If we aborted above, propagate so the outer race sees a failure. + if (ac.signal.aborted) throw err; + // Otherwise it's a transient network/server blip — keep polling. } await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS)); } throw new Error("watcher aborted"); })(); + // Settle-wrap each so a single rejection doesn't lose the other side. + // Server-side 5-min POST cap means fetchPromise often rejects LONG + // before the agent is actually done; we must still wait on the watcher. + const wrap = (p, via) => + p.then( + (v) => ({ ok: true, via, data: v.data }), + (err) => ({ ok: false, via, err }), + ); + const runFetch = wrap(fetchPromise, "fetch"); + const runWatcher = wrap(watcherPromise, "watcher"); + try { - const winner = await Promise.race([fetchPromise, watcherPromise]); - // Whichever arrived first, cancel the other. 
+ const first = await Promise.race([runFetch, runWatcher]); + if (first.ok) { + ac.abort(); + fetchPromise.catch(() => {}); + watcherPromise.catch(() => {}); + return first.data; + } + // First to settle was a failure — the other promise may still succeed. + // Do NOT abort yet: in particular, the watcher needs to keep polling + // when the POST was killed by the server's 5-min cap but generation + // is still running. + const second = first.via === "fetch" ? await runWatcher : await runFetch; ac.abort(); - // Swallow the loser's rejection to avoid unhandled rejection noise. fetchPromise.catch(() => {}); watcherPromise.catch(() => {}); - return winner.data; + if (second.ok) return second.data; + // Both failed — surface the more informative error. Prefer the + // fetch error because it usually has the HTTP status/body. + throw first.via === "fetch" ? first.err : second.err; } finally { clearTimeout(timeoutId); }