From bd11ee58268cfcbe83d8b1bdb84f819037425a67 Mon Sep 17 00:00:00 2001
From: suharvest <suharvest@gmail.com>
Date: Sat, 18 Apr 2026 07:15:00 +0800
Subject: [PATCH 1/7] feat: auto-start Monitor on rescue dispatch via
 PostToolUse hook

When Claude dispatches an opencode rescue task (via Agent tool or direct
companion Bash call), this hook detects the new task-xxx id in the tool
response and injects a system-reminder instructing Claude to start a
persistent Monitor covering that id. On terminal states the Monitor
emits a READY line pointing to the companion result command so Claude
fetches the full payload and summarizes it for the user without needing
to be asked.

- New plugins/opencode/scripts/post-tool-use-monitor-hook.mjs
- hooks.json: register PostToolUse (matcher: Agent|Bash, timeout 5s)

Gracefully no-ops on non-matching tool output or missing companion markers.
---
 plugins/opencode/hooks/hooks.json             |  12 ++
 .../scripts/post-tool-use-monitor-hook.mjs    | 159 ++++++++++++++++++
 2 files changed, 171 insertions(+)
 create mode 100644 plugins/opencode/scripts/post-tool-use-monitor-hook.mjs

diff --git a/plugins/opencode/hooks/hooks.json b/plugins/opencode/hooks/hooks.json
index c76f993..8c193dd 100644
--- a/plugins/opencode/hooks/hooks.json
+++ b/plugins/opencode/hooks/hooks.json
@@ -33,6 +33,18 @@
           }
         ]
       }
+    ],
+    "PostToolUse": [
+      {
+        "matcher": "Agent|Bash",
+        "hooks": [
+          {
+            "type": "command",
+            "command": "node \"${CLAUDE_PLUGIN_ROOT}/scripts/post-tool-use-monitor-hook.mjs\"",
+            "timeout": 5
+          }
+        ]
+      }
     ]
   }
 }
diff --git a/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs
new file mode 100644
index 0000000..95d856d
--- /dev/null
+++ b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs
@@ -0,0 +1,159 @@
+#!/usr/bin/env node
+
+// PostToolUse hook: watches for rescue-task dispatch in tool responses and
+// injects a reminder that tells Claude to (a) start/refresh a Monitor
+// covering the new task id(s), and (b) fetch + summarize the companion
+// `result` payload when Monitor reports a terminal state.
+//
+// Why in a hook: the main Claude thread has no built-in way to observe
+// background codex/opencode tasks. Without this, dispatching a rescue is
+// fire-and-forget — the user has to ask for progress manually. The hook
+// makes every rescue dispatch automatically get monitored and reported
+// on, matching the UX of in-process subagents.
+
+import fs from "node:fs";
+import path from "node:path";
+import process from "node:process";
+import { fileURLToPath } from "node:url";
+
+function readHookInput() {
+  try {
+    const raw = fs.readFileSync(0, "utf8").trim();
+    if (!raw) return {};
+    return JSON.parse(raw);
+  } catch {
+    return {};
+  }
+}
+
+// Companion task ids look like `task-moNNNNNN-NNNNNN`.
+const TASK_ID_RE = /\btask-[a-z0-9]{6,}-[a-z0-9]{4,}\b/g;
+
+// Only react to responses that are unambiguously from the opencode companion,
+// to avoid false positives on arbitrary text containing a task-like token.
+const OPENCODE_MARKERS = [
+  /OpenCode task started/i,
+  /opencode-companion\.mjs/,
+  /opencode:opencode-rescue/,
+  /opencode rescue/i,
+];
+
+function extractResponseText(response) {
+  if (response == null) return "";
+  if (typeof response === "string") return response;
+  if (typeof response === "object") {
+    if (typeof response.result === "string") return response.result;
+    if (typeof response.content === "string") return response.content;
+    return JSON.stringify(response);
+  }
+  return String(response);
+}
+
+function resolveCompanionPath() {
+  const here = fileURLToPath(import.meta.url);
+  return path.join(path.dirname(here), "opencode-companion.mjs");
+}
+
+function buildMonitorScript(ids, companionPath) {
+  const quoted = ids.map((id) => `"${id}"`).join(" ");
+  // The poll loop:
+  //  - reads companion status JSON per id every 30s
+  //  - emits a single line whenever status/phase changes
+  //  - exits the loop as soon as every tracked id is terminal so the
+  //    Monitor process ends cleanly; the main thread's Monitor tool sees
+  //    exit and stops spawning events.
+  //
+  // stdout is the event stream — stay selective. On terminal states we
+  // emit a `READY: <cmd>` line so Claude knows the exact command to run
+  // to fetch the full result payload.
+  return [
+    "set -u",
+    `COMP=${JSON.stringify(companionPath)}`,
+    `IDS=(${quoted})`,
+    "declare -A prev",
+    'for id in "${IDS[@]}"; do prev[$id]=""; done',
+    "while true; do",
+    "  all_done=1",
+    '  for id in "${IDS[@]}"; do',
+    '    json=$(node "$COMP" status "$id" --json 2>/dev/null || printf "{}")',
+    "    st=$(printf '%s' \"$json\" | node -e 'let s=\"\";process.stdin.on(\"data\",d=>s+=d).on(\"end\",()=>{try{const j=JSON.parse(s);console.log((j.job?.status||\"unknown\")+\"/\"+(j.job?.phase||\"\"))}catch(e){console.log(\"parse-err/\")}}))')",
+    '    if [ "$st" != "${prev[$id]}" ]; then',
+    '      ts=$(date +%H:%M:%S)',
+    '      echo "[$ts] opencode $id: ${prev[$id]:-none} → $st"',
+    '      case "$st" in',
+    "        completed/*|failed/*|cancelled/*)",
+    `          echo "[$ts] READY opencode $id: run \\\"node $COMP result $id\\\" to fetch the full report"`,
+    "          ;;",
+    "      esac",
+    '      prev[$id]="$st"',
+    "    fi",
+    '    case "$st" in completed/*|failed/*|cancelled/*) ;; *) all_done=0 ;; esac',
+    "  done",
+    "  if [ $all_done -eq 1 ]; then",
+    "    echo \"[$(date +%H:%M:%S)] opencode: all tracked tasks terminal — exiting monitor\"",
+    "    break",
+    "  fi",
+    "  sleep 30",
+    "done",
+  ].join("\n");
+}
+
+function buildReminder(ids, script) {
+  const idList = ids.join(", ");
+  return [
+    "<opencode-rescue-auto-monitor>",
+    `You just dispatched ${ids.length} opencode rescue task(s): ${idList}.`,
+    "",
+    "Before you respond to the user, start a persistent Monitor so this session can observe task completion automatically. Use the `Monitor` tool:",
+    "",
+    '- description: "opencode rescue task state"',
+    "- persistent: true",
+    "- timeout_ms: 3600000",
+    "- command: the shell loop below, verbatim",
+    "",
+    "```bash",
+    script,
+    "```",
+    "",
+    "Behavior contract — follow this without being asked:",
+    "1. If a Monitor is already running in this session covering other task ids, stop it (`TaskStop`) and start a new one whose IDS array is the union of all currently-running rescue task ids. One Monitor per session.",
+    "2. When a Monitor event reports a task transitioning to `completed/…`, `failed/…`, or `cancelled/…`, immediately run the `READY` line's command (`node <companion> result <task-id>`) in Bash, read the output, and surface a short summary of what the rescue task did (or failed at) to the user. Do not wait for the user to ask.",
+    "3. If the user explicitly asked you to skip monitoring, do nothing here.",
+    "</opencode-rescue-auto-monitor>",
+  ].join("\n");
+}
+
+function main() {
+  const input = readHookInput();
+  const toolName = input.tool_name || "";
+  // Rescue dispatch happens through the Agent tool wrapper; main thread may
+  // also call companion directly via Bash. Ignore other tools.
+  if (toolName !== "Agent" && toolName !== "Bash") return;
+
+  const response = extractResponseText(input.tool_response);
+  if (!response) return;
+  if (!OPENCODE_MARKERS.some((r) => r.test(response))) return;
+
+  const ids = [...new Set(response.match(TASK_ID_RE) || [])];
+  if (ids.length === 0) return;
+
+  const companionPath = resolveCompanionPath();
+  const script = buildMonitorScript(ids, companionPath);
+  const additionalContext = buildReminder(ids, script);
+
+  process.stdout.write(
+    JSON.stringify({
+      hookSpecificOutput: {
+        hookEventName: "PostToolUse",
+        additionalContext,
+      },
+    }),
+  );
+}
+
+try {
+  main();
+} catch {
+  // Best-effort — never block tool use on hook failure.
+  process.exit(0);
+}

From 02160b84477928e3fa4327432ec997b5c5468283 Mon Sep 17 00:00:00 2001
From: suharvest <suharvest@gmail.com>
Date: Sat, 18 Apr 2026 07:26:58 +0800
Subject: [PATCH 2/7] fix(monitor): inline result fetch + fix parse-status JS
 syntax

On terminal state the Monitor script now calls companion result <id> and
emits the truncated summary inline (bounded by OPENCODE_MONITOR_RESULT_CHARS,
default 1500). Claude sees the result summary directly in the Monitor
event and no longer needs a follow-up Bash call.

Also fixes an extra trailing ) in the inline node -e expression that
would have caused the status parser to syntax-error at runtime.
---
 .../scripts/post-tool-use-monitor-hook.mjs    | 32 ++++++++++++-------
 1 file changed, 20 insertions(+), 12 deletions(-)

diff --git a/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs
index 95d856d..18a1766 100644
--- a/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs
+++ b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs
@@ -56,33 +56,41 @@ function resolveCompanionPath() {
 
 function buildMonitorScript(ids, companionPath) {
   const quoted = ids.map((id) => `"${id}"`).join(" ");
-  // The poll loop:
-  //  - reads companion status JSON per id every 30s
+  // The poll loop runs inside a Monitor child process:
+  //  - polls companion status JSON per id every 30s
   //  - emits a single line whenever status/phase changes
-  //  - exits the loop as soon as every tracked id is terminal so the
-  //    Monitor process ends cleanly; the main thread's Monitor tool sees
-  //    exit and stops spawning events.
-  //
-  // stdout is the event stream — stay selective. On terminal states we
-  // emit a `READY: <cmd>` line so Claude knows the exact command to run
-  // to fetch the full result payload.
+  //  - on terminal state, fetches `companion result <id>`, truncates to
+  //    a bounded size, and prints it as multi-line output; Monitor batches
+  //    lines within ~200ms into one notification, so the main thread
+  //    sees a single event carrying "task done + full summary" without
+  //    needing a follow-up tool call to fetch the result
+  //  - exits when every tracked id is terminal (Monitor process ends
+  //    cleanly, no runaway background poller)
   return [
     "set -u",
     `COMP=${JSON.stringify(companionPath)}`,
     `IDS=(${quoted})`,
+    "RESULT_MAX_CHARS=${OPENCODE_MONITOR_RESULT_CHARS:-1500}",
     "declare -A prev",
     'for id in "${IDS[@]}"; do prev[$id]=""; done',
     "while true; do",
     "  all_done=1",
     '  for id in "${IDS[@]}"; do',
     '    json=$(node "$COMP" status "$id" --json 2>/dev/null || printf "{}")',
-    "    st=$(printf '%s' \"$json\" | node -e 'let s=\"\";process.stdin.on(\"data\",d=>s+=d).on(\"end\",()=>{try{const j=JSON.parse(s);console.log((j.job?.status||\"unknown\")+\"/\"+(j.job?.phase||\"\"))}catch(e){console.log(\"parse-err/\")}}))')",
+    "    st=$(printf '%s' \"$json\" | node -e 'let s=\"\";process.stdin.on(\"data\",d=>s+=d).on(\"end\",()=>{try{const j=JSON.parse(s);console.log((j.job?.status||\"unknown\")+\"/\"+(j.job?.phase||\"\"))}catch(e){console.log(\"parse-err/\")}})')",
     '    if [ "$st" != "${prev[$id]}" ]; then',
     '      ts=$(date +%H:%M:%S)',
     '      echo "[$ts] opencode $id: ${prev[$id]:-none} → $st"',
     '      case "$st" in',
     "        completed/*|failed/*|cancelled/*)",
-    `          echo "[$ts] READY opencode $id: run \\\"node $COMP result $id\\\" to fetch the full report"`,
+    '          result=$(node "$COMP" result "$id" 2>/dev/null || true)',
+    "          # Truncate defensively so Monitor output stays bounded.",
+    '          summary=$(printf "%s" "$result" | head -c "$RESULT_MAX_CHARS")',
+    '          echo "[$ts] opencode $id TERMINAL=$st — result summary:"',
+    '          echo "--- result-begin $id ---"',
+    '          printf "%s" "$summary"',
+    '          echo ""',
+    '          echo "--- result-end $id ---"',
     "          ;;",
     "      esac",
     '      prev[$id]="$st"',
@@ -117,7 +125,7 @@ function buildReminder(ids, script) {
     "",
     "Behavior contract — follow this without being asked:",
     "1. If a Monitor is already running in this session covering other task ids, stop it (`TaskStop`) and start a new one whose IDS array is the union of all currently-running rescue task ids. One Monitor per session.",
-    "2. When a Monitor event reports a task transitioning to `completed/…`, `failed/…`, or `cancelled/…`, immediately run the `READY` line's command (`node <companion> result <task-id>`) in Bash, read the output, and surface a short summary of what the rescue task did (or failed at) to the user. Do not wait for the user to ask.",
+    "2. The Monitor script above already fetches `companion result <id>` and emits the summary inline on terminal state — each terminal event carries the full result block between `--- result-begin ---` and `--- result-end ---` markers. You do NOT need to run a follow-up Bash call to get the result; just read the Monitor event and surface a short summary of what the rescue task did (or failed at) to the user.",
     "3. If the user explicitly asked you to skip monitoring, do nothing here.",
     "</opencode-rescue-auto-monitor>",
   ].join("\n");

From 31febd0b042ee20e50cc7b90d55c185323936c29 Mon Sep 17 00:00:00 2001
From: suharvest <suharvest@gmail.com>
Date: Sat, 18 Apr 2026 07:31:23 +0800
Subject: [PATCH 3/7] fix(server): race sendPrompt against completion watcher +
 env-tunable timeouts

OpenCode server's POST /session/:id/message occasionally fails to close
its HTTP response after the session emits the terminal assistant message
(observed with glm-5 backend, opencode 1.4.x). Without this fix,
sendPrompt hangs until AbortSignal fires, leaving the companion job
stuck in 'investigating' status until the (previously 10 min) timeout.

Changes:
- Race the POST fetch against a /session/:id/message polling watcher;
  whichever returns first aborts the other. Watcher only accepts a
  completion whose info.time.completed >= prompt startedAt.
- Bump generic request() timeout and sendPrompt timeout to 30 min,
  configurable via OPENCODE_REQUEST_TIMEOUT_MS / OPENCODE_PROMPT_TIMEOUT_MS
  env vars.
- Completion poll interval configurable via OPENCODE_COMPLETION_POLL_MS
  (default 5s).
---
 .../opencode/scripts/lib/opencode-server.mjs  | 96 ++++++++++++++++---
 1 file changed, 84 insertions(+), 12 deletions(-)

diff --git a/plugins/opencode/scripts/lib/opencode-server.mjs b/plugins/opencode/scripts/lib/opencode-server.mjs
index f4192a8..1d32943 100644
--- a/plugins/opencode/scripts/lib/opencode-server.mjs
+++ b/plugins/opencode/scripts/lib/opencode-server.mjs
@@ -8,6 +8,12 @@ const DEFAULT_PORT = 4096;
 const DEFAULT_HOST = "127.0.0.1";
 const SERVER_START_TIMEOUT = 30_000;
 
+// Long-running tasks (e.g. engine builds, large refactors) can easily exceed
+// the old 5-10 min caps, causing `fetch failed` at a fixed deadline. Default
+// to 30 min; override via env for even longer workloads.
+const REQUEST_TIMEOUT_MS = Number(process.env.OPENCODE_REQUEST_TIMEOUT_MS) || 1_800_000;
+const PROMPT_TIMEOUT_MS = Number(process.env.OPENCODE_PROMPT_TIMEOUT_MS) || 1_800_000;
+
 /**
  * Check if an OpenCode server is already running on the given port.
  * @param {string} host
@@ -87,7 +93,7 @@ export function createClient(baseUrl, opts = {}) {
       method,
       headers,
       body: body != null ? JSON.stringify(body) : undefined,
-      signal: AbortSignal.timeout(300_000),
+      signal: AbortSignal.timeout(REQUEST_TIMEOUT_MS),
     });
     if (!res.ok) {
       const text = await res.text().catch(() => "");
@@ -127,6 +133,17 @@ export function createClient(baseUrl, opts = {}) {
     /**
      * Send a prompt (synchronous / streaming).
      * Returns the full response text from SSE stream.
+     *
+     * NOTE: OpenCode's POST /session/:id/message occasionally fails to close
+     * its HTTP response body after the session emits its terminal assistant
+     * message (observed against glm-5 backend, opencode 1.4.x). Relying on
+     * res.json() alone means the caller hangs until AbortSignal fires, which
+     * breaks downstream job-completion detection in the companion.
+     *
+     * Workaround: race the fetch against a session-completion watcher that
+     * polls GET /session/:id/message. When the latest assistant message has
+     * info.time.completed set AND finish !== undefined, the session is done;
+     * we abort the hanging fetch and synthesize the response from the poll.
      */
     sendPrompt: async (sessionId, promptText, opts = {}) => {
       const body = {
@@ -136,19 +153,74 @@ export function createClient(baseUrl, opts = {}) {
       if (opts.model) body.model = opts.model;
       if (opts.system) body.system = opts.system;
 
-      const res = await fetch(`${baseUrl}/session/${sessionId}/message`, {
-        method: "POST",
-        headers,
-        body: JSON.stringify(body),
-        signal: AbortSignal.timeout(600_000), // 10 min for long tasks
-      });
+      const ac = new AbortController();
+      const timeoutId = setTimeout(() => ac.abort(new Error("prompt timeout")), PROMPT_TIMEOUT_MS);
+      const startedAt = Date.now();
+      // Grace period so we don't mistake "session had no prior activity" for
+      // completion before the new prompt has even begun generating.
+      const MIN_POLL_DELAY_MS = 5_000;
+      const POLL_INTERVAL_MS = Number(process.env.OPENCODE_COMPLETION_POLL_MS) || 5_000;
 
-      if (!res.ok) {
-        const text = await res.text().catch(() => "");
-        throw new Error(`OpenCode prompt failed ${res.status}: ${text}`);
-      }
+      const fetchPromise = (async () => {
+        const res = await fetch(`${baseUrl}/session/${sessionId}/message`, {
+          method: "POST",
+          headers,
+          body: JSON.stringify(body),
+          signal: ac.signal,
+        });
+        if (!res.ok) {
+          const text = await res.text().catch(() => "");
+          throw new Error(`OpenCode prompt failed ${res.status}: ${text}`);
+        }
+        return { source: "fetch", data: await res.json() };
+      })();
 
-      return res.json();
+      const watcherPromise = (async () => {
+        // Wait briefly so the new generation has a chance to start and we
+        // don't latch onto a stale completed message from before this prompt.
+        await new Promise((r) => setTimeout(r, MIN_POLL_DELAY_MS));
+        while (!ac.signal.aborted) {
+          try {
+            const params = new URLSearchParams({ limit: "1" });
+            const r = await fetch(
+              `${baseUrl}/session/${sessionId}/message?${params.toString()}`,
+              { headers, signal: AbortSignal.timeout(10_000) },
+            );
+            if (r.ok) {
+              const arr = await r.json();
+              const last = Array.isArray(arr) ? arr[arr.length - 1] : null;
+              const info = last?.info;
+              // Only treat assistant messages created *after* this prompt
+              // started as a completion signal for this call.
+              if (
+                info &&
+                info.role === "assistant" &&
+                typeof info.time?.completed === "number" &&
+                info.time.completed >= startedAt &&
+                typeof info.finish === "string"
+              ) {
+                return { source: "watcher", data: last };
+              }
+            }
+          } catch {
+            // Ignore transient poll errors; keep waiting.
+          }
+          await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
+        }
+        throw new Error("watcher aborted");
+      })();
+
+      try {
+        const winner = await Promise.race([fetchPromise, watcherPromise]);
+        // Whichever arrived first, cancel the other.
+        ac.abort();
+        // Swallow the loser's rejection to avoid unhandled rejection noise.
+        fetchPromise.catch(() => {});
+        watcherPromise.catch(() => {});
+        return winner.data;
+      } finally {
+        clearTimeout(timeoutId);
+      }
     },
 
     /**

From 6d48b0fb274dbeca1f6a963a2a66a163187196b6 Mon Sep 17 00:00:00 2001
From: suharvest <suharvest@gmail.com>
Date: Sat, 18 Apr 2026 08:34:17 +0800
Subject: [PATCH 4/7] fix(status): honor --json flag and single-task lookup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

`status` handler was ignoring argv entirely — `--json` was silently
dropped and positional task ids were never matched. Tooling that piped
status through jq would choke on the markdown fallback with "parse
error: Invalid numeric literal".

Now:
- `status --json` emits a workspace snapshot as JSON ({workspaceRoot,
  running, latestFinished, recent})
- `status <tid> [--json]` looks up a single job by id/prefix. JSON
  form is {workspaceRoot, job: <enriched|null>} so callers can always
  read .job.status safely.
- `status --all` widens from session-scoped to all-sessions (useful
  for cross-session observers like monitor scripts)
- Markdown output unchanged for the no-flag case.
---
 .../opencode/scripts/opencode-companion.mjs   | 53 ++++++++++++++++++-
 1 file changed, 51 insertions(+), 2 deletions(-)

diff --git a/plugins/opencode/scripts/opencode-companion.mjs b/plugins/opencode/scripts/opencode-companion.mjs
index 48e526b..7d57a19 100644
--- a/plugins/opencode/scripts/opencode-companion.mjs
+++ b/plugins/opencode/scripts/opencode-companion.mjs
@@ -13,7 +13,7 @@ import { isOpencodeInstalled, getOpencodeVersion, spawnDetached } from "./lib/pr
 import { isServerRunning, ensureServer, createClient, connect } from "./lib/opencode-server.mjs";
 import { resolveWorkspace } from "./lib/workspace.mjs";
 import { loadState, updateState, upsertJob, generateJobId, jobDataPath } from "./lib/state.mjs";
-import { buildStatusSnapshot, resolveResultJob, resolveCancelableJob, enrichJob } from "./lib/job-control.mjs";
+import { buildStatusSnapshot, resolveResultJob, resolveCancelableJob, enrichJob, matchJobReference } from "./lib/job-control.mjs";
 import { createJobRecord, runTrackedJob, getClaudeSessionId } from "./lib/tracked-jobs.mjs";
 import { renderStatus, renderResult, renderReview, renderSetup } from "./lib/render.mjs";
 import { buildReviewPrompt, buildTaskPrompt } from "./lib/prompts.mjs";
@@ -424,11 +424,60 @@ async function handleTaskResumeCandidate(argv) {
 // ------------------------------------------------------------------
 
 async function handleStatus(argv) {
+  const { options, positional } = parseArgs(argv ?? [], {
+    booleanOptions: ["json", "all"],
+  });
+
   const workspace = await resolveWorkspace();
   const state = loadState(workspace);
   const sessionId = getClaudeSessionId();
+  const jobs = state.jobs ?? [];
+  const wantJson = !!options.json;
+  // --all widens the snapshot filter to every session's jobs; without --all we
+  // still filter to the current Claude session for the existing markdown UX.
+  const sessionFilter = options.all ? undefined : sessionId;
+  const ref = positional?.[0];
+
+  // Single-task query — `status <tid> [--json]`.
+  if (ref) {
+    const { job, ambiguous } = matchJobReference(jobs, ref);
+    if (ambiguous) {
+      if (wantJson) {
+        console.log(JSON.stringify({ workspaceRoot: workspace, job: null, error: "ambiguous" }));
+      } else {
+        console.error(`Ambiguous job reference "${ref}". Please provide a more specific ID prefix.`);
+      }
+      process.exit(ambiguous ? 2 : 0);
+      return;
+    }
+    if (wantJson) {
+      const enriched = job ? enrichJob(job, workspace) : null;
+      console.log(JSON.stringify({ workspaceRoot: workspace, job: enriched }));
+      return;
+    }
+    if (!job) {
+      console.log(`No job found for "${ref}" in workspace ${workspace}.`);
+      return;
+    }
+    console.log(renderStatus({ running: [], latestFinished: null, recent: [enrichJob(job, workspace)] }));
+    return;
+  }
+
+  const snapshot = buildStatusSnapshot(jobs, workspace, { sessionId: sessionFilter });
+
+  if (wantJson) {
+    // Machine-readable shape mirrors the single-task case so callers can treat
+    // both uniformly: a `.job` field is present for single-task, otherwise
+    // `.running`/`.recent` arrays describe the whole workspace snapshot.
+    console.log(JSON.stringify({
+      workspaceRoot: workspace,
+      running: snapshot.running,
+      latestFinished: snapshot.latestFinished,
+      recent: snapshot.recent,
+    }));
+    return;
+  }
 
-  const snapshot = buildStatusSnapshot(state.jobs ?? [], workspace, { sessionId });
   console.log(renderStatus(snapshot));
 }
 

From c378dfdfc4d6e0abf2a514fd16f3616994f74b77 Mon Sep 17 00:00:00 2001
From: suharvest <suharvest@gmail.com>
Date: Sat, 18 Apr 2026 09:27:33 +0800
Subject: [PATCH 5/7] feat(monitor): surface progress activity and heartbeat
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previously the Monitor script only emitted on status/phase transitions.
For long-running tasks that sit in 'running/investigating' for many
minutes, the user saw one initial event and then nothing — no way to
tell if the task was still alive.

Now:
- Include the last line of progressPreview in the state signature so
  any new log activity inside the task triggers an event (with elapsed
  time + latest log snippet)
- Emit a heartbeat every HEARTBEAT_POLLS ticks (default 10 = ~5min)
  with current status/phase/elapsed even when nothing has changed
- Heartbeat cadence tunable via OPENCODE_MONITOR_HEARTBEAT_POLLS env var
---
 .../scripts/post-tool-use-monitor-hook.mjs    | 69 ++++++++++++-------
 1 file changed, 43 insertions(+), 26 deletions(-)

diff --git a/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs
index 18a1766..77900a1 100644
--- a/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs
+++ b/plugins/opencode/scripts/post-tool-use-monitor-hook.mjs
@@ -58,44 +58,61 @@ function buildMonitorScript(ids, companionPath) {
   const quoted = ids.map((id) => `"${id}"`).join(" ");
   // The poll loop runs inside a Monitor child process:
   //  - polls companion status JSON per id every 30s
-  //  - emits a single line whenever status/phase changes
-  //  - on terminal state, fetches `companion result <id>`, truncates to
-  //    a bounded size, and prints it as multi-line output; Monitor batches
-  //    lines within ~200ms into one notification, so the main thread
-  //    sees a single event carrying "task done + full summary" without
-  //    needing a follow-up tool call to fetch the result
-  //  - exits when every tracked id is terminal (Monitor process ends
-  //    cleanly, no runaway background poller)
+  //  - emits an event when status/phase OR the latest progressPreview log
+  //    line changes, so long-running tasks surface intermediate activity
+  //  - emits a heartbeat every HEARTBEAT_POLLS ticks (default 10 = ~5min)
+  //    so the user sees signs of life even when nothing has changed
+  //  - on terminal state, fetches `companion result <id>`, truncates, and
+  //    prints a multi-line summary so the main thread gets a single batched
+  //    event carrying the full report
+  //  - exits when every tracked id is terminal
   return [
     "set -u",
     `COMP=${JSON.stringify(companionPath)}`,
     `IDS=(${quoted})`,
     "RESULT_MAX_CHARS=${OPENCODE_MONITOR_RESULT_CHARS:-1500}",
+    "HEARTBEAT_POLLS=${OPENCODE_MONITOR_HEARTBEAT_POLLS:-10}",
     "declare -A prev",
-    'for id in "${IDS[@]}"; do prev[$id]=""; done',
+    "declare -A hb",
+    'for id in "${IDS[@]}"; do prev[$id]=""; hb[$id]=0; done',
     "while true; do",
     "  all_done=1",
     '  for id in "${IDS[@]}"; do',
     '    json=$(node "$COMP" status "$id" --json 2>/dev/null || printf "{}")',
-    "    st=$(printf '%s' \"$json\" | node -e 'let s=\"\";process.stdin.on(\"data\",d=>s+=d).on(\"end\",()=>{try{const j=JSON.parse(s);console.log((j.job?.status||\"unknown\")+\"/\"+(j.job?.phase||\"\"))}catch(e){console.log(\"parse-err/\")}})')",
-    '    if [ "$st" != "${prev[$id]}" ]; then',
+    "    fields=$(printf '%s' \"$json\" | node -e 'let s=\"\";process.stdin.on(\"data\",d=>s+=d).on(\"end\",()=>{try{const j=JSON.parse(s);const jb=j.job||{};const prog=String(jb.progressPreview||\"\").split(\"\\n\").filter(Boolean);const last=(prog[prog.length-1]||\"\").replace(/[|\\r\\n]/g,\" \").slice(0,200);process.stdout.write([jb.status||\"unknown\",jb.phase||\"\",jb.elapsed||\"\",last].join(\"|\"))}catch(e){process.stdout.write(\"parse-err|||\")}})')",
+    "    IFS='|' read -r st phase elapsed last <<< \"$fields\"",
+    '    sig="${st}/${phase}|${last}"',
+    '    if [ "$sig" != "${prev[$id]}" ]; then',
     '      ts=$(date +%H:%M:%S)',
-    '      echo "[$ts] opencode $id: ${prev[$id]:-none} → $st"',
-    '      case "$st" in',
-    "        completed/*|failed/*|cancelled/*)",
-    '          result=$(node "$COMP" result "$id" 2>/dev/null || true)',
-    "          # Truncate defensively so Monitor output stays bounded.",
-    '          summary=$(printf "%s" "$result" | head -c "$RESULT_MAX_CHARS")',
-    '          echo "[$ts] opencode $id TERMINAL=$st — result summary:"',
-    '          echo "--- result-begin $id ---"',
-    '          printf "%s" "$summary"',
-    '          echo ""',
-    '          echo "--- result-end $id ---"',
-    "          ;;",
-    "      esac",
-    '      prev[$id]="$st"',
+    '      if [ -n "$last" ]; then',
+    '        echo "[$ts] opencode $id: $st/$phase (elapsed $elapsed) — $last"',
+    "      else",
+    '        echo "[$ts] opencode $id: $st/$phase (elapsed $elapsed)"',
+    "      fi",
+    '      prev[$id]="$sig"',
+    "      hb[$id]=0",
+    "    else",
+    '      hb[$id]=$(( ${hb[$id]} + 1 ))',
+    '      if [ "${hb[$id]}" -ge "$HEARTBEAT_POLLS" ]; then',
+    '        ts=$(date +%H:%M:%S)',
+    '        echo "[$ts] opencode $id: heartbeat — still $st/$phase (elapsed $elapsed)"',
+    "        hb[$id]=0",
+    "      fi",
     "    fi",
-    '    case "$st" in completed/*|failed/*|cancelled/*) ;; *) all_done=0 ;; esac',
+    '    case "$st" in',
+    "      completed|failed|cancelled)",
+    '        result=$(node "$COMP" result "$id" 2>/dev/null || true)',
+    "        # Truncate defensively so Monitor output stays bounded.",
+    '        summary=$(printf "%s" "$result" | head -c "$RESULT_MAX_CHARS")',
+    '        ts=$(date +%H:%M:%S)',
+    '        echo "[$ts] opencode $id TERMINAL=$st — result summary:"',
+    '        echo "--- result-begin $id ---"',
+    '        printf "%s" "$summary"',
+    '        echo ""',
+    '        echo "--- result-end $id ---"',
+    "        ;;",
+    '      *) all_done=0 ;;',
+    "    esac",
     "  done",
     "  if [ $all_done -eq 1 ]; then",
     "    echo \"[$(date +%H:%M:%S)] opencode: all tracked tasks terminal — exiting monitor\"",

From ec2757353e7e3a9a7b9f9e79a123b88013112566 Mon Sep 17 00:00:00 2001
From: suharvest <suharvest@gmail.com>
Date: Sat, 18 Apr 2026 14:37:20 +0800
Subject: [PATCH 6/7] feat(auto-heal): reconcile stuck jobs via session
 terminal probe

Long-running background tasks occasionally get stuck in investigating
status after the OpenCode session has finished server-side (POST body
never closes, watcher misses the terminal signal, or task-worker dies).

- New lib/auto-heal.mjs probes GET /session/:id/message?limit=1 and
  transitions the local job to completed when the last assistant
  message has info.finish set and info.time.completed >= job.startedAt.
  If the task-worker PID is dead and the session is silent >60s, the
  job is marked failed with a clear reason.
- status, result, and task-resume-candidate run a silent heal pass
  before reading state so they never report a false "running" for a
  session that is actually complete.
- New `companion.mjs heal` subcommand scans and reconciles in bulk,
  with --dry-run / --json / --all flags.
- Heal is a no-op when the server is unreachable, so offline use of
  status/result keeps working.
---
 README.md                                     |  27 ++
 plugins/opencode/scripts/lib/auto-heal.mjs    | 283 ++++++++++++++++++
 .../opencode/scripts/opencode-companion.mjs   |  90 +++++-
 3 files changed, 397 insertions(+), 3 deletions(-)
 create mode 100644 plugins/opencode/scripts/lib/auto-heal.mjs

diff --git a/README.md b/README.md
index 8c14ad3..a18b686 100644
--- a/README.md
+++ b/README.md
@@ -101,6 +101,33 @@ To check your configured providers:
 
 When enabled via `/opencode:setup --enable-review-gate`, a Stop hook runs a targeted OpenCode review on Claude's response. If issues are found, the stop is blocked so Claude can address them first. Warning: can create long-running loops and drain usage limits.
 
+## Job Auto-Heal
+
+Long-running tasks spawned via `/opencode:task --background` occasionally get
+stuck in `investigating` status even after the OpenCode session has finished
+server-side — typically because `POST /session/:id/message` fails to close its
+HTTP body, the task-worker is killed, or the companion's watcher misses the
+terminal signal.
+
+The companion now reconciles this automatically:
+
+- `companion.mjs status`, `result`, and `task-resume-candidate` run a silent
+  auto-heal pass before they read state, so they never report a false
+  "running" state for a session that is actually complete.
+- `companion.mjs heal` scans for stuck jobs and reconciles them in bulk. Pass
+  `--dry-run` to preview, `--json` for machine-readable output, and `--all`
+  to include jobs from other Claude sessions.
+
+Each heal check queries `GET /session/:id/message?limit=1`. If the last
+assistant message has `info.finish` set and `info.time.completed >= job.startedAt`,
+the job is transitioned to `completed` and the message text is persisted to
+the job data file. If the task-worker PID is dead and the session has been
+silent for >60 s, the job is transitioned to `failed` with a clear reason.
+
+If the OpenCode server is unreachable, auto-heal is a no-op — status/result
+commands still work, they just can't move stuck jobs forward until the server
+comes back.
+
 ## Troubleshooting
 
 <details>
diff --git a/plugins/opencode/scripts/lib/auto-heal.mjs b/plugins/opencode/scripts/lib/auto-heal.mjs
new file mode 100644
index 0000000..88a555b
--- /dev/null
+++ b/plugins/opencode/scripts/lib/auto-heal.mjs
@@ -0,0 +1,283 @@
+// Session-level auto-heal for tracked jobs.
+//
+// Background: task-worker subprocesses wrap `client.sendPrompt(sid, ...)` in
+// runTrackedJob so that on successful return the job flips status→completed
+// and the response text is persisted to jobDataPath. But sendPrompt can hang
+// or the worker can be killed before that return happens — even though the
+// OpenCode session itself completed cleanly server-side. The job then stays
+// in a non-terminal state ("investigating"/"running") forever and downstream
+// Monitor scripts never see the true finish.
+//
+// This module provides a best-effort reconciliation pass: given a job with
+// an `opencodeSessionId`, query the OpenCode server for the last assistant
+// message in that session. If it looks terminal (info.finish set and
+// completed >= job.startedAt), upsert the job as completed and persist the
+// text. If the worker process is gone and the session has been idle long
+// enough, mark as failed with a clear error message.
+//
+// All functions are no-ops (or log to stderr and return the original job)
+// when the server is unreachable, so callers can sprinkle autoHealJob at
+// the top of status-reading paths without wrapping in try/catch themselves.
+
+import fs from "node:fs";
+import path from "node:path";
+
+import { ensureDir } from "./fs.mjs";
+import { upsertJob, jobDataPath } from "./state.mjs";
+
+const DEFAULT_BASE_URL = "http://127.0.0.1:4096";
+// A worker/session can be legitimately silent for a while (big model thinking,
+// slow tool) — only declare it dead after >60s of no session activity AND no
+// live task-worker process. 60s matches the spec.
+const STALE_IDLE_MS = 60_000;
+
+function buildHeaders() {
+  const headers = { "Content-Type": "application/json" };
+  if (process.env.OPENCODE_SERVER_PASSWORD) {
+    const user = process.env.OPENCODE_SERVER_USERNAME ?? "opencode";
+    const cred = Buffer.from(`${user}:${process.env.OPENCODE_SERVER_PASSWORD}`).toString("base64");
+    headers["Authorization"] = `Basic ${cred}`;
+  }
+  return headers;
+}
+
+/**
+ * True if the given PID is currently alive. Treats missing/invalid PID as dead.
+ * @param {number|undefined|null} pid
+ * @returns {boolean}
+ */
+export function isProcessAlive(pid) {
+  if (!pid || !Number.isInteger(pid) || pid <= 0) return false;
+  try {
+    // Signal 0 is a permission/existence probe — no signal delivered.
+    process.kill(pid, 0);
+    return true;
+  } catch (err) {
+    // ESRCH = no such process. EPERM = process exists but we can't signal it
+    // (still alive from our perspective).
+    return err.code === "EPERM";
+  }
+}
+
+/**
+ * Extract visible text from an OpenCode message `parts` array.
+ * @param {Array|undefined} parts
+ * @returns {string}
+ */
+function extractPartsText(parts) {
+  if (!Array.isArray(parts)) return "";
+  return parts
+    .filter((p) => p?.type === "text" && typeof p.text === "string")
+    .map((p) => p.text)
+    .join("\n");
+}
+
+/**
+ * Query the opencode server for the terminal state of a session.
+ *
+ * Returns:
+ *   { terminal: true,  finish, completed, text, info } when the last assistant
+ *     message has info.time.completed >= startedAt AND typeof info.finish === 'string'.
+ *   { terminal: false, reachable: true, lastUpdatedAt, lastInfo }           when session exists but no terminal marker.
+ *   { terminal: false, reachable: false, error }                             when server unreachable / errored.
+ *
+ * @param {string} baseUrl
+ * @param {string} sessionId
+ * @param {number} startedAtMs - epoch ms; only treat completions >= this as ours
+ * @param {object} [headers]
+ */
+export async function probeSessionTerminal(baseUrl, sessionId, startedAtMs, headers) {
+  const h = headers ?? buildHeaders();
+  try {
+    // limit=1 → last message only. On glm-5 / opencode 1.4.x this returns
+    // an array of { info, parts } objects.
+    const res = await fetch(`${baseUrl}/session/${sessionId}/message?limit=1`, {
+      method: "GET",
+      headers: h,
+      signal: AbortSignal.timeout(10_000),
+    });
+    if (!res.ok) {
+      return { terminal: false, reachable: true, error: `HTTP ${res.status}` };
+    }
+    const arr = await res.json();
+    const last = Array.isArray(arr) ? arr[arr.length - 1] : null;
+    const info = last?.info;
+    if (!info) {
+      return { terminal: false, reachable: true, lastUpdatedAt: 0, lastInfo: null };
+    }
+
+    const completed = typeof info.time?.completed === "number" ? info.time.completed : 0;
+    const created = typeof info.time?.created === "number" ? info.time.created : 0;
+    const lastUpdatedAt = Math.max(completed, created);
+
+    const looksTerminal =
+      info.role === "assistant" &&
+      typeof info.finish === "string" &&
+      completed >= (startedAtMs || 0);
+
+    if (looksTerminal) {
+      return {
+        terminal: true,
+        finish: info.finish,
+        completed,
+        text: extractPartsText(last.parts),
+        info,
+      };
+    }
+    return { terminal: false, reachable: true, lastUpdatedAt, lastInfo: info };
+  } catch (err) {
+    return { terminal: false, reachable: false, error: err.message };
+  }
+}
+
+/**
+ * Parse an ISO-ish timestamp that might be a number or string. Returns epoch ms, or 0.
+ */
+function toEpochMs(v) {
+  if (v == null) return 0;
+  if (typeof v === "number") return v < 1e12 ? v * 1000 : v; // tolerate seconds
+  const t = new Date(v).getTime();
+  return Number.isFinite(t) ? t : 0;
+}
+
+/**
+ * Attempt to auto-heal a single job. Mutates persistent state via upsertJob
+ * on transitions. Returns the up-to-date job record (healed or not).
+ *
+ * @param {string} workspace
+ * @param {object} job
+ * @param {object} [opts]
+ * @param {string} [opts.baseUrl]
+ * @param {boolean} [opts.dryRun] - when true, do not write state; return `{job, action, details}`
+ */
+export async function autoHealJob(workspace, job, opts = {}) {
+  const baseUrl = opts.baseUrl ?? DEFAULT_BASE_URL;
+  const dryRun = !!opts.dryRun;
+  const HEALABLE = new Set(["starting", "investigating", "running", "finalizing"]);
+
+  if (!job || !job.opencodeSessionId) {
+    return { job, action: "skip", reason: "no opencodeSessionId" };
+  }
+  if (!HEALABLE.has(job.status)) {
+    return { job, action: "skip", reason: `status=${job.status} not healable` };
+  }
+
+  const startedAtMs =
+    toEpochMs(job.startedAt) ||
+    toEpochMs(job.createdAt) ||
+    toEpochMs(job.updatedAt) ||
+    0;
+
+  const probe = await probeSessionTerminal(baseUrl, job.opencodeSessionId, startedAtMs);
+
+  if (probe.terminal) {
+    const completedIso = new Date(probe.completed).toISOString();
+    const summary = (probe.text || "").slice(0, 500);
+    if (dryRun) {
+      return {
+        job,
+        action: "would-complete",
+        details: {
+          finish: probe.finish,
+          completedAt: completedIso,
+          textLen: (probe.text || "").length,
+        },
+      };
+    }
+
+    // Persist the result payload to disk so handleResult can surface it.
+    try {
+      const dataFile = jobDataPath(workspace, job.id);
+      ensureDir(path.dirname(dataFile));
+      const payload = {
+        rendered: probe.text,
+        summary,
+        healed: true,
+        finish: probe.finish,
+      };
+      fs.writeFileSync(dataFile, JSON.stringify(payload, null, 2), "utf8");
+    } catch (err) {
+      // Non-fatal: the status transition below is still useful.
+      process.stderr.write(`auto-heal: failed to write data file for ${job.id}: ${err.message}\n`);
+    }
+
+    upsertJob(workspace, {
+      id: job.id,
+      status: "completed",
+      completedAt: completedIso,
+      phase: "completed",
+      result: summary || job.result || null,
+      healed: true,
+      finish: probe.finish,
+    });
+    return {
+      job: { ...job, status: "completed", completedAt: completedIso, result: summary, healed: true, finish: probe.finish },
+      action: "healed-completed",
+      details: { finish: probe.finish, textLen: (probe.text || "").length },
+    };
+  }
+
+  // Not terminal. Can we at least declare it dead?
+  if (!probe.reachable) {
+    return { job, action: "skip", reason: `server unreachable: ${probe.error}` };
+  }
+
+  const workerAlive = isProcessAlive(job.pid);
+  if (workerAlive) {
+    return { job, action: "skip", reason: "worker still alive" };
+  }
+
+  const lastUpdateMs = probe.lastUpdatedAt || toEpochMs(job.updatedAt);
+  const idleMs = lastUpdateMs ? Date.now() - lastUpdateMs : Infinity;
+  if (idleMs < STALE_IDLE_MS) {
+    return { job, action: "skip", reason: `idle ${Math.floor(idleMs / 1000)}s < ${STALE_IDLE_MS / 1000}s threshold` };
+  }
+
+  const idleSec = Number.isFinite(idleMs) ? Math.floor(idleMs / 1000) : -1;
+  const errMsg = `task-worker exited without completion; session last updated ${idleSec}s ago`;
+
+  if (dryRun) {
+    return { job, action: "would-fail", details: { errorMessage: errMsg } };
+  }
+
+  upsertJob(workspace, {
+    id: job.id,
+    status: "failed",
+    completedAt: new Date().toISOString(),
+    errorMessage: errMsg,
+    healed: true,
+  });
+  return {
+    job: { ...job, status: "failed", errorMessage: errMsg, healed: true },
+    action: "healed-failed",
+    details: { errorMessage: errMsg },
+  };
+}
+
+/**
+ * Auto-heal a list of jobs, returning the (possibly updated) jobs in the same
+ * order, plus a list of heal actions for reporting.
+ *
+ * @param {string} workspace
+ * @param {object[]} jobs
+ * @param {object} [opts]
+ * @returns {Promise<{ jobs: object[], actions: object[] }>}
+ */
+export async function autoHealJobs(workspace, jobs, opts = {}) {
+  const actions = [];
+  const out = [];
+  for (const j of jobs ?? []) {
+    try {
+      const r = await autoHealJob(workspace, j, opts);
+      out.push(r.job ?? j);
+      if (r.action && r.action !== "skip") {
+        actions.push({ id: j.id, action: r.action, details: r.details });
+      }
+    } catch (err) {
+      // Auto-heal must never crash the caller.
+      process.stderr.write(`auto-heal: ${j.id} errored: ${err.message}\n`);
+      out.push(j);
+    }
+  }
+  return { jobs: out, actions };
+}
diff --git a/plugins/opencode/scripts/opencode-companion.mjs b/plugins/opencode/scripts/opencode-companion.mjs
index 7d57a19..eff3b51 100644
--- a/plugins/opencode/scripts/opencode-companion.mjs
+++ b/plugins/opencode/scripts/opencode-companion.mjs
@@ -19,6 +19,7 @@ import { renderStatus, renderResult, renderReview, renderSetup } from "./lib/ren
 import { buildReviewPrompt, buildTaskPrompt } from "./lib/prompts.mjs";
 import { getDiff, getStatus as getGitStatus } from "./lib/git.mjs";
 import { readJson } from "./lib/fs.mjs";
+import { autoHealJob, autoHealJobs } from "./lib/auto-heal.mjs";
 
 const PLUGIN_ROOT = process.env.CLAUDE_PLUGIN_ROOT || path.resolve(import.meta.dirname, "..");
 
@@ -38,6 +39,7 @@ const handlers = {
   status: handleStatus,
   result: handleResult,
   cancel: handleCancel,
+  heal: handleHeal,
 };
 
 const handler = handlers[subcommand];
@@ -397,9 +399,20 @@ async function handleTaskResumeCandidate(argv) {
   const { options } = parseArgs(argv, { booleanOptions: ["json"] });
 
   const workspace = await resolveWorkspace();
-  const state = loadState(workspace);
+  let state = loadState(workspace);
   const sessionId = getClaudeSessionId();
 
+  // Heal first so "latest completed" reflects session reality, not a stale
+  // "running" flag from a dead worker.
+  const healable = (state.jobs ?? []).filter(
+    (j) => j.type === "task" && j.opencodeSessionId &&
+      ["starting", "investigating", "running", "finalizing"].includes(j.status),
+  );
+  if (healable.length > 0) {
+    await autoHealJobs(workspace, healable);
+    state = loadState(workspace);
+  }
+
   const lastTask = state.jobs
     ?.filter((j) => j.type === "task" && j.opencodeSessionId)
     ?.filter((j) => j.status === "completed" || j.status === "running")
@@ -429,8 +442,18 @@ async function handleStatus(argv) {
   });
 
   const workspace = await resolveWorkspace();
-  const state = loadState(workspace);
+  let state = loadState(workspace);
   const sessionId = getClaudeSessionId();
+  // Auto-heal stuck jobs before building the snapshot so `status` never lies
+  // about completion. Safe on ECONNREFUSED (probe returns reachable:false).
+  const healable = (state.jobs ?? []).filter(
+    (j) => j.opencodeSessionId &&
+      ["starting", "investigating", "running", "finalizing"].includes(j.status),
+  );
+  if (healable.length > 0) {
+    await autoHealJobs(workspace, healable);
+    state = loadState(workspace);
+  }
   const jobs = state.jobs ?? [];
   const wantJson = !!options.json;
   // --all widens the snapshot filter to every session's jobs; without --all we
@@ -486,7 +509,18 @@ async function handleResult(argv) {
   const ref = positional[0];
 
   const workspace = await resolveWorkspace();
-  const state = loadState(workspace);
+  let state = loadState(workspace);
+  // Auto-heal before resolving so that if the caller asks for the latest
+  // result, we don't return "no finished job" while a silently-completed
+  // session is waiting to be reconciled.
+  const healable = (state.jobs ?? []).filter(
+    (j) => j.opencodeSessionId &&
+      ["starting", "investigating", "running", "finalizing"].includes(j.status),
+  );
+  if (healable.length > 0) {
+    await autoHealJobs(workspace, healable);
+    state = loadState(workspace);
+  }
 
   const { job, ambiguous } = resolveResultJob(state.jobs ?? [], ref);
 
@@ -557,6 +591,56 @@ async function handleCancel(argv) {
   console.log(`Canceled job: ${job.id}`);
 }
 
+// ------------------------------------------------------------------
+// Heal (batch auto-reconcile stuck jobs)
+// ------------------------------------------------------------------
+
+async function handleHeal(argv) {
+  const { options } = parseArgs(argv ?? [], {
+    booleanOptions: ["json", "dry-run", "all"],
+  });
+  // Load state and the Claude session id; the id decides which jobs are in scope.
+  const workspace = await resolveWorkspace();
+  const state = loadState(workspace);
+  const sessionId = getClaudeSessionId();
+  const dryRun = !!options["dry-run"];
+  // Without --all (and when a session id is known) keep only jobs that have no owner or belong to this session.
+  let jobs = state.jobs ?? [];
+  if (!options.all && sessionId) {
+    jobs = jobs.filter((j) => !j.sessionId || j.sessionId === sessionId);
+  }
+  // Candidates: jobs with an OpenCode session id that are still in a non-terminal status.
+  const healable = jobs.filter(
+    (j) => j.opencodeSessionId &&
+      ["starting", "investigating", "running", "finalizing"].includes(j.status),
+  );
+  // dryRun is forwarded, so in that mode the actions describe what would happen.
+  const { actions } = await autoHealJobs(workspace, healable, { dryRun });
+  // --json: print the machine-readable report and stop.
+  if (options.json) {
+    console.log(JSON.stringify({
+      workspaceRoot: workspace,
+      dryRun,
+      scanned: healable.length,
+      actions,
+    }, null, 2));
+    return;
+  }
+  // Otherwise print a Markdown summary, one bullet per action.
+  console.log(`## Auto-Heal ${dryRun ? "(dry-run)" : ""}\n`);
+  console.log(`- Workspace: ${workspace}`);
+  console.log(`- Scanned stuck jobs: ${healable.length}`);
+  if (actions.length === 0) {
+    console.log(`- No actions needed.`);
+    return;
+  }
+  console.log(`- Actions: ${actions.length}\n`);
+  for (const a of actions) {
+    const det = a.details ? ` — ${JSON.stringify(a.details)}` : "";
+    console.log(`- **${a.id}**: ${a.action}${det}`);
+  }
+}
+
 // ------------------------------------------------------------------
 // Helpers
 // ------------------------------------------------------------------

From deb34193c9573ce0df5d7f3dfa74e939c904596c Mon Sep 17 00:00:00 2001
From: suharvest <suharvest@gmail.com>
Date: Sat, 18 Apr 2026 14:37:34 +0800
Subject: [PATCH 7/7] feat(server): idle + bash-stuck detectors in sendPrompt
 watcher
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Raise the absolute prompt timeout to 4h as a pure safety cap and move
real stall detection into the watcher so long-but-alive tasks aren't
killed by a fixed deadline.

- Idle timeout (OPENCODE_IDLE_TIMEOUT_MS, default 15min): abort when
  the session shows no message/part/tool-output change for too long.
- Bash-tool stuck detector: when the latest tool is a bash in status
  running but `opencode serve` has zero child processes for N
  consecutive polls (default 3 × 5s), abort. This catches the
  ask-permission deadlock (sst/opencode#14473) where the shell process
  already exited cleanly but tool state never flipped to completed.
  Gracefully degrades on Windows or when lsof/pgrep is unavailable.
- Restructure fetch-vs-watcher race so a rejection from one side no
  longer cancels the other. The server's 5-min POST cap used to kill
  sendPrompt before the watcher could observe completion; now both
  settle independently and we prefer whichever succeeded.
---
 .../opencode/scripts/lib/opencode-server.mjs  | 189 +++++++++++++++++-
 1 file changed, 178 insertions(+), 11 deletions(-)

diff --git a/plugins/opencode/scripts/lib/opencode-server.mjs b/plugins/opencode/scripts/lib/opencode-server.mjs
index 1d32943..f0438f0 100644
--- a/plugins/opencode/scripts/lib/opencode-server.mjs
+++ b/plugins/opencode/scripts/lib/opencode-server.mjs
@@ -2,7 +2,12 @@
 // Unlike codex-plugin-cc which uses JSON-RPC over stdin/stdout,
 // OpenCode exposes a REST API + SSE. This module wraps that API.
 
-import { spawn } from "node:child_process";
+import { spawn, spawnSync } from "node:child_process";
+
+// Re-export for spec-compliance / discoverability: probeSessionTerminal lives
+// in auto-heal.mjs because it is tightly coupled to heal-decision logic, but
+// conceptually it is a server probe.
+export { probeSessionTerminal } from "./auto-heal.mjs";
 
 const DEFAULT_PORT = 4096;
 const DEFAULT_HOST = "127.0.0.1";
@@ -10,9 +15,69 @@ const SERVER_START_TIMEOUT = 30_000;
 
 // Long-running tasks (e.g. engine builds, large refactors) can easily exceed
 // the old 5-10 min caps, causing `fetch failed` at a fixed deadline. Default
-// to 30 min; override via env for even longer workloads.
+// REQUEST_TIMEOUT_MS to 30 min and PROMPT_TIMEOUT_MS to 4 hours (an absolute
+// safety cap); real stall detection lives in the watcher (idle timeout + pgrep).
 const REQUEST_TIMEOUT_MS = Number(process.env.OPENCODE_REQUEST_TIMEOUT_MS) || 1_800_000;
-const PROMPT_TIMEOUT_MS = Number(process.env.OPENCODE_PROMPT_TIMEOUT_MS) || 1_800_000;
+const PROMPT_TIMEOUT_MS = Number(process.env.OPENCODE_PROMPT_TIMEOUT_MS) || 14_400_000;
+// How long a session may go without ANY activity signal before we assume it
+// is stuck. Activity = new message, new parts, tool output growth, status
+// change. Default 15 min — long enough for most silent-but-alive tasks.
+const IDLE_TIMEOUT_MS = Number(process.env.OPENCODE_IDLE_TIMEOUT_MS) || 900_000;
+// Bash-tool "no child process" consecutive-miss threshold. If the latest
+// tool is a bash in status=running but opencode serve has zero child
+// processes for N polls in a row, declare stuck. 3 × 5s = 15s grace.
+const PGREP_MISS_THRESHOLD = Number(process.env.OPENCODE_PGREP_MISS_THRESHOLD) || 3;
+
+const IS_WINDOWS = process.platform === "win32";
+
+/**
+ * Find the PID of `opencode serve` listening on `port`, if we can.
+ * Returns null on Windows or any detection failure (caller degrades gracefully).
+ */
+function resolveServePid(port) {
+  if (IS_WINDOWS) return null;
+  try {
+    // macOS + Linux: lsof works the same way. Short timeout so we never block
+    // the watcher loop if the tool is slow/missing.
+    const r = spawnSync("lsof", ["-nP", `-iTCP:${port}`, "-sTCP:LISTEN"], {
+      encoding: "utf8",
+      timeout: 2000,
+    });
+    if (r.status !== 0 || !r.stdout) return null;
+    const lines = r.stdout.split("\n").slice(1).filter(Boolean);
+    for (const line of lines) {
+      const cols = line.trim().split(/\s+/);
+      const pid = Number(cols[1]);
+      if (Number.isInteger(pid) && pid > 0) return pid;
+    }
+  } catch {
+    // lsof missing or errored — degrade to no pgrep checks
+  }
+  return null;
+}
+
+/**
+ * Count direct child processes of `pid`. Returns:
+ *   -1 — feature unavailable (Windows, pgrep missing, etc.) — caller should skip check
+ *    0 — no children
+ *   >0 — that many children
+ */
+function countChildren(pid) {
+  if (!pid || IS_WINDOWS) return -1;
+  try {
+    const r = spawnSync("pgrep", ["-P", String(pid)], {
+      encoding: "utf8",
+      timeout: 2000,
+    });
+    if (r.error) return -1;
+    // pgrep exits 1 when no matches (empty stdout) — that's a real "zero", not a failure
+    const out = (r.stdout || "").trim();
+    if (!out) return 0;
+    return out.split("\n").filter(Boolean).length;
+  } catch {
+    return -1;
+  }
+}
 
 /**
  * Check if an OpenCode server is already running on the given port.
@@ -179,6 +244,21 @@ export function createClient(baseUrl, opts = {}) {
         // Wait briefly so the new generation has a chance to start and we
         // don't latch onto a stale completed message from before this prompt.
         await new Promise((r) => setTimeout(r, MIN_POLL_DELAY_MS));
+
+        // Resolve the opencode serve PID once so we can check for child
+        // processes later. If this fails (Windows, no lsof, permissions)
+        // we silently skip the pgrep-based stuck detector — idle timeout
+        // still covers most cases.
+        const urlObj = (() => {
+          try { return new URL(baseUrl); } catch { return null; }
+        })();
+        const port = Number(urlObj?.port) || DEFAULT_PORT;
+        const opencodePid = resolveServePid(port);
+
+        let prevSig = "";
+        let lastActivityMs = Date.now();
+        let pgrepMissCount = 0;
+
         while (!ac.signal.aborted) {
           try {
             const params = new URLSearchParams({ limit: "1" });
@@ -190,8 +270,30 @@ export function createClient(baseUrl, opts = {}) {
               const arr = await r.json();
               const last = Array.isArray(arr) ? arr[arr.length - 1] : null;
               const info = last?.info;
-              // Only treat assistant messages created *after* this prompt
-              // started as a completion signal for this call.
+              const parts = Array.isArray(last?.parts) ? last.parts : [];
+              // Most recent tool part — the one actually "running" if any.
+              let lastTool = null;
+              for (let i = parts.length - 1; i >= 0; i--) {
+                if (parts[i]?.type === "tool") { lastTool = parts[i]; break; }
+              }
+
+              // Activity signature: any change here = progress was made.
+              const sig = JSON.stringify({
+                mid: info?.id,
+                created: info?.time?.created,
+                completed: info?.time?.completed,
+                parts: parts.length,
+                tStatus: lastTool?.state?.status,
+                tOutLen: (lastTool?.state?.output || "").length,
+              });
+              if (sig !== prevSig) {
+                lastActivityMs = Date.now();
+                prevSig = sig;
+                pgrepMissCount = 0;
+              }
+
+              // Completion signal: assistant message created after our prompt
+              // started, with a terminal `finish` field populated.
               if (
                 info &&
                 info.role === "assistant" &&
@@ -201,23 +303,88 @@ export function createClient(baseUrl, opts = {}) {
               ) {
                 return { source: "watcher", data: last };
               }
+
+              // Bash-tool stuck detector: latest tool is bash in status=running
+              // but opencode serve has zero children for N consecutive polls.
+              // This is the signature of the "ask permission deadlock" bug
+              // (sst/opencode#14473): the shell process already exited cleanly
+              // but tool state never flipped to completed.
+              if (
+                opencodePid &&
+                lastTool?.tool === "bash" &&
+                lastTool?.state?.status === "running"
+              ) {
+                const n = countChildren(opencodePid);
+                if (n === 0) {
+                  pgrepMissCount += 1;
+                  if (pgrepMissCount >= PGREP_MISS_THRESHOLD) {
+                    ac.abort(
+                      new Error(
+                        `bash tool stuck — opencode serve (pid ${opencodePid}) has no child for ${pgrepMissCount} polls while tool.status=running`,
+                      ),
+                    );
+                    throw new Error("bash tool stuck (no child)");
+                  }
+                } else if (n > 0) {
+                  pgrepMissCount = 0;
+                }
+                // n === -1 → feature unavailable, don't count either way
+              }
+
+              // Idle timeout: nothing happened in the session for too long.
+              // Covers all tool types (not just bash), including non-pgrep
+              // platforms (Windows).
+              const idleMs = Date.now() - lastActivityMs;
+              if (idleMs > IDLE_TIMEOUT_MS) {
+                ac.abort(
+                  new Error(
+                    `session idle ${Math.floor(idleMs / 1000)}s > ${IDLE_TIMEOUT_MS / 1000}s`,
+                  ),
+                );
+                throw new Error("session idle timeout");
+              }
             }
-          } catch {
-            // Ignore transient poll errors; keep waiting.
+          } catch (err) {
+            // If we aborted above, propagate so the outer race sees a failure.
+            if (ac.signal.aborted) throw err;
+            // Otherwise it's a transient network/server blip — keep polling.
           }
           await new Promise((r) => setTimeout(r, POLL_INTERVAL_MS));
         }
         throw new Error("watcher aborted");
       })();
 
+      // Settle-wrap each so a single rejection doesn't lose the other side.
+      // Server-side 5-min POST cap means fetchPromise often rejects LONG
+      // before the agent is actually done; we must still wait on the watcher.
+      const wrap = (p, via) =>
+        p.then(
+          (v) => ({ ok: true, via, data: v.data }),
+          (err) => ({ ok: false, via, err }),
+        );
+      const runFetch = wrap(fetchPromise, "fetch");
+      const runWatcher = wrap(watcherPromise, "watcher");
+
       try {
-        const winner = await Promise.race([fetchPromise, watcherPromise]);
-        // Whichever arrived first, cancel the other.
+        const first = await Promise.race([runFetch, runWatcher]);
+        if (first.ok) {
+          ac.abort();
+          fetchPromise.catch(() => {});
+          watcherPromise.catch(() => {});
+          return first.data;
+        }
+        // First to settle was a failure — the other promise may still succeed.
+        // Do NOT abort yet: in particular, the watcher needs to keep polling
+        // when the POST was killed by the server's 5-min cap but generation
+        // is still running.
+        const second = first.via === "fetch" ? await runWatcher : await runFetch;
         ac.abort();
-        // Swallow the loser's rejection to avoid unhandled rejection noise.
         fetchPromise.catch(() => {});
         watcherPromise.catch(() => {});
-        return winner.data;
+        if (second.ok) return second.data;
+        // Both failed — surface the more informative error. Prefer the
+        // fetch error because it usually has the HTTP status/body.
+        throw first.via === "fetch" ? first.err : second.err;
       } finally {
         clearTimeout(timeoutId);
       }