Initial import of watchdog-discord-route skill

2026-04-22 08:33:51 +08:00
commit 8138fb011d
22 changed files with 2447 additions and 0 deletions
--- a/scripts/notify_watchdog_b.py
+++ b/scripts/notify_watchdog_b.py
@@ -0,0 +1,467 @@
+#!/usr/bin/env python3
+from __future__ import annotations
+
+import argparse
+import json
+import os
+import shutil
+import subprocess
+import sys
+from datetime import datetime, timezone
+from pathlib import Path
+from typing import Any
+
+SCRIPT_DIR = Path(__file__).resolve().parent
+SKILL_DIR = SCRIPT_DIR.parent
+WORKSPACE = Path(os.environ.get("WATCHDOG_B_WORKSPACE", str(Path.home() / ".openclaw" / "workspace")))
+CONFIG_FILE = Path(os.environ.get("WATCHDOG_B_CONFIG_FILE", str(Path.home() / ".config" / "openclaw" / "watchdog-b.env")))
+LIVE_SCRIPT_DIR = Path(os.environ.get("WATCHDOG_B_LIVE_SCRIPT_DIR", str(WORKSPACE / "scripts" / "watchdog-b")))
+
+
+def load_env_file(path: Path) -> None:
+    if not path.exists():
+        return
+    for raw_line in path.read_text(encoding="utf-8").splitlines():
+        line = raw_line.strip()
+        if not line or line.startswith("#") or "=" not in line:
+            continue
+        key, value = line.split("=", 1)
+        key = key.strip()
+        if not key:
+            continue
+        value = value.strip()
+        if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
+            value = value[1:-1]
+        os.environ.setdefault(key, value)
+
+
+load_env_file(CONFIG_FILE)
+
+STATE_DIR = Path(os.environ.get("WATCHDOG_B_ARTIFACT_DIR", str(WORKSPACE / "state" / "watchdog-b")))
+NOTIFY_STATE_PATH = STATE_DIR / "notify-state.json"
+OWNER_PRODUCER = Path(os.environ.get("WATCHDOG_B_OWNER_PRODUCER", str(SCRIPT_DIR / "owner_report_producer.py")))
+OWNER_DRIVER = Path(os.environ.get("WATCHDOG_B_OWNER_DRIVER", str(SCRIPT_DIR / "owner_report_driver.py")))
+PYTHON_BIN = os.environ.get("WATCHDOG_B_PYTHON_BIN", sys.executable or "python3")
+WATCHDOG_OWNER_REPORT_CHANNEL = os.environ.get("WATCHDOG_B_OWNER_REPORT_CHANNEL", "discord")
+WATCHDOG_OWNER_REPORT_TARGET = os.environ.get("WATCHDOG_B_OWNER_REPORT_TARGET", "channel:REPLACE_ME")
+WATCHDOG_MAIN_AGENT_ID = os.environ.get("WATCHDOG_B_MAIN_AGENT_ID", "").strip()
+HOSTNAME = os.uname().nodename
+UTC = timezone.utc
+RUNTIME_PROBE = Path(os.environ.get("WATCHDOG_B_RUNTIME_PROBE", str(SCRIPT_DIR / "openclaw_runtime_probe.py")))
+RUNTIME_CACHE: dict[str, Path] | None = None
+
+DEFAULTS = {
+    "running_min_interval_seconds": 3600,
+    "stalled_nudge_min_interval_seconds": 900,
+    "idle_nudge_min_interval_seconds": 1800,
+    "stalled_owner_escalation_after": 2,
+    "idle_owner_escalation_after": 2,
+}
+
+
+def now_iso() -> str:
+    return datetime.now().astimezone().isoformat(timespec="seconds")
+
+
+def path_or_none(value: str | None) -> Path | None:
+    if not value:
+        return None
+    return Path(value).expanduser()
+
+
+def detect_runtime_paths() -> dict[str, Path]:
+    global RUNTIME_CACHE
+    if RUNTIME_CACHE is not None:
+        return RUNTIME_CACHE
+
+    node_bin = path_or_none(os.environ.get("WATCHDOG_B_NODE_BIN"))
+    openclaw_mjs = path_or_none(os.environ.get("WATCHDOG_B_OPENCLAW_MJS"))
+    openclaw_entry = path_or_none(os.environ.get("WATCHDOG_B_OPENCLAW_ENTRY"))
+
+    if node_bin and node_bin.exists() and os.access(node_bin, os.X_OK) and openclaw_mjs and openclaw_mjs.is_file() and openclaw_entry and openclaw_entry.is_file():
+        RUNTIME_CACHE = {
+            "node": node_bin,
+            "openclaw_mjs": openclaw_mjs,
+            "openclaw_entry": openclaw_entry,
+        }
+        return RUNTIME_CACHE
+
+    if RUNTIME_PROBE.exists():
+        proc = subprocess.run([PYTHON_BIN, str(RUNTIME_PROBE)], text=True, capture_output=True)
+        if proc.returncode == 0:
+            payload = json.loads(proc.stdout)
+            detected = payload.get("detected", {})
+            RUNTIME_CACHE = {
+                "node": Path(detected["node"]),
+                "openclaw_mjs": Path(detected["openclaw_mjs"]),
+                "openclaw_entry": Path(detected["openclaw_entry"]),
+            }
+            return RUNTIME_CACHE
+
+    node_which = shutil.which("node")
+    if node_which:
+        node_bin = Path(node_which)
+
+    missing = []
+    if not node_bin or not node_bin.exists():
+        missing.append("WATCHDOG_B_NODE_BIN")
+    if not openclaw_mjs or not openclaw_mjs.is_file():
+        missing.append("WATCHDOG_B_OPENCLAW_MJS")
+    if not openclaw_entry or not openclaw_entry.is_file():
+        missing.append("WATCHDOG_B_OPENCLAW_ENTRY")
+    raise RuntimeError(
+        "Unable to auto-detect watchdog runtime paths. Missing: " + ", ".join(missing)
+    )
+
+
+def load_state() -> dict[str, Any]:
+    if NOTIFY_STATE_PATH.exists():
+        try:
+            return json.loads(NOTIFY_STATE_PATH.read_text(encoding="utf-8"))
+        except Exception:
+            pass
+    return {"events": {}}
+
+
+def save_state(data: dict[str, Any]) -> None:
+    STATE_DIR.mkdir(parents=True, exist_ok=True)
+    NOTIFY_STATE_PATH.write_text(json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
+
+
+def event_bucket(state: str) -> dict[str, Any]:
+    data = load_state()
+    events = data.setdefault("events", {})
+    bucket = events.setdefault(state, {})
+    return data
+
+
+def get_bucket(data: dict[str, Any], state: str) -> dict[str, Any]:
+    events = data.setdefault("events", {})
+    return events.setdefault(state, {})
+
+
+def should_send(bucket: dict[str, Any], min_interval_seconds: int, timestamp: datetime) -> tuple[bool, str]:
+    last_sent = bucket.get("last_sent_at")
+    if not last_sent:
+        return True, "first-send"
+    try:
+        prev = datetime.fromisoformat(last_sent)
+    except Exception:
+        return True, "state-corrupt-reset"
+    elapsed = (timestamp - prev).total_seconds()
+    if elapsed >= min_interval_seconds:
+        return True, f"interval-ok:{int(elapsed)}s"
+    return False, f"throttled:{int(elapsed)}s<{min_interval_seconds}s"
+
+
+def mark_sent(bucket: dict[str, Any], channel: str, timestamp: str, detail: dict[str, Any] | None = None) -> None:
+    bucket["last_sent_at"] = timestamp
+    bucket["last_channel"] = channel
+    bucket["send_count"] = int(bucket.get("send_count", 0)) + 1
+    bucket["last_detail"] = detail or {}
+
+
+def build_owner_message(state: str, timestamp: str, detail: str) -> dict[str, str]:
+    emoji_default = {
+        "running": "✅",
+        "stalled": "⚠️",
+        "idle": "🛑",
+    }
+    summary_default = {
+        "running": "主程序仍在運行",
+        "stalled": "主程序疑似卡住",
+        "idle": "主程序目前未運行",
+    }
+    progress_default = {
+        "running": "running",
+        "stalled": "stalled",
+        "idle": "idle",
+    }
+    status_default = {
+        "running": "normal",
+        "stalled": "needs-attention",
+        "idle": "needs-attention",
+    }
+    source_default = {
+        "running": "watchdog-b-running",
+        "stalled": "watchdog-b-stalled-escalation",
+        "idle": "watchdog-b-idle-escalation",
+    }
+    detail_default = {
+        "running": f"checked_at={timestamp} host={HOSTNAME}",
+        "stalled": f"checked_at={timestamp} host={HOSTNAME}; stale activity detected while process still looked alive",
+        "idle": f"checked_at={timestamp} host={HOSTNAME}; no active main runtime detected",
+    }
+    return {
+        "progress": os.environ.get(f"WATCHDOG_B_{state.upper()}_PROGRESS_LABEL", progress_default[state]),
+        "done": f"{os.environ.get(f'WATCHDOG_B_{state.upper()}_EMOJI', emoji_default[state])} {os.environ.get(f'WATCHDOG_B_{state.upper()}_SUMMARY', summary_default[state])}",
+        "next": detail or os.environ.get(f"WATCHDOG_B_{state.upper()}_DETAIL", detail_default[state]),
+        "status": os.environ.get(f"WATCHDOG_B_{state.upper()}_STATUS", status_default[state]),
+        "source": os.environ.get(f"WATCHDOG_B_{state.upper()}_SOURCE", source_default[state]),
+    }
+
+
+def enqueue_owner_report(*, state: str, timestamp: str, dry_run: bool, detail: str) -> dict[str, Any]:
+    msg = build_owner_message(state, timestamp, detail)
+    report_id = f"watchdog-b-{state}-{datetime.now(UTC).strftime('%Y%m%dT%H%M%SZ')}"
+    cmd = [
+        PYTHON_BIN,
+        str(OWNER_PRODUCER),
+        "--team",
+        "watchdog-b",
+        "--worker",
+        HOSTNAME,
+        "--task-id",
+        f"openclaw-main-{state}",
+        "--progress",
+        msg["progress"],
+        "--done",
+        msg["done"],
+        "--next",
+        msg["next"],
+        "--status",
+        msg["status"],
+        "--source",
+        msg["source"],
+        "--report-id",
+        report_id,
+    ]
+    if dry_run:
+        cmd.append("--dry-run")
+    proc = subprocess.run(cmd, text=True, capture_output=True)
+    result = {
+        "kind": "owner-report-enqueue",
+        "ok": proc.returncode == 0,
+        "command": cmd,
+        "exit_code": proc.returncode,
+        "stdout": proc.stdout,
+        "stderr": proc.stderr,
+        "report_id": report_id,
+        "dry_run": dry_run,
+    }
+    if proc.returncode == 0 and not dry_run:
+        result["pending_path"] = str(Path.home() / ".clawteam" / "owner-reports" / "pending" / f"{report_id}.md")
+    return result
+
+
+def build_owner_send_cmd() -> str:
+    runtime = detect_runtime_paths()
+    return (
+        f'"{runtime["node"]}" "{runtime["openclaw_entry"]}" message send '
+        f'--channel {WATCHDOG_OWNER_REPORT_CHANNEL} '
+        f"--target '{WATCHDOG_OWNER_REPORT_TARGET}' "
+        f'--message "$OWNER_REPORT_MESSAGE"'
+    )
+
+
+def deliver_owner_report(*, report_id: str, dry_run: bool) -> dict[str, Any]:
+    send_cmd = build_owner_send_cmd()
+    cmd = [PYTHON_BIN, str(OWNER_DRIVER), report_id, "--send-cmd", send_cmd]
+    if dry_run:
+        cmd.append("--dry-run")
+    proc = subprocess.run(cmd, text=True, capture_output=True)
+    return {
+        "kind": "owner-report-direct-delivery",
+        "ok": proc.returncode == 0,
+        "command": cmd,
+        "send_cmd": send_cmd,
+        "exit_code": proc.returncode,
+        "stdout": proc.stdout,
+        "stderr": proc.stderr,
+        "dry_run": dry_run,
+        "report_id": report_id,
+        "target_channel": WATCHDOG_OWNER_REPORT_CHANNEL,
+        "target": WATCHDOG_OWNER_REPORT_TARGET,
+    }
+
+
+def call_main_agent(*, state: str, timestamp: str, dry_run: bool) -> dict[str, Any]:
+    message = (
+        f"[watchdog-b][{state}] {timestamp}\n"
+        f"Host: {HOSTNAME}\n"
+        f"Please confirm current task state, whether progress is blocked, and whether owner-facing escalation is needed."
+    )
+    if not WATCHDOG_MAIN_AGENT_ID:
+        return {
+            "kind": "main-agent-nudge",
+            "ok": True,
+            "skipped": True,
+            "reason": "WATCHDOG_B_MAIN_AGENT_ID not configured",
+            "dry_run": dry_run,
+            "message": message,
+        }
+    try:
+        runtime = detect_runtime_paths()
+    except Exception as exc:
+        return {
+            "kind": "main-agent-nudge",
+            "ok": False,
+            "dry_run": dry_run,
+            "error": str(exc),
+            "message": message,
+        }
+    cmd = [
+        str(runtime["node"]),
+        str(runtime["openclaw_mjs"]),
+        "agent",
+        "--agent",
+        WATCHDOG_MAIN_AGENT_ID,
+        "--message",
+        message,
+        "--timeout",
+        os.environ.get("WATCHDOG_B_MAIN_AGENT_TIMEOUT", "120"),
+    ]
+    if dry_run:
+        return {"kind": "main-agent-nudge", "ok": True, "dry_run": True, "command": cmd, "message": message}
+    try:
+        proc = subprocess.run(cmd, text=True, capture_output=True, timeout=int(os.environ.get("WATCHDOG_B_MAIN_AGENT_TIMEOUT", "120")) + 10)
+        return {
+            "kind": "main-agent-nudge",
+            "ok": proc.returncode == 0,
+            "dry_run": False,
+            "command": cmd,
+            "exit_code": proc.returncode,
+            "stdout": proc.stdout,
+            "stderr": proc.stderr,
+            "message": message,
+        }
+    except subprocess.TimeoutExpired as e:
+        return {
+            "kind": "main-agent-nudge",
+            "ok": False,
+            "dry_run": False,
+            "command": cmd,
+            "timeout": True,
+            "stdout": e.stdout,
+            "stderr": e.stderr,
+            "message": message,
+        }
+
+
+def maybe_running_report(data: dict[str, Any], bucket: dict[str, Any], timestamp: str, dry_run: bool) -> dict[str, Any]:
+    mode = os.environ.get("WATCHDOG_B_RUNNING_REPORT_MODE", "manual").lower()
+    min_interval = int(os.environ.get("WATCHDOG_B_RUNNING_REPORT_MIN_INTERVAL_SECONDS", str(DEFAULTS["running_min_interval_seconds"])))
+    allowed, reason = should_send(bucket, min_interval, datetime.fromisoformat(timestamp))
+    result: dict[str, Any] = {
+        "state": "running",
+        "route": "owner-report",
+        "mode": mode,
+        "allowed": allowed,
+        "reason": reason,
+        "dry_run": dry_run,
+    }
+    if mode not in {"manual", "enqueue", "enqueue-and-drain"}:
+        result.update({"ok": False, "error": f"unsupported running mode: {mode}"})
+        return result
+    if mode == "manual":
+        result.update({
+            "ok": True,
+            "action": "manual-only",
+            "hint": "set WATCHDOG_B_RUNNING_REPORT_MODE=enqueue to create a real pending item, or enqueue-and-drain to enqueue and directly deliver it to Discord",
+        })
+        return result
+    if not allowed:
+        result.update({"ok": True, "action": "suppressed"})
+        return result
+    enqueue = enqueue_owner_report(state="running", timestamp=timestamp, dry_run=dry_run, detail="Main runtime alive and log activity fresh.")
+    result["enqueue"] = enqueue
+    result["ok"] = enqueue.get("ok", False)
+    if enqueue.get("ok"):
+        mark_sent(bucket, "owner-report-enqueue", timestamp, {"report_id": enqueue.get("report_id")})
+    if mode == "enqueue-and-drain" and enqueue.get("ok"):
+        deliver = deliver_owner_report(report_id=enqueue.get("report_id"), dry_run=dry_run)
+        result["deliver"] = deliver
+        result["ok"] = result["ok"] and deliver.get("ok", False)
+        if deliver.get("ok"):
+            mark_sent(bucket, "owner-report-direct-delivery", timestamp, {"report_id": enqueue.get("report_id")})
+    return result
+
+
+def maybe_nudge_and_escalate(data: dict[str, Any], bucket: dict[str, Any], *, state: str, timestamp: str, dry_run: bool) -> dict[str, Any]:
+    is_stalled = state == "stalled"
+    nudge_min = int(os.environ.get(
+        "WATCHDOG_B_STALLED_NUDGE_MIN_INTERVAL_SECONDS" if is_stalled else "WATCHDOG_B_IDLE_NUDGE_MIN_INTERVAL_SECONDS",
+        str(DEFAULTS["stalled_nudge_min_interval_seconds"] if is_stalled else DEFAULTS["idle_nudge_min_interval_seconds"]),
+    ))
+    escalation_after = int(os.environ.get(
+        "WATCHDOG_B_STALLED_OWNER_ESCALATION_AFTER" if is_stalled else "WATCHDOG_B_IDLE_OWNER_ESCALATION_AFTER",
+        str(DEFAULTS["stalled_owner_escalation_after"] if is_stalled else DEFAULTS["idle_owner_escalation_after"]),
+    ))
+    owner_mode = os.environ.get(
+        "WATCHDOG_B_STALLED_OWNER_MODE" if is_stalled else "WATCHDOG_B_IDLE_OWNER_MODE",
+        "escalate",
+    ).lower()
+
+    bucket["seen_count"] = int(bucket.get("seen_count", 0)) + 1
+    allowed, reason = should_send(bucket, nudge_min, datetime.fromisoformat(timestamp))
+    result: dict[str, Any] = {
+        "state": state,
+        "route": "main-agent-then-owner",
+        "allowed": allowed,
+        "reason": reason,
+        "seen_count": bucket["seen_count"],
+        "owner_mode": owner_mode,
+        "dry_run": dry_run,
+    }
+
+    if allowed:
+        nudge = call_main_agent(state=state, timestamp=timestamp, dry_run=dry_run)
+        result["main_agent_nudge"] = nudge
+        if nudge.get("ok"):
+            mark_sent(bucket, "main-agent", timestamp, {"state": state})
+        result["ok"] = nudge.get("ok", False)
+    else:
+        result.update({"ok": True, "action": "nudge-suppressed"})
+
+    should_escalate = owner_mode in {"always", "escalate"} and bucket["seen_count"] >= escalation_after
+    if owner_mode == "never":
+        should_escalate = False
+
+    if should_escalate:
+        owner_allowed, owner_reason = should_send(bucket, nudge_min, datetime.fromisoformat(timestamp))
+        result["owner_escalation_gate"] = {"allowed": owner_allowed, "reason": owner_reason, "threshold": escalation_after}
+        if owner_allowed:
+            detail = "Main agent was nudged repeatedly; please review whether manual intervention is needed."
+            enqueue = enqueue_owner_report(state=state, timestamp=timestamp, dry_run=dry_run, detail=detail)
+            result["owner_enqueue"] = enqueue
+            result["ok"] = result.get("ok", True) and enqueue.get("ok", False)
+            if enqueue.get("ok"):
+                mark_sent(bucket, "owner-report-enqueue", timestamp, {"report_id": enqueue.get("report_id"), "state": state})
+                owner_delivery_mode = os.environ.get(
+                    "WATCHDOG_B_OWNER_DELIVERY_MODE",
+                    "enqueue-only",
+                ).lower()
+                result["owner_delivery_mode"] = owner_delivery_mode
+                if owner_delivery_mode == "direct-discord":
+                    deliver = deliver_owner_report(report_id=enqueue.get("report_id"), dry_run=dry_run)
+                    result["owner_deliver"] = deliver
+                    result["ok"] = result.get("ok", True) and deliver.get("ok", False)
+                    if deliver.get("ok"):
+                        mark_sent(bucket, "owner-report-direct-delivery", timestamp, {"report_id": enqueue.get("report_id"), "state": state})
+    return result
+
+
+def main() -> int:
+    ap = argparse.ArgumentParser(description="Notification layer for watchdog-b")
+    ap.add_argument("--state", required=True, choices=["running", "stalled", "idle"])
+    ap.add_argument("--timestamp", default=now_iso())
+    ap.add_argument("--dry-run", action="store_true")
+    args = ap.parse_args()
+
+    data = load_state()
+    bucket = get_bucket(data, args.state)
+
+    if args.state == "running":
+        result = maybe_running_report(data, bucket, args.timestamp, args.dry_run)
+    else:
+        result = maybe_nudge_and_escalate(data, bucket, state=args.state, timestamp=args.timestamp, dry_run=args.dry_run)
+
+    bucket["last_seen_at"] = args.timestamp
+    bucket["last_result"] = result
+    save_state(data)
+    print(json.dumps(result, ensure_ascii=False, indent=2))
+    return 0 if result.get("ok", False) else 1
+
+
+if __name__ == "__main__":
+    raise SystemExit(main())