Initial import of watchdog-discord-route skill
This commit is contained in:
467
scripts/notify_watchdog_b.py
Executable file
467
scripts/notify_watchdog_b.py
Executable file
@@ -0,0 +1,467 @@
|
||||
#!/usr/bin/env python3
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import os
|
||||
import shutil
|
||||
import subprocess
|
||||
import sys
|
||||
from datetime import datetime, timezone
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
SCRIPT_DIR = Path(__file__).resolve().parent
|
||||
SKILL_DIR = SCRIPT_DIR.parent
|
||||
WORKSPACE = Path(os.environ.get("WATCHDOG_B_WORKSPACE", str(Path.home() / ".openclaw" / "workspace")))
|
||||
CONFIG_FILE = Path(os.environ.get("WATCHDOG_B_CONFIG_FILE", str(Path.home() / ".config" / "openclaw" / "watchdog-b.env")))
|
||||
LIVE_SCRIPT_DIR = Path(os.environ.get("WATCHDOG_B_LIVE_SCRIPT_DIR", str(WORKSPACE / "scripts" / "watchdog-b")))
|
||||
|
||||
|
||||
def load_env_file(path: Path) -> None:
|
||||
if not path.exists():
|
||||
return
|
||||
for raw_line in path.read_text(encoding="utf-8").splitlines():
|
||||
line = raw_line.strip()
|
||||
if not line or line.startswith("#") or "=" not in line:
|
||||
continue
|
||||
key, value = line.split("=", 1)
|
||||
key = key.strip()
|
||||
if not key:
|
||||
continue
|
||||
value = value.strip()
|
||||
if (value.startswith('"') and value.endswith('"')) or (value.startswith("'") and value.endswith("'")):
|
||||
value = value[1:-1]
|
||||
os.environ.setdefault(key, value)
|
||||
|
||||
|
||||
load_env_file(CONFIG_FILE)
|
||||
|
||||
STATE_DIR = Path(os.environ.get("WATCHDOG_B_ARTIFACT_DIR", str(WORKSPACE / "state" / "watchdog-b")))
|
||||
NOTIFY_STATE_PATH = STATE_DIR / "notify-state.json"
|
||||
OWNER_PRODUCER = Path(os.environ.get("WATCHDOG_B_OWNER_PRODUCER", str(SCRIPT_DIR / "owner_report_producer.py")))
|
||||
OWNER_DRIVER = Path(os.environ.get("WATCHDOG_B_OWNER_DRIVER", str(SCRIPT_DIR / "owner_report_driver.py")))
|
||||
PYTHON_BIN = os.environ.get("WATCHDOG_B_PYTHON_BIN", sys.executable or "python3")
|
||||
WATCHDOG_OWNER_REPORT_CHANNEL = os.environ.get("WATCHDOG_B_OWNER_REPORT_CHANNEL", "discord")
|
||||
WATCHDOG_OWNER_REPORT_TARGET = os.environ.get("WATCHDOG_B_OWNER_REPORT_TARGET", "channel:REPLACE_ME")
|
||||
WATCHDOG_MAIN_AGENT_ID = os.environ.get("WATCHDOG_B_MAIN_AGENT_ID", "").strip()
|
||||
HOSTNAME = os.uname().nodename
|
||||
UTC = timezone.utc
|
||||
RUNTIME_PROBE = Path(os.environ.get("WATCHDOG_B_RUNTIME_PROBE", str(SCRIPT_DIR / "openclaw_runtime_probe.py")))
|
||||
RUNTIME_CACHE: dict[str, Path] | None = None
|
||||
|
||||
DEFAULTS = {
|
||||
"running_min_interval_seconds": 3600,
|
||||
"stalled_nudge_min_interval_seconds": 900,
|
||||
"idle_nudge_min_interval_seconds": 1800,
|
||||
"stalled_owner_escalation_after": 2,
|
||||
"idle_owner_escalation_after": 2,
|
||||
}
|
||||
|
||||
|
||||
def now_iso() -> str:
|
||||
return datetime.now().astimezone().isoformat(timespec="seconds")
|
||||
|
||||
|
||||
def path_or_none(value: str | None) -> Path | None:
|
||||
if not value:
|
||||
return None
|
||||
return Path(value).expanduser()
|
||||
|
||||
|
||||
def detect_runtime_paths() -> dict[str, Path]:
|
||||
global RUNTIME_CACHE
|
||||
if RUNTIME_CACHE is not None:
|
||||
return RUNTIME_CACHE
|
||||
|
||||
node_bin = path_or_none(os.environ.get("WATCHDOG_B_NODE_BIN"))
|
||||
openclaw_mjs = path_or_none(os.environ.get("WATCHDOG_B_OPENCLAW_MJS"))
|
||||
openclaw_entry = path_or_none(os.environ.get("WATCHDOG_B_OPENCLAW_ENTRY"))
|
||||
|
||||
if node_bin and node_bin.exists() and os.access(node_bin, os.X_OK) and openclaw_mjs and openclaw_mjs.is_file() and openclaw_entry and openclaw_entry.is_file():
|
||||
RUNTIME_CACHE = {
|
||||
"node": node_bin,
|
||||
"openclaw_mjs": openclaw_mjs,
|
||||
"openclaw_entry": openclaw_entry,
|
||||
}
|
||||
return RUNTIME_CACHE
|
||||
|
||||
if RUNTIME_PROBE.exists():
|
||||
proc = subprocess.run([PYTHON_BIN, str(RUNTIME_PROBE)], text=True, capture_output=True)
|
||||
if proc.returncode == 0:
|
||||
payload = json.loads(proc.stdout)
|
||||
detected = payload.get("detected", {})
|
||||
RUNTIME_CACHE = {
|
||||
"node": Path(detected["node"]),
|
||||
"openclaw_mjs": Path(detected["openclaw_mjs"]),
|
||||
"openclaw_entry": Path(detected["openclaw_entry"]),
|
||||
}
|
||||
return RUNTIME_CACHE
|
||||
|
||||
node_which = shutil.which("node")
|
||||
if node_which:
|
||||
node_bin = Path(node_which)
|
||||
|
||||
missing = []
|
||||
if not node_bin or not node_bin.exists():
|
||||
missing.append("WATCHDOG_B_NODE_BIN")
|
||||
if not openclaw_mjs or not openclaw_mjs.is_file():
|
||||
missing.append("WATCHDOG_B_OPENCLAW_MJS")
|
||||
if not openclaw_entry or not openclaw_entry.is_file():
|
||||
missing.append("WATCHDOG_B_OPENCLAW_ENTRY")
|
||||
raise RuntimeError(
|
||||
"Unable to auto-detect watchdog runtime paths. Missing: " + ", ".join(missing)
|
||||
)
|
||||
|
||||
|
||||
def load_state() -> dict[str, Any]:
|
||||
if NOTIFY_STATE_PATH.exists():
|
||||
try:
|
||||
return json.loads(NOTIFY_STATE_PATH.read_text(encoding="utf-8"))
|
||||
except Exception:
|
||||
pass
|
||||
return {"events": {}}
|
||||
|
||||
|
||||
def save_state(data: dict[str, Any]) -> None:
|
||||
STATE_DIR.mkdir(parents=True, exist_ok=True)
|
||||
NOTIFY_STATE_PATH.write_text(json.dumps(data, ensure_ascii=False, indent=2) + "\n", encoding="utf-8")
|
||||
|
||||
|
||||
def event_bucket(state: str) -> dict[str, Any]:
|
||||
data = load_state()
|
||||
events = data.setdefault("events", {})
|
||||
bucket = events.setdefault(state, {})
|
||||
return data
|
||||
|
||||
|
||||
def get_bucket(data: dict[str, Any], state: str) -> dict[str, Any]:
|
||||
events = data.setdefault("events", {})
|
||||
return events.setdefault(state, {})
|
||||
|
||||
|
||||
def should_send(bucket: dict[str, Any], min_interval_seconds: int, timestamp: datetime) -> tuple[bool, str]:
|
||||
last_sent = bucket.get("last_sent_at")
|
||||
if not last_sent:
|
||||
return True, "first-send"
|
||||
try:
|
||||
prev = datetime.fromisoformat(last_sent)
|
||||
except Exception:
|
||||
return True, "state-corrupt-reset"
|
||||
elapsed = (timestamp - prev).total_seconds()
|
||||
if elapsed >= min_interval_seconds:
|
||||
return True, f"interval-ok:{int(elapsed)}s"
|
||||
return False, f"throttled:{int(elapsed)}s<{min_interval_seconds}s"
|
||||
|
||||
|
||||
def mark_sent(bucket: dict[str, Any], channel: str, timestamp: str, detail: dict[str, Any] | None = None) -> None:
|
||||
bucket["last_sent_at"] = timestamp
|
||||
bucket["last_channel"] = channel
|
||||
bucket["send_count"] = int(bucket.get("send_count", 0)) + 1
|
||||
bucket["last_detail"] = detail or {}
|
||||
|
||||
|
||||
def build_owner_message(state: str, timestamp: str, detail: str) -> dict[str, str]:
|
||||
emoji_default = {
|
||||
"running": "✅",
|
||||
"stalled": "⚠️",
|
||||
"idle": "🛑",
|
||||
}
|
||||
summary_default = {
|
||||
"running": "主程序仍在運行",
|
||||
"stalled": "主程序疑似卡住",
|
||||
"idle": "主程序目前未運行",
|
||||
}
|
||||
progress_default = {
|
||||
"running": "running",
|
||||
"stalled": "stalled",
|
||||
"idle": "idle",
|
||||
}
|
||||
status_default = {
|
||||
"running": "normal",
|
||||
"stalled": "needs-attention",
|
||||
"idle": "needs-attention",
|
||||
}
|
||||
source_default = {
|
||||
"running": "watchdog-b-running",
|
||||
"stalled": "watchdog-b-stalled-escalation",
|
||||
"idle": "watchdog-b-idle-escalation",
|
||||
}
|
||||
detail_default = {
|
||||
"running": f"checked_at={timestamp} host={HOSTNAME}",
|
||||
"stalled": f"checked_at={timestamp} host={HOSTNAME}; stale activity detected while process still looked alive",
|
||||
"idle": f"checked_at={timestamp} host={HOSTNAME}; no active main runtime detected",
|
||||
}
|
||||
return {
|
||||
"progress": os.environ.get(f"WATCHDOG_B_{state.upper()}_PROGRESS_LABEL", progress_default[state]),
|
||||
"done": f"{os.environ.get(f'WATCHDOG_B_{state.upper()}_EMOJI', emoji_default[state])} {os.environ.get(f'WATCHDOG_B_{state.upper()}_SUMMARY', summary_default[state])}",
|
||||
"next": detail or os.environ.get(f"WATCHDOG_B_{state.upper()}_DETAIL", detail_default[state]),
|
||||
"status": os.environ.get(f"WATCHDOG_B_{state.upper()}_STATUS", status_default[state]),
|
||||
"source": os.environ.get(f"WATCHDOG_B_{state.upper()}_SOURCE", source_default[state]),
|
||||
}
|
||||
|
||||
|
||||
def enqueue_owner_report(*, state: str, timestamp: str, dry_run: bool, detail: str) -> dict[str, Any]:
|
||||
msg = build_owner_message(state, timestamp, detail)
|
||||
report_id = f"watchdog-b-{state}-{datetime.now(UTC).strftime('%Y%m%dT%H%M%SZ')}"
|
||||
cmd = [
|
||||
PYTHON_BIN,
|
||||
str(OWNER_PRODUCER),
|
||||
"--team",
|
||||
"watchdog-b",
|
||||
"--worker",
|
||||
HOSTNAME,
|
||||
"--task-id",
|
||||
f"openclaw-main-{state}",
|
||||
"--progress",
|
||||
msg["progress"],
|
||||
"--done",
|
||||
msg["done"],
|
||||
"--next",
|
||||
msg["next"],
|
||||
"--status",
|
||||
msg["status"],
|
||||
"--source",
|
||||
msg["source"],
|
||||
"--report-id",
|
||||
report_id,
|
||||
]
|
||||
if dry_run:
|
||||
cmd.append("--dry-run")
|
||||
proc = subprocess.run(cmd, text=True, capture_output=True)
|
||||
result = {
|
||||
"kind": "owner-report-enqueue",
|
||||
"ok": proc.returncode == 0,
|
||||
"command": cmd,
|
||||
"exit_code": proc.returncode,
|
||||
"stdout": proc.stdout,
|
||||
"stderr": proc.stderr,
|
||||
"report_id": report_id,
|
||||
"dry_run": dry_run,
|
||||
}
|
||||
if proc.returncode == 0 and not dry_run:
|
||||
result["pending_path"] = str(Path.home() / ".clawteam" / "owner-reports" / "pending" / f"{report_id}.md")
|
||||
return result
|
||||
|
||||
|
||||
def build_owner_send_cmd() -> str:
|
||||
runtime = detect_runtime_paths()
|
||||
return (
|
||||
f'"{runtime["node"]}" "{runtime["openclaw_entry"]}" message send '
|
||||
f'--channel {WATCHDOG_OWNER_REPORT_CHANNEL} '
|
||||
f"--target '{WATCHDOG_OWNER_REPORT_TARGET}' "
|
||||
f'--message "$OWNER_REPORT_MESSAGE"'
|
||||
)
|
||||
|
||||
|
||||
def deliver_owner_report(*, report_id: str, dry_run: bool) -> dict[str, Any]:
|
||||
send_cmd = build_owner_send_cmd()
|
||||
cmd = [PYTHON_BIN, str(OWNER_DRIVER), report_id, "--send-cmd", send_cmd]
|
||||
if dry_run:
|
||||
cmd.append("--dry-run")
|
||||
proc = subprocess.run(cmd, text=True, capture_output=True)
|
||||
return {
|
||||
"kind": "owner-report-direct-delivery",
|
||||
"ok": proc.returncode == 0,
|
||||
"command": cmd,
|
||||
"send_cmd": send_cmd,
|
||||
"exit_code": proc.returncode,
|
||||
"stdout": proc.stdout,
|
||||
"stderr": proc.stderr,
|
||||
"dry_run": dry_run,
|
||||
"report_id": report_id,
|
||||
"target_channel": WATCHDOG_OWNER_REPORT_CHANNEL,
|
||||
"target": WATCHDOG_OWNER_REPORT_TARGET,
|
||||
}
|
||||
|
||||
|
||||
def call_main_agent(*, state: str, timestamp: str, dry_run: bool) -> dict[str, Any]:
|
||||
message = (
|
||||
f"[watchdog-b][{state}] {timestamp}\n"
|
||||
f"Host: {HOSTNAME}\n"
|
||||
f"Please confirm current task state, whether progress is blocked, and whether owner-facing escalation is needed."
|
||||
)
|
||||
if not WATCHDOG_MAIN_AGENT_ID:
|
||||
return {
|
||||
"kind": "main-agent-nudge",
|
||||
"ok": True,
|
||||
"skipped": True,
|
||||
"reason": "WATCHDOG_B_MAIN_AGENT_ID not configured",
|
||||
"dry_run": dry_run,
|
||||
"message": message,
|
||||
}
|
||||
try:
|
||||
runtime = detect_runtime_paths()
|
||||
except Exception as exc:
|
||||
return {
|
||||
"kind": "main-agent-nudge",
|
||||
"ok": False,
|
||||
"dry_run": dry_run,
|
||||
"error": str(exc),
|
||||
"message": message,
|
||||
}
|
||||
cmd = [
|
||||
str(runtime["node"]),
|
||||
str(runtime["openclaw_mjs"]),
|
||||
"agent",
|
||||
"--agent",
|
||||
WATCHDOG_MAIN_AGENT_ID,
|
||||
"--message",
|
||||
message,
|
||||
"--timeout",
|
||||
os.environ.get("WATCHDOG_B_MAIN_AGENT_TIMEOUT", "120"),
|
||||
]
|
||||
if dry_run:
|
||||
return {"kind": "main-agent-nudge", "ok": True, "dry_run": True, "command": cmd, "message": message}
|
||||
try:
|
||||
proc = subprocess.run(cmd, text=True, capture_output=True, timeout=int(os.environ.get("WATCHDOG_B_MAIN_AGENT_TIMEOUT", "120")) + 10)
|
||||
return {
|
||||
"kind": "main-agent-nudge",
|
||||
"ok": proc.returncode == 0,
|
||||
"dry_run": False,
|
||||
"command": cmd,
|
||||
"exit_code": proc.returncode,
|
||||
"stdout": proc.stdout,
|
||||
"stderr": proc.stderr,
|
||||
"message": message,
|
||||
}
|
||||
except subprocess.TimeoutExpired as e:
|
||||
return {
|
||||
"kind": "main-agent-nudge",
|
||||
"ok": False,
|
||||
"dry_run": False,
|
||||
"command": cmd,
|
||||
"timeout": True,
|
||||
"stdout": e.stdout,
|
||||
"stderr": e.stderr,
|
||||
"message": message,
|
||||
}
|
||||
|
||||
|
||||
def maybe_running_report(data: dict[str, Any], bucket: dict[str, Any], timestamp: str, dry_run: bool) -> dict[str, Any]:
|
||||
mode = os.environ.get("WATCHDOG_B_RUNNING_REPORT_MODE", "manual").lower()
|
||||
min_interval = int(os.environ.get("WATCHDOG_B_RUNNING_REPORT_MIN_INTERVAL_SECONDS", str(DEFAULTS["running_min_interval_seconds"])))
|
||||
allowed, reason = should_send(bucket, min_interval, datetime.fromisoformat(timestamp))
|
||||
result: dict[str, Any] = {
|
||||
"state": "running",
|
||||
"route": "owner-report",
|
||||
"mode": mode,
|
||||
"allowed": allowed,
|
||||
"reason": reason,
|
||||
"dry_run": dry_run,
|
||||
}
|
||||
if mode not in {"manual", "enqueue", "enqueue-and-drain"}:
|
||||
result.update({"ok": False, "error": f"unsupported running mode: {mode}"})
|
||||
return result
|
||||
if mode == "manual":
|
||||
result.update({
|
||||
"ok": True,
|
||||
"action": "manual-only",
|
||||
"hint": "set WATCHDOG_B_RUNNING_REPORT_MODE=enqueue to create a real pending item, or enqueue-and-drain to enqueue and directly deliver it to Discord",
|
||||
})
|
||||
return result
|
||||
if not allowed:
|
||||
result.update({"ok": True, "action": "suppressed"})
|
||||
return result
|
||||
enqueue = enqueue_owner_report(state="running", timestamp=timestamp, dry_run=dry_run, detail="Main runtime alive and log activity fresh.")
|
||||
result["enqueue"] = enqueue
|
||||
result["ok"] = enqueue.get("ok", False)
|
||||
if enqueue.get("ok"):
|
||||
mark_sent(bucket, "owner-report-enqueue", timestamp, {"report_id": enqueue.get("report_id")})
|
||||
if mode == "enqueue-and-drain" and enqueue.get("ok"):
|
||||
deliver = deliver_owner_report(report_id=enqueue.get("report_id"), dry_run=dry_run)
|
||||
result["deliver"] = deliver
|
||||
result["ok"] = result["ok"] and deliver.get("ok", False)
|
||||
if deliver.get("ok"):
|
||||
mark_sent(bucket, "owner-report-direct-delivery", timestamp, {"report_id": enqueue.get("report_id")})
|
||||
return result
|
||||
|
||||
|
||||
def maybe_nudge_and_escalate(data: dict[str, Any], bucket: dict[str, Any], *, state: str, timestamp: str, dry_run: bool) -> dict[str, Any]:
|
||||
is_stalled = state == "stalled"
|
||||
nudge_min = int(os.environ.get(
|
||||
"WATCHDOG_B_STALLED_NUDGE_MIN_INTERVAL_SECONDS" if is_stalled else "WATCHDOG_B_IDLE_NUDGE_MIN_INTERVAL_SECONDS",
|
||||
str(DEFAULTS["stalled_nudge_min_interval_seconds"] if is_stalled else DEFAULTS["idle_nudge_min_interval_seconds"]),
|
||||
))
|
||||
escalation_after = int(os.environ.get(
|
||||
"WATCHDOG_B_STALLED_OWNER_ESCALATION_AFTER" if is_stalled else "WATCHDOG_B_IDLE_OWNER_ESCALATION_AFTER",
|
||||
str(DEFAULTS["stalled_owner_escalation_after"] if is_stalled else DEFAULTS["idle_owner_escalation_after"]),
|
||||
))
|
||||
owner_mode = os.environ.get(
|
||||
"WATCHDOG_B_STALLED_OWNER_MODE" if is_stalled else "WATCHDOG_B_IDLE_OWNER_MODE",
|
||||
"escalate",
|
||||
).lower()
|
||||
|
||||
bucket["seen_count"] = int(bucket.get("seen_count", 0)) + 1
|
||||
allowed, reason = should_send(bucket, nudge_min, datetime.fromisoformat(timestamp))
|
||||
result: dict[str, Any] = {
|
||||
"state": state,
|
||||
"route": "main-agent-then-owner",
|
||||
"allowed": allowed,
|
||||
"reason": reason,
|
||||
"seen_count": bucket["seen_count"],
|
||||
"owner_mode": owner_mode,
|
||||
"dry_run": dry_run,
|
||||
}
|
||||
|
||||
if allowed:
|
||||
nudge = call_main_agent(state=state, timestamp=timestamp, dry_run=dry_run)
|
||||
result["main_agent_nudge"] = nudge
|
||||
if nudge.get("ok"):
|
||||
mark_sent(bucket, "main-agent", timestamp, {"state": state})
|
||||
result["ok"] = nudge.get("ok", False)
|
||||
else:
|
||||
result.update({"ok": True, "action": "nudge-suppressed"})
|
||||
|
||||
should_escalate = owner_mode in {"always", "escalate"} and bucket["seen_count"] >= escalation_after
|
||||
if owner_mode == "never":
|
||||
should_escalate = False
|
||||
|
||||
if should_escalate:
|
||||
owner_allowed, owner_reason = should_send(bucket, nudge_min, datetime.fromisoformat(timestamp))
|
||||
result["owner_escalation_gate"] = {"allowed": owner_allowed, "reason": owner_reason, "threshold": escalation_after}
|
||||
if owner_allowed:
|
||||
detail = "Main agent was nudged repeatedly; please review whether manual intervention is needed."
|
||||
enqueue = enqueue_owner_report(state=state, timestamp=timestamp, dry_run=dry_run, detail=detail)
|
||||
result["owner_enqueue"] = enqueue
|
||||
result["ok"] = result.get("ok", True) and enqueue.get("ok", False)
|
||||
if enqueue.get("ok"):
|
||||
mark_sent(bucket, "owner-report-enqueue", timestamp, {"report_id": enqueue.get("report_id"), "state": state})
|
||||
owner_delivery_mode = os.environ.get(
|
||||
"WATCHDOG_B_OWNER_DELIVERY_MODE",
|
||||
"enqueue-only",
|
||||
).lower()
|
||||
result["owner_delivery_mode"] = owner_delivery_mode
|
||||
if owner_delivery_mode == "direct-discord":
|
||||
deliver = deliver_owner_report(report_id=enqueue.get("report_id"), dry_run=dry_run)
|
||||
result["owner_deliver"] = deliver
|
||||
result["ok"] = result.get("ok", True) and deliver.get("ok", False)
|
||||
if deliver.get("ok"):
|
||||
mark_sent(bucket, "owner-report-direct-delivery", timestamp, {"report_id": enqueue.get("report_id"), "state": state})
|
||||
return result
|
||||
|
||||
|
||||
def main() -> int:
|
||||
ap = argparse.ArgumentParser(description="Notification layer for watchdog-b")
|
||||
ap.add_argument("--state", required=True, choices=["running", "stalled", "idle"])
|
||||
ap.add_argument("--timestamp", default=now_iso())
|
||||
ap.add_argument("--dry-run", action="store_true")
|
||||
args = ap.parse_args()
|
||||
|
||||
data = load_state()
|
||||
bucket = get_bucket(data, args.state)
|
||||
|
||||
if args.state == "running":
|
||||
result = maybe_running_report(data, bucket, args.timestamp, args.dry_run)
|
||||
else:
|
||||
result = maybe_nudge_and_escalate(data, bucket, state=args.state, timestamp=args.timestamp, dry_run=args.dry_run)
|
||||
|
||||
bucket["last_seen_at"] = args.timestamp
|
||||
bucket["last_result"] = result
|
||||
save_state(data)
|
||||
print(json.dumps(result, ensure_ascii=False, indent=2))
|
||||
return 0 if result.get("ok", False) else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
raise SystemExit(main())
|
||||
Reference in New Issue
Block a user