From 180619cf8768a101eaea703dfbd478a48f4b95ee Mon Sep 17 00:00:00 2001
From: Eve <eve@ai.cowbay.org>
Date: Thu, 23 Apr 2026 15:22:02 +0800
Subject: [PATCH] feat: require concrete evidence for progress claims

---
 hooks/force-recall/handler.ts                 | 29 ++++++++++++++---
 scripts/long_task_gate_lock.mjs               | 32 +++++++++++++++++--
 .../test_force_recall_long_task_preflight.mjs | 11 ++++---
 3 files changed, 59 insertions(+), 13 deletions(-)
diff --git a/hooks/force-recall/handler.ts b/hooks/force-recall/handler.ts
index f2e1bd6..b4d96ec 100644
--- a/hooks/force-recall/handler.ts
+++ b/hooks/force-recall/handler.ts
@@ -85,6 +85,19 @@ async function runLongTaskWrapper(workspaceDir: string, ctx: any): Promise<any |
   return runJsonScript(wrapperPath, workspaceDir, input, LONG_TASK_WRAPPER_TIMEOUT_MS);
 }
 
+/** Derives progress evidence from the wrapper result; returns null when nothing concrete is available. */
+function buildProgressEvidence(wrapperResult: any): Record<string, unknown> | null {
+  const rawTaskName: unknown = wrapperResult?.taskRecord?.task_name;
+  const sessionKey = typeof rawTaskName === "string" ? rawTaskName.trim() : "";
+
+  // The trimmed task name is reported as sessionKey, and only when silentLaunchOk is strictly true.
+  // NOTE(review): buildGateLockInput also reads runId, modified_files and verificationResult; none are set here.
+  if (wrapperResult?.silentLaunchOk !== true || sessionKey === "") {
+    return null;
+  }
+  return { sessionKey };
+}
+
 function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
   if (!wrapperResult || wrapperResult.classification !== "long_task") {
     return { classification: wrapperResult?.classification ?? "general_chat" };
@@ -92,6 +105,7 @@ function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
 
   const needsOwnerDecision = wrapperResult.needsOwnerDecision === true;
   const silentCandidate = wrapperResult.silentCandidate === true;
+  const progressEvidence = buildProgressEvidence(wrapperResult);
   const requiredNextAction = typeof wrapperResult.requiredNextAction === "string"
     ? wrapperResult.requiredNextAction.trim()
     : "";
@@ -130,6 +144,11 @@ function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
     claimedProgression: claimedProgression,
     statusSummary: claimedProgression,
     executionEvidence,
+    progressEvidence,
+    sessionKey: typeof progressEvidence?.sessionKey === "string" ? progressEvidence.sessionKey : "",
+    runId: typeof progressEvidence?.runId === "string" ? progressEvidence.runId : "",
+    modified_files: Array.isArray(progressEvidence?.modified_files) ? progressEvidence.modified_files : [],
+    verificationResult: typeof progressEvidence?.verificationResult === "string" ? progressEvidence.verificationResult : "",
     toolCallEvidence: "",
     dispatchEvidence: "",
     fileChangeEvidence: "",
@@ -201,8 +220,8 @@ function buildGateLockBlock(gateLockResult: GateLockResult | null): string {
       "gateStatus=degraded",
       "gateRequired=unknown",
       "- ENFORCEMENT: Gate-lock evaluator unavailable; keep existing long-task safeguards in force.",
-      "- ENFORCEMENT: Do not claim you have progressed into the next task or are already pushing the next step unless you have concrete evidence such as actual dispatch, tool calls, file changes, or a persisted checkpoint artifact.",
-      "- HARD_GATE: Evaluator unavailable is not permission to claim silent continuation or next-task progression without verifiable evidence.",
+      "- ENFORCEMENT: Do not claim you have progressed into the next task or are already pushing the next step unless you have concrete progress evidence such as a sessionKey, runId, modified_files record, verification result, actual dispatch, tool calls, file changes, or a persisted checkpoint artifact.",
+      "- HARD_GATE: Evaluator unavailable is not permission to claim silent continuation or next-task progression without verifiable progress evidence.",
       "- HARD_GATE: Fall back to a non-silent, evidence-preserving follow-up if you cannot prove checkpoint state or concrete execution.",
       "[/LONG_TASK_GATE_LOCK]",
       "",
@@ -219,15 +238,15 @@ function buildGateLockBlock(gateLockResult: GateLockResult | null): string {
       return `requiredEvidence=${requirement.evidenceKey ?? "unknown"};fields=${fields};requiredValue=${requirement.requiredValue ?? "unknown"}`;
     })),
     ...((gateLockResult.allowedResponseModes ?? []).map((mode) => `allowedResponseMode=${mode}`)),
-    "- ENFORCEMENT: Do not claim you have progressed into the next task or are already pushing the next step unless you have concrete evidence such as actual dispatch, tool calls, file changes, or a persisted checkpoint artifact.",
+    "- ENFORCEMENT: Do not claim you have progressed into the next task or are already pushing the next step unless you have concrete progress evidence such as a sessionKey, runId, modified_files record, verification result, actual dispatch, tool calls, file changes, or a persisted checkpoint artifact.",
     "- ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.",
     "- ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.",
   ];
 
   if (gateLockResult.gateStatus === "fail") {
     lines.push("- HARD_GATE: Block any plain-text handoff or silent-continuation claim when externalized checkpoint evidence is missing.");
-    lines.push("- HARD_GATE: Block any reply path that says you already moved into the next task or are advancing the next step without concrete execution evidence.");
-    lines.push("- HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to actual tool execution, file changes, emitted messages, or checkpoint records.");
+    lines.push("- HARD_GATE: Block any reply path that says you already moved into the next task or are advancing the next step without concrete progress evidence.");
+    lines.push("- HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to a sessionKey, runId, modified_files record, verification result, actual tool execution, file changes, emitted messages, or checkpoint records.");
     lines.push("- HARD_GATE: If required evidence is missing, ask for/produce the checkpoint or downgrade to a non-silent, evidence-preserving follow-up.");
     lines.push("- HARD_GATE: If owner decision is involved, do not replace button-path closure with plain-text handoff.");
   }
diff --git a/scripts/long_task_gate_lock.mjs b/scripts/long_task_gate_lock.mjs
index 87edab6..a1eca38 100644
--- a/scripts/long_task_gate_lock.mjs
+++ b/scripts/long_task_gate_lock.mjs
@@ -30,6 +30,17 @@ const EVIDENCE_FIELDS = Object.freeze({
     'verificationEvidence',
     'checkpointArtifactEvidence',
   ]),
+  progressEvidence: Object.freeze([
+    'progressEvidence',
+    'progressEvidence.sessionKey',
+    'progressEvidence.runId',
+    'progressEvidence.modified_files',
+    'progressEvidence.verificationResult',
+    'sessionKey',
+    'runId',
+    'modified_files',
+    'verificationResult',
+  ]),
 });
 
 const GATE_REQUIREMENTS = Object.freeze({
@@ -53,6 +64,11 @@ const GATE_REQUIREMENTS = Object.freeze({
     acceptedFields: EVIDENCE_FIELDS.executionEvidence,
     requiredValue: 'tool call, dispatch, file change, verification output, or checkpoint artifact evidence',
   }),
+  progressEvidence: Object.freeze({
+    evidenceKey: 'progressEvidence',
+    acceptedFields: EVIDENCE_FIELDS.progressEvidence,
+    requiredValue: 'sessionKey, runId, modified_files, verification result, or equivalent concrete progress evidence',
+  }),
 });
 
 function fail(code, message) {
@@ -164,6 +180,16 @@ function hasExecutionEvidence(input) {
   });
 }
 
+/** True when an accepted progress field holds a value passing hasNonEmptyString, directly or nested; placeholders like { sessionKey: '' } or [''] do not count. */
+function hasProgressEvidence(input) {
+  const holdsEvidence = (value) => {
+    if (hasNonEmptyString(value)) return true;
+    if (Array.isArray(value)) return value.some(holdsEvidence);
+    return Boolean(value) && typeof value === 'object' && Object.values(value).some(holdsEvidence);
+  };
+  return EVIDENCE_FIELDS.progressEvidence.some((fieldPath) => holdsEvidence(getPathValue(input, fieldPath)));
+}
+
 function claimsProgressionWithoutEvidence(input) {
   const progressionClaim = EVIDENCE_FIELDS.progressionClaim
     .map((fieldPath) => getPathValue(input, fieldPath))
@@ -171,7 +197,7 @@ function claimsProgressionWithoutEvidence(input) {
 
   if (!hasNonEmptyString(progressionClaim)) return false;
 
-  return !hasExecutionEvidence(input);
+  return !(hasProgressEvidence(input) || hasExecutionEvidence(input));
 }
 
 function evaluateGate(input) {
@@ -215,8 +241,8 @@ function evaluateGate(input) {
 
   if (claimsProgressionWithoutEvidence(input)) {
     failed = true;
-    reasons.push('claimed progression without concrete execution evidence is forbidden');
-    requiredEvidence.push(describeRequirement(GATE_REQUIREMENTS.executionEvidence));
+    reasons.push('claimed progression without concrete progress evidence is forbidden');
+    requiredEvidence.push(describeRequirement(GATE_REQUIREMENTS.progressEvidence));
     allowedResponseModes.push('evidence_preserving_follow_up');
   }
 
diff --git a/scripts/test_force_recall_long_task_preflight.mjs b/scripts/test_force_recall_long_task_preflight.mjs
index e36aead..e8e937c 100644
--- a/scripts/test_force_recall_long_task_preflight.mjs
+++ b/scripts/test_force_recall_long_task_preflight.mjs
@@ -66,11 +66,12 @@ async function main() {
     'reason=silent long-task cannot continue without externalized checkpoint path',
     'reason=claimed execution requires evidence of a concrete next action',
     'reason=owner decision flow must end in button-path, not plain text',
-    'reason=claimed progression without concrete execution evidence is forbidden',
-    'requiredEvidence=executionEvidence',
+    'reason=claimed progression without concrete progress evidence is forbidden',
+    'requiredEvidence=progressEvidence',
+    'requiredValue=sessionKey, runId, modified_files, verification result, or equivalent concrete progress evidence',
     'HARD_GATE: Block any plain-text handoff or silent-continuation claim when externalized checkpoint evidence is missing.',
-    'HARD_GATE: Block any reply path that says you already moved into the next task or are advancing the next step without concrete execution evidence.',
-    'HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to actual tool execution, file changes, emitted messages, or checkpoint records.',
+    'HARD_GATE: Block any reply path that says you already moved into the next task or are advancing the next step without concrete progress evidence.',
+    'HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to a sessionKey, runId, modified_files record, verification result, actual tool execution, file changes, emitted messages, or checkpoint records.',
     'ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.',
     'ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.',
   ];
@@ -98,7 +99,7 @@ async function main() {
     '[LONG_TASK_GATE_LOCK]',
     'gateStatus=degraded',
     'gateRequired=unknown',
-    'HARD_GATE: Evaluator unavailable is not permission to claim silent continuation or next-task progression without verifiable evidence.',
+    'HARD_GATE: Evaluator unavailable is not permission to claim silent continuation or next-task progression without verifiable progress evidence.',
     'HARD_GATE: Fall back to a non-silent, evidence-preserving follow-up if you cannot prove checkpoint state or concrete execution.',
   ];