feat: require auto-chain action evidence

This commit is contained in:
Eve
2026-04-23 19:34:24 +08:00
parent 242f7ce463
commit 17dd26cde7
3 changed files with 78 additions and 11 deletions

View File

@@ -146,11 +146,18 @@ function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
"bind_externalized_checkpoint_path_or_abort_silent_launch", "bind_externalized_checkpoint_path_or_abort_silent_launch",
].includes(requiredNextAction), ].includes(requiredNextAction),
); );
const autoChainNextAction = hasConcreteExecutionEvidence ? requiredNextAction : "";
const executionEvidence = hasConcreteExecutionEvidence const executionEvidence = hasConcreteExecutionEvidence
? { ? {
concreteNextAction: requiredNextAction, concreteNextAction: requiredNextAction,
} }
: null; : null;
const autoChainDispatchEvidence = progressEvidence && hasConcreteExecutionEvidence
? {
sessionKey: typeof progressEvidence.sessionKey === "string" ? progressEvidence.sessionKey : "",
concreteNextAction: requiredNextAction,
}
: null;
const claimedProgression = shouldClaimProgression(wrapperResult, progressEvidence) const claimedProgression = shouldClaimProgression(wrapperResult, progressEvidence)
? "already progressing to the next step in background" ? "already progressing to the next step in background"
: ""; : "";
@@ -170,6 +177,8 @@ function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
nextStep: hasConcreteExecutionEvidence ? requiredNextAction : "", nextStep: hasConcreteExecutionEvidence ? requiredNextAction : "",
requiredNextAction: hasConcreteExecutionEvidence ? requiredNextAction : "", requiredNextAction: hasConcreteExecutionEvidence ? requiredNextAction : "",
concreteNextAction: hasConcreteExecutionEvidence ? requiredNextAction : "", concreteNextAction: hasConcreteExecutionEvidence ? requiredNextAction : "",
autoChainNextAction,
autoChainDispatchEvidence,
progressionClaim: claimedProgression, progressionClaim: claimedProgression,
claimedProgression: claimedProgression, claimedProgression: claimedProgression,
statusSummary: claimedProgression, statusSummary: claimedProgression,
@@ -274,6 +283,7 @@ function buildGateLockBlock(gateLockResult: GateLockResult | null): string {
"- ENFORCEMENT: Hook input should include progressEvidence (or equivalent concrete fields) whenever a progression claim is present.", "- ENFORCEMENT: Hook input should include progressEvidence (or equivalent concrete fields) whenever a progression claim is present.",
"- ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.", "- ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.",
"- ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.", "- ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.",
"- ENFORCEMENT: If hook input carries autoChainNextAction, it must also carry matching autoChainDispatchEvidence before the gate may pass that auto-chain step.",
]; ];
if (gateLockResult.gateStatus === "fail") { if (gateLockResult.gateStatus === "fail") {
@@ -282,6 +292,7 @@ function buildGateLockBlock(gateLockResult: GateLockResult | null): string {
lines.push("- HARD_GATE: If a progression claim exists, the hook input must supply progressEvidence (or equivalent concrete fields) before the claim can pass gate."); lines.push("- HARD_GATE: If a progression claim exists, the hook input must supply progressEvidence (or equivalent concrete fields) before the claim can pass gate.");
lines.push("- HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to a sessionKey, runId, modified_files record, verification result, actual tool execution, file changes, emitted messages, or checkpoint records."); lines.push("- HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to a sessionKey, runId, modified_files record, verification result, actual tool execution, file changes, emitted messages, or checkpoint records.");
lines.push("- HARD_GATE: If required evidence is missing, ask for/produce the checkpoint or downgrade to a non-silent, evidence-preserving follow-up."); lines.push("- HARD_GATE: If required evidence is missing, ask for/produce the checkpoint or downgrade to a non-silent, evidence-preserving follow-up.");
lines.push("- HARD_GATE: If autoChainNextAction is explicit, you must actually dispatch it and surface autoChainDispatchEvidence; otherwise the gate fails.");
lines.push("- HARD_GATE: If owner decision is involved, do not replace button-path closure with plain-text handoff."); lines.push("- HARD_GATE: If owner decision is involved, do not replace button-path closure with plain-text handoff.");
} }

View File

@@ -30,6 +30,14 @@ const EVIDENCE_FIELDS = Object.freeze({
'verificationEvidence', 'verificationEvidence',
'checkpointArtifactEvidence', 'checkpointArtifactEvidence',
]), ]),
autoChainNextAction: Object.freeze([
'autoChainNextAction',
'auto_chain_next_action',
]),
autoChainDispatchEvidence: Object.freeze([
'autoChainDispatchEvidence',
'auto_chain_dispatch_evidence',
]),
progressEvidence: Object.freeze([ progressEvidence: Object.freeze([
'progressEvidence', 'progressEvidence',
'progressEvidence.sessionKey', 'progressEvidence.sessionKey',
@@ -64,6 +72,11 @@ const GATE_REQUIREMENTS = Object.freeze({
acceptedFields: EVIDENCE_FIELDS.executionEvidence, acceptedFields: EVIDENCE_FIELDS.executionEvidence,
requiredValue: 'tool call, dispatch, file change, verification output, or checkpoint artifact evidence', requiredValue: 'tool call, dispatch, file change, verification output, or checkpoint artifact evidence',
}), }),
autoChainDispatchEvidence: Object.freeze({
evidenceKey: 'autoChainDispatchEvidence',
acceptedFields: EVIDENCE_FIELDS.autoChainDispatchEvidence,
requiredValue: 'dispatched-action evidence for the explicit auto-chain next action',
}),
progressEvidence: Object.freeze({ progressEvidence: Object.freeze({
evidenceKey: 'progressEvidence', evidenceKey: 'progressEvidence',
acceptedFields: EVIDENCE_FIELDS.progressEvidence, acceptedFields: EVIDENCE_FIELDS.progressEvidence,
@@ -180,6 +193,29 @@ function hasExecutionEvidence(input) {
}); });
} }
/**
 * Report whether the input names an explicit auto-chain next action.
 *
 * Delegates to `hasAnyNonEmptyString` with the accepted aliases listed in
 * `EVIDENCE_FIELDS.autoChainNextAction`.
 *
 * @param {object} input - Hook input under evaluation.
 * @returns {boolean} Result of the non-empty-string check across the aliases.
 */
function hasExplicitAutoChainNextAction(input) {
  const acceptedAliases = EVIDENCE_FIELDS.autoChainNextAction;
  return hasAnyNonEmptyString(input, acceptedAliases);
}
/**
 * Report whether the input carries auto-chain dispatch evidence.
 *
 * Each alias in `EVIDENCE_FIELDS.autoChainDispatchEvidence` is looked up with
 * `getPathValue`. A looked-up value counts as evidence when it is:
 *   - a non-empty string (per `hasNonEmptyString`),
 *   - an array with at least one element, or
 *   - a non-array object with at least one own enumerable key.
 *
 * @param {object} input - Hook input under evaluation.
 * @returns {boolean} `true` as soon as one alias yields evidence, else `false`.
 */
function hasAutoChainDispatchEvidence(input) {
  for (const alias of EVIDENCE_FIELDS.autoChainDispatchEvidence) {
    const candidate = getPathValue(input, alias);

    if (hasNonEmptyString(candidate)) {
      return true;
    }

    if (Array.isArray(candidate)) {
      // Arrays are judged by length only; an empty array never reaches the
      // generic object check below.
      if (candidate.length > 0) {
        return true;
      }
      continue;
    }

    if (candidate && typeof candidate === 'object' && Object.keys(candidate).length > 0) {
      return true;
    }
  }
  return false;
}
/**
 * Decide whether the gate must demand auto-chain dispatch evidence.
 *
 * Evidence is required when an accepted alias of the auto-chain next-action
 * field holds an underscore-joined, letters-only identifier (for example
 * `dispatch_follow_up_subagent`). Values of any other shape, such as
 * space-separated prose, do not trigger the requirement.
 *
 * Every alias is inspected rather than only the first non-empty one, so a
 * free-text value under one alias cannot mask an action identifier supplied
 * under another alias and thereby skip the evidence requirement.
 *
 * @param {object} input - Hook input handed to the gate evaluator.
 * @returns {boolean} `true` when some alias holds an action identifier.
 */
function requiresAutoChainDispatchEvidence(input) {
  if (!hasExplicitAutoChainNextAction(input)) return false;

  // Two or more letter-only segments joined by single underscores.
  const actionIdentifierPattern = /^([a-z]+_)+[a-z]+$/i;

  return EVIDENCE_FIELDS.autoChainNextAction.some((fieldPath) => {
    const value = getPathValue(input, fieldPath);
    return hasNonEmptyString(value) && actionIdentifierPattern.test(value.trim());
  });
}
function hasProgressEvidence(input) { function hasProgressEvidence(input) {
return EVIDENCE_FIELDS.progressEvidence.some((fieldPath) => { return EVIDENCE_FIELDS.progressEvidence.some((fieldPath) => {
const value = getPathValue(input, fieldPath); const value = getPathValue(input, fieldPath);
@@ -249,6 +285,13 @@ function evaluateGate(input) {
allowedResponseModes.push('evidence_preserving_follow_up'); allowedResponseModes.push('evidence_preserving_follow_up');
} }
if (requiresAutoChainDispatchEvidence(input) && !hasAutoChainDispatchEvidence(input)) {
failed = true;
reasons.push('explicit auto-chain next action requires dispatched-action evidence');
requiredEvidence.push(describeRequirement(GATE_REQUIREMENTS.autoChainDispatchEvidence));
allowedResponseModes.push('dispatch_required');
}
if (!failed) { if (!failed) {
reasons.push('required long-task gate evidence is present or no gated condition was triggered'); reasons.push('required long-task gate evidence is present or no gated condition was triggered');
allowedResponseModes.push(needsOwnerDecision(input) ? 'button_path' : 'direct_reply'); allowedResponseModes.push(needsOwnerDecision(input) ? 'button_path' : 'direct_reply');

View File

@@ -106,6 +106,8 @@ async function main() {
classification: 'long_task', classification: 'long_task',
claimedExecution: true, claimedExecution: true,
concreteNextAction: 'dispatch_follow_up_subagent', concreteNextAction: 'dispatch_follow_up_subagent',
autoChainNextAction: 'dispatch_follow_up_subagent',
autoChainDispatchEvidence: { sessionKey: 'task-123', dispatched: 'dispatch_follow_up_subagent' },
progressionClaim: 'already progressing to the next step in background', progressionClaim: 'already progressing to the next step in background',
progressEvidence: { sessionKey: 'task-123' }, progressEvidence: { sessionKey: 'task-123' },
}); });
@@ -115,20 +117,30 @@ async function main() {
classification: 'long_task', classification: 'long_task',
claimedExecution: true, claimedExecution: true,
concreteNextAction: 'dispatch_follow_up_subagent', concreteNextAction: 'dispatch_follow_up_subagent',
autoChainNextAction: 'dispatch_follow_up_subagent',
progressionClaim: 'already progressing to the next step in background', progressionClaim: 'already progressing to the next step in background',
executionEvidence: { concreteNextAction: 'dispatch_follow_up_subagent' }, executionEvidence: { concreteNextAction: 'dispatch_follow_up_subagent' },
}); });
assert.equal(failResult.gateStatus, 'fail', 'fail-path should fail when progressionClaim lacks progressEvidence'); assert.equal(failResult.gateStatus, 'fail', 'fail-path should fail when explicit auto-chain action lacks dispatch evidence');
assert.match(JSON.stringify(failResult), /progressEvidence/, 'fail-path should require progressEvidence'); assert.match(JSON.stringify(failResult), /autoChainDispatchEvidence/, 'fail-path should require autoChainDispatchEvidence');
const neutralResult = evaluateGate({ const neutralResult = evaluateGate({
classification: 'long_task', classification: 'long_task',
claimedExecution: true, claimedExecution: true,
concreteNextAction: 'dispatch_follow_up_subagent', concreteNextAction: 'summarize findings for reply',
executionEvidence: { concreteNextAction: 'dispatch_follow_up_subagent' }, executionEvidence: { concreteNextAction: 'summarize findings for reply' },
}); });
assert.equal(neutralResult.gateStatus, 'pass', 'neutral-path should pass when there is no progression claim'); assert.equal(neutralResult.gateStatus, 'pass', 'neutral-path should pass when there is no explicit auto-chain next action');
assert.doesNotMatch(JSON.stringify(neutralResult), /progressEvidence/, 'neutral-path should not require progressEvidence'); assert.doesNotMatch(JSON.stringify(neutralResult), /autoChainDispatchEvidence/, 'neutral-path should not require auto-chain dispatch evidence');
const directAutoChainFailResult = evaluateGate({
classification: 'long_task',
claimedExecution: true,
concreteNextAction: 'dispatch_follow_up_subagent',
autoChainNextAction: 'dispatch_follow_up_subagent',
});
assert.equal(directAutoChainFailResult.gateStatus, 'fail', 'direct evaluator should fail when explicit auto-chain action has no dispatch evidence');
assert.match(JSON.stringify(directAutoChainFailResult), /explicit auto-chain next action requires dispatched-action evidence/, 'direct evaluator fail-path should mention missing dispatched-action evidence');
const passInjected = await withPatchedWrapper(buildWrapperScript({ const passInjected = await withPatchedWrapper(buildWrapperScript({
classification: 'long_task', classification: 'long_task',
@@ -154,8 +166,9 @@ async function main() {
requiredNextAction: 'dispatch_follow_up_subagent', requiredNextAction: 'dispatch_follow_up_subagent',
handoff: { mode: 'direct_reply' }, handoff: { mode: 'direct_reply' },
}), async () => runScenario(forceRecall, requestText)); }), async () => runScenario(forceRecall, requestText));
assert.match(failInjected, /gateStatus=fail/, 'hook fail-path should fail when wrapper claims progression without progressEvidence'); assert.match(failInjected, /gateStatus=fail/, 'hook fail-path should fail when wrapper exposes explicit auto-chain action without dispatch evidence');
assert.match(failInjected, /reason=claimed progression without concrete progress evidence is forbidden/, 'hook fail-path should mention missing progress evidence'); assert.match(failInjected, /reason=explicit auto-chain next action requires dispatched-action evidence/, 'hook fail-path should mention missing dispatched-action evidence');
assert.match(failInjected, /requiredEvidence=autoChainDispatchEvidence/, 'hook fail-path should require autoChainDispatchEvidence');
const neutralInjected = await withPatchedWrapper(buildWrapperScript({ const neutralInjected = await withPatchedWrapper(buildWrapperScript({
classification: 'long_task', classification: 'long_task',
@@ -164,11 +177,11 @@ async function main() {
needsSubagent: false, needsSubagent: false,
needsOwnerDecision: false, needsOwnerDecision: false,
silentLaunchOk: false, silentLaunchOk: false,
requiredNextAction: 'summarize_findings_for_reply', requiredNextAction: 'summarize findings for reply',
handoff: { mode: 'direct_reply' }, handoff: { mode: 'direct_reply' },
}), async () => runScenario(forceRecall, requestText)); }), async () => runScenario(forceRecall, requestText));
assert.match(neutralInjected, /gateStatus=pass/, 'hook neutral-path should pass when wrapper does not claim progression'); assert.match(neutralInjected, /gateStatus=pass/, 'hook neutral-path should pass when wrapper does not expose an explicit auto-chain action');
assert.doesNotMatch(neutralInjected, /reason=claimed progression without concrete progress evidence is forbidden/, 'hook neutral-path should not fail on missing progress evidence without a progression claim'); assert.doesNotMatch(neutralInjected, /reason=explicit auto-chain next action requires dispatched-action evidence/, 'hook neutral-path should not fail on auto-chain evidence when no explicit tool action exists');
const originalGateLock = await fs.readFile(gateLockPath, 'utf8'); const originalGateLock = await fs.readFile(gateLockPath, 'utf8');
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'force-recall-gate-lock-')); const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'force-recall-gate-lock-'));