fix: avoid false positives in progress-evidence gate

This commit is contained in:
Eve
2026-04-23 18:13:13 +08:00
parent 83077adcda
commit 242f7ce463
3 changed files with 149 additions and 24 deletions

113
scripts/test_force_recall_long_task_preflight.mjs Normal file → Executable file
View File

@@ -40,6 +40,20 @@ async function runScenario(forceRecall, requestText) {
return injected;
}
/**
 * Temporarily replaces the file at `wrapperPath` with `tempContent`, runs
 * `callback` while the replacement is in place, and then writes the original
 * content back — whether the callback (or the patch write itself) succeeded
 * or threw.
 *
 * @param {string} tempContent - Text written to `wrapperPath` for the duration of the callback.
 * @param {() => unknown} callback - Work to run while the patched file is in place; its result is awaited.
 * @returns {Promise<unknown>} The value `callback` resolves to.
 * @throws Rethrows any error from reading the original file, writing the patch,
 *   running `callback`, or restoring the original content.
 */
async function withPatchedWrapper(tempContent, callback) {
  const originalWrapper = await fs.readFile(wrapperPath, 'utf8');
  try {
    // The patch write lives inside the try block: a write that fails part-way
    // may already have truncated the file, so the restore below must still run.
    await fs.writeFile(wrapperPath, tempContent, 'utf8');
    return await callback();
  } finally {
    await fs.writeFile(wrapperPath, originalWrapper, 'utf8');
  }
}
/**
 * Produces the source text of a tiny Node script that prints a fixed payload
 * as a single line of compact JSON on stdout.
 *
 * @param {unknown} wrapperResult - Value embedded in the script as a JSON literal.
 * @returns {string} Script source: a shebang line, one `process.stdout.write`
 *   statement, and a trailing newline.
 */
function buildWrapperScript(wrapperResult) {
  const payloadLiteral = JSON.stringify(wrapperResult);
  const scriptLines = [
    '#!/usr/bin/env node',
    `process.stdout.write(JSON.stringify(${payloadLiteral}, null, 0) + "\\n");`,
    '', // yields the trailing newline once the lines are joined
  ];
  return scriptLines.join('\n');
}
async function main() {
await Promise.all([fs.access(wrapperPath), fs.access(gateLockPath)]);
const { default: forceRecall } = await importTsModule(handlerPath);
@@ -66,21 +80,95 @@ async function main() {
'reason=silent long-task cannot continue without externalized checkpoint path',
'reason=claimed execution requires evidence of a concrete next action',
'reason=owner decision flow must end in button-path, not plain text',
'reason=claimed progression without concrete progress evidence is forbidden',
'requiredEvidence=progressEvidence',
'requiredValue=sessionKey, runId, modified_files, verification result, or equivalent concrete progress evidence',
'ENFORCEMENT: Hook input should include progressEvidence (or equivalent concrete fields) whenever a progression claim is present.',
'HARD_GATE: Block any plain-text handoff or silent-continuation claim when externalized checkpoint evidence is missing.',
'HARD_GATE: Block any reply path that says you already moved into the next task or are advancing the next step without concrete progress evidence.',
'HARD_GATE: If a progression claim exists, the hook input must supply progressEvidence (or equivalent concrete fields) before the claim can pass gate.',
'HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to a sessionKey, runId, modified_files record, verification result, actual tool execution, file changes, emitted messages, or checkpoint records.',
'HARD_GATE: If owner decision is involved, do not replace button-path closure with plain-text handoff.',
'ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.',
'ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.',
];
const unexpectedSnippets = [
'reason=claimed progression without concrete progress evidence is forbidden',
'requiredEvidence=progressEvidence',
];
for (const snippet of expectedSnippets) {
assert.match(injected, new RegExp(escapeRegex(snippet)), `missing snippet: ${snippet}`);
}
for (const snippet of unexpectedSnippets) {
assert.doesNotMatch(injected, new RegExp(escapeRegex(snippet)), `unexpected snippet present: ${snippet}`);
}
const { evaluateGate } = await import(pathToFileURL(gateLockPath).href + `?t=${Date.now()}`);
assert.equal(typeof evaluateGate, 'function', 'long_task_gate_lock should export evaluateGate for direct tests');
const passResult = evaluateGate({
classification: 'long_task',
claimedExecution: true,
concreteNextAction: 'dispatch_follow_up_subagent',
progressionClaim: 'already progressing to the next step in background',
progressEvidence: { sessionKey: 'task-123' },
});
assert.equal(passResult.gateStatus, 'pass', 'pass-path should pass with concrete progressEvidence');
const failResult = evaluateGate({
classification: 'long_task',
claimedExecution: true,
concreteNextAction: 'dispatch_follow_up_subagent',
progressionClaim: 'already progressing to the next step in background',
executionEvidence: { concreteNextAction: 'dispatch_follow_up_subagent' },
});
assert.equal(failResult.gateStatus, 'fail', 'fail-path should fail when progressionClaim lacks progressEvidence');
assert.match(JSON.stringify(failResult), /progressEvidence/, 'fail-path should require progressEvidence');
const neutralResult = evaluateGate({
classification: 'long_task',
claimedExecution: true,
concreteNextAction: 'dispatch_follow_up_subagent',
executionEvidence: { concreteNextAction: 'dispatch_follow_up_subagent' },
});
assert.equal(neutralResult.gateStatus, 'pass', 'neutral-path should pass when there is no progression claim');
assert.doesNotMatch(JSON.stringify(neutralResult), /progressEvidence/, 'neutral-path should not require progressEvidence');
const passInjected = await withPatchedWrapper(buildWrapperScript({
classification: 'long_task',
silentCandidate: true,
needsCheckpoint: true,
needsSubagent: false,
needsOwnerDecision: false,
silentLaunchOk: true,
silentLaunchReason: 'checkpoint established',
requiredNextAction: 'dispatch_follow_up_subagent',
taskRecord: { task_name: 'task-123' },
handoff: { mode: 'direct_reply' },
}), async () => runScenario(forceRecall, requestText));
assert.match(passInjected, /gateStatus=pass/, 'hook pass-path should pass when wrapper provides concrete progressEvidence');
const failInjected = await withPatchedWrapper(buildWrapperScript({
classification: 'long_task',
silentCandidate: false,
needsCheckpoint: false,
needsSubagent: false,
needsOwnerDecision: false,
silentLaunchOk: false,
requiredNextAction: 'dispatch_follow_up_subagent',
handoff: { mode: 'direct_reply' },
}), async () => runScenario(forceRecall, requestText));
assert.match(failInjected, /gateStatus=fail/, 'hook fail-path should fail when wrapper claims progression without progressEvidence');
assert.match(failInjected, /reason=claimed progression without concrete progress evidence is forbidden/, 'hook fail-path should mention missing progress evidence');
const neutralInjected = await withPatchedWrapper(buildWrapperScript({
classification: 'long_task',
silentCandidate: false,
needsCheckpoint: false,
needsSubagent: false,
needsOwnerDecision: false,
silentLaunchOk: false,
requiredNextAction: 'summarize_findings_for_reply',
handoff: { mode: 'direct_reply' },
}), async () => runScenario(forceRecall, requestText));
assert.match(neutralInjected, /gateStatus=pass/, 'hook neutral-path should pass when wrapper does not claim progression');
assert.doesNotMatch(neutralInjected, /reason=claimed progression without concrete progress evidence is forbidden/, 'hook neutral-path should not fail on missing progress evidence without a progression claim');
const originalGateLock = await fs.readFile(gateLockPath, 'utf8');
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'force-recall-gate-lock-'));
@@ -109,14 +197,15 @@ async function main() {
assert.match(degradedInjected, new RegExp(escapeRegex(snippet)), `missing degraded snippet: ${snippet}`);
}
const summary = {
process.stdout.write(JSON.stringify({
ok: true,
checked: expectedSnippets,
degradedChecked: degradedExpectedSnippets,
gatePaths: {
pass: passResult.gateStatus,
fail: failResult.gateStatus,
neutral: neutralResult.gateStatus,
},
bodyPreview: injected.split('\n').slice(0, 35),
};
process.stdout.write(JSON.stringify(summary, null, 2) + '\n');
}, null, 2) + '\n');
}
main().catch((error) => {