fix: avoid false positives in progress-evidence gate
This commit is contained in:
@@ -98,6 +98,33 @@ function buildProgressEvidence(wrapperResult: any): Record<string, unknown> | nu
|
|||||||
return Object.keys(progressEvidence).length > 0 ? progressEvidence : null;
|
return Object.keys(progressEvidence).length > 0 ? progressEvidence : null;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
function shouldClaimProgression(wrapperResult: any, progressEvidence: Record<string, unknown> | null): boolean {
|
||||||
|
if (!wrapperResult || wrapperResult.classification !== "long_task") return false;
|
||||||
|
if (progressEvidence && Object.keys(progressEvidence).length > 0) return true;
|
||||||
|
|
||||||
|
const requiredNextAction = typeof wrapperResult.requiredNextAction === "string"
|
||||||
|
? wrapperResult.requiredNextAction.trim()
|
||||||
|
: "";
|
||||||
|
const progressingActionPrefixes = [
|
||||||
|
"dispatch_",
|
||||||
|
"handoff_",
|
||||||
|
"launch_",
|
||||||
|
"resume_",
|
||||||
|
"continue_",
|
||||||
|
"queue_",
|
||||||
|
"schedule_",
|
||||||
|
"run_",
|
||||||
|
"start_",
|
||||||
|
"spawn_",
|
||||||
|
];
|
||||||
|
|
||||||
|
if (requiredNextAction && progressingActionPrefixes.some((prefix) => requiredNextAction.startsWith(prefix))) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
return wrapperResult.silentLaunchOk === true;
|
||||||
|
}
|
||||||
|
|
||||||
function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
|
function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
|
||||||
if (!wrapperResult || wrapperResult.classification !== "long_task") {
|
if (!wrapperResult || wrapperResult.classification !== "long_task") {
|
||||||
return { classification: wrapperResult?.classification ?? "general_chat" };
|
return { classification: wrapperResult?.classification ?? "general_chat" };
|
||||||
@@ -124,16 +151,16 @@ function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
|
|||||||
concreteNextAction: requiredNextAction,
|
concreteNextAction: requiredNextAction,
|
||||||
}
|
}
|
||||||
: null;
|
: null;
|
||||||
const progressEvidenceReason = progressEvidence
|
const claimedProgression = shouldClaimProgression(wrapperResult, progressEvidence)
|
||||||
? ""
|
? "already progressing to the next step in background"
|
||||||
: "progression claim requires concrete evidence such as sessionKey, runId, modified_files, or verification result";
|
: "";
|
||||||
|
const progressEvidenceReason = claimedProgression && !progressEvidence
|
||||||
|
? "progression claim requires concrete evidence such as sessionKey, runId, modified_files, or verification result"
|
||||||
|
: "";
|
||||||
const hasExternalizedCheckpointEvidence = wrapperResult.silentLaunchOk === true
|
const hasExternalizedCheckpointEvidence = wrapperResult.silentLaunchOk === true
|
||||||
&& typeof wrapperResult.taskRecord?.task_name === "string"
|
&& typeof wrapperResult.taskRecord?.task_name === "string"
|
||||||
&& wrapperResult.taskRecord.task_name.trim().length > 0;
|
&& wrapperResult.taskRecord.task_name.trim().length > 0;
|
||||||
const hasButtonPathClosureEvidence = needsOwnerDecision && wrapperResult.silentLaunchOk === true;
|
const hasButtonPathClosureEvidence = needsOwnerDecision && wrapperResult.silentLaunchOk === true;
|
||||||
const claimedProgression = wrapperResult.classification === "long_task"
|
|
||||||
? "already progressing to the next step in background"
|
|
||||||
: "";
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
classification: wrapperResult.classification,
|
classification: wrapperResult.classification,
|
||||||
|
|||||||
@@ -190,13 +190,16 @@ function hasProgressEvidence(input) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
function claimsProgressionWithoutEvidence(input) {
|
function claimsProgression(input) {
|
||||||
const progressionClaim = EVIDENCE_FIELDS.progressionClaim
|
const progressionClaim = EVIDENCE_FIELDS.progressionClaim
|
||||||
.map((fieldPath) => getPathValue(input, fieldPath))
|
.map((fieldPath) => getPathValue(input, fieldPath))
|
||||||
.find((value) => hasNonEmptyString(value));
|
.find((value) => hasNonEmptyString(value));
|
||||||
|
|
||||||
if (!hasNonEmptyString(progressionClaim)) return false;
|
return hasNonEmptyString(progressionClaim);
|
||||||
|
}
|
||||||
|
|
||||||
|
function claimsProgressionWithoutEvidence(input) {
|
||||||
|
if (!claimsProgression(input)) return false;
|
||||||
return !hasProgressEvidence(input);
|
return !hasProgressEvidence(input);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -269,8 +272,14 @@ function main() {
|
|||||||
process.stdout.write(JSON.stringify(output, null, args.pretty ? 2 : 0) + '\n');
|
process.stdout.write(JSON.stringify(output, null, args.pretty ? 2 : 0) + '\n');
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
export { evaluateGate };
|
||||||
main();
|
|
||||||
} catch (error) {
|
const isDirectRun = process.argv[1] && fs.realpathSync(process.argv[1]) === fs.realpathSync(new URL(import.meta.url));
|
||||||
fail('CLI_ERROR', error && error.message ? error.message : 'unexpected error');
|
|
||||||
|
if (isDirectRun) {
|
||||||
|
try {
|
||||||
|
main();
|
||||||
|
} catch (error) {
|
||||||
|
fail('CLI_ERROR', error && error.message ? error.message : 'unexpected error');
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|||||||
113
scripts/test_force_recall_long_task_preflight.mjs
Normal file → Executable file
113
scripts/test_force_recall_long_task_preflight.mjs
Normal file → Executable file
@@ -40,6 +40,20 @@ async function runScenario(forceRecall, requestText) {
|
|||||||
return injected;
|
return injected;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
async function withPatchedWrapper(tempContent, callback) {
|
||||||
|
const originalWrapper = await fs.readFile(wrapperPath, 'utf8');
|
||||||
|
await fs.writeFile(wrapperPath, tempContent, 'utf8');
|
||||||
|
try {
|
||||||
|
return await callback();
|
||||||
|
} finally {
|
||||||
|
await fs.writeFile(wrapperPath, originalWrapper, 'utf8');
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
function buildWrapperScript(wrapperResult) {
|
||||||
|
return `#!/usr/bin/env node\nprocess.stdout.write(JSON.stringify(${JSON.stringify(wrapperResult)}, null, 0) + "\\n");\n`;
|
||||||
|
}
|
||||||
|
|
||||||
async function main() {
|
async function main() {
|
||||||
await Promise.all([fs.access(wrapperPath), fs.access(gateLockPath)]);
|
await Promise.all([fs.access(wrapperPath), fs.access(gateLockPath)]);
|
||||||
const { default: forceRecall } = await importTsModule(handlerPath);
|
const { default: forceRecall } = await importTsModule(handlerPath);
|
||||||
@@ -66,21 +80,95 @@ async function main() {
|
|||||||
'reason=silent long-task cannot continue without externalized checkpoint path',
|
'reason=silent long-task cannot continue without externalized checkpoint path',
|
||||||
'reason=claimed execution requires evidence of a concrete next action',
|
'reason=claimed execution requires evidence of a concrete next action',
|
||||||
'reason=owner decision flow must end in button-path, not plain text',
|
'reason=owner decision flow must end in button-path, not plain text',
|
||||||
'reason=claimed progression without concrete progress evidence is forbidden',
|
|
||||||
'requiredEvidence=progressEvidence',
|
|
||||||
'requiredValue=sessionKey, runId, modified_files, verification result, or equivalent concrete progress evidence',
|
|
||||||
'ENFORCEMENT: Hook input should include progressEvidence (or equivalent concrete fields) whenever a progression claim is present.',
|
'ENFORCEMENT: Hook input should include progressEvidence (or equivalent concrete fields) whenever a progression claim is present.',
|
||||||
'HARD_GATE: Block any plain-text handoff or silent-continuation claim when externalized checkpoint evidence is missing.',
|
'HARD_GATE: Block any plain-text handoff or silent-continuation claim when externalized checkpoint evidence is missing.',
|
||||||
'HARD_GATE: Block any reply path that says you already moved into the next task or are advancing the next step without concrete progress evidence.',
|
'HARD_GATE: If owner decision is involved, do not replace button-path closure with plain-text handoff.',
|
||||||
'HARD_GATE: If a progression claim exists, the hook input must supply progressEvidence (or equivalent concrete fields) before the claim can pass gate.',
|
|
||||||
'HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to a sessionKey, runId, modified_files record, verification result, actual tool execution, file changes, emitted messages, or checkpoint records.',
|
|
||||||
'ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.',
|
'ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.',
|
||||||
'ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.',
|
'ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.',
|
||||||
];
|
];
|
||||||
|
|
||||||
|
const unexpectedSnippets = [
|
||||||
|
'reason=claimed progression without concrete progress evidence is forbidden',
|
||||||
|
'requiredEvidence=progressEvidence',
|
||||||
|
];
|
||||||
|
|
||||||
for (const snippet of expectedSnippets) {
|
for (const snippet of expectedSnippets) {
|
||||||
assert.match(injected, new RegExp(escapeRegex(snippet)), `missing snippet: ${snippet}`);
|
assert.match(injected, new RegExp(escapeRegex(snippet)), `missing snippet: ${snippet}`);
|
||||||
}
|
}
|
||||||
|
for (const snippet of unexpectedSnippets) {
|
||||||
|
assert.doesNotMatch(injected, new RegExp(escapeRegex(snippet)), `unexpected snippet present: ${snippet}`);
|
||||||
|
}
|
||||||
|
|
||||||
|
const { evaluateGate } = await import(pathToFileURL(gateLockPath).href + `?t=${Date.now()}`);
|
||||||
|
assert.equal(typeof evaluateGate, 'function', 'long_task_gate_lock should export evaluateGate for direct tests');
|
||||||
|
|
||||||
|
const passResult = evaluateGate({
|
||||||
|
classification: 'long_task',
|
||||||
|
claimedExecution: true,
|
||||||
|
concreteNextAction: 'dispatch_follow_up_subagent',
|
||||||
|
progressionClaim: 'already progressing to the next step in background',
|
||||||
|
progressEvidence: { sessionKey: 'task-123' },
|
||||||
|
});
|
||||||
|
assert.equal(passResult.gateStatus, 'pass', 'pass-path should pass with concrete progressEvidence');
|
||||||
|
|
||||||
|
const failResult = evaluateGate({
|
||||||
|
classification: 'long_task',
|
||||||
|
claimedExecution: true,
|
||||||
|
concreteNextAction: 'dispatch_follow_up_subagent',
|
||||||
|
progressionClaim: 'already progressing to the next step in background',
|
||||||
|
executionEvidence: { concreteNextAction: 'dispatch_follow_up_subagent' },
|
||||||
|
});
|
||||||
|
assert.equal(failResult.gateStatus, 'fail', 'fail-path should fail when progressionClaim lacks progressEvidence');
|
||||||
|
assert.match(JSON.stringify(failResult), /progressEvidence/, 'fail-path should require progressEvidence');
|
||||||
|
|
||||||
|
const neutralResult = evaluateGate({
|
||||||
|
classification: 'long_task',
|
||||||
|
claimedExecution: true,
|
||||||
|
concreteNextAction: 'dispatch_follow_up_subagent',
|
||||||
|
executionEvidence: { concreteNextAction: 'dispatch_follow_up_subagent' },
|
||||||
|
});
|
||||||
|
assert.equal(neutralResult.gateStatus, 'pass', 'neutral-path should pass when there is no progression claim');
|
||||||
|
assert.doesNotMatch(JSON.stringify(neutralResult), /progressEvidence/, 'neutral-path should not require progressEvidence');
|
||||||
|
|
||||||
|
const passInjected = await withPatchedWrapper(buildWrapperScript({
|
||||||
|
classification: 'long_task',
|
||||||
|
silentCandidate: true,
|
||||||
|
needsCheckpoint: true,
|
||||||
|
needsSubagent: false,
|
||||||
|
needsOwnerDecision: false,
|
||||||
|
silentLaunchOk: true,
|
||||||
|
silentLaunchReason: 'checkpoint established',
|
||||||
|
requiredNextAction: 'dispatch_follow_up_subagent',
|
||||||
|
taskRecord: { task_name: 'task-123' },
|
||||||
|
handoff: { mode: 'direct_reply' },
|
||||||
|
}), async () => runScenario(forceRecall, requestText));
|
||||||
|
assert.match(passInjected, /gateStatus=pass/, 'hook pass-path should pass when wrapper provides concrete progressEvidence');
|
||||||
|
|
||||||
|
const failInjected = await withPatchedWrapper(buildWrapperScript({
|
||||||
|
classification: 'long_task',
|
||||||
|
silentCandidate: false,
|
||||||
|
needsCheckpoint: false,
|
||||||
|
needsSubagent: false,
|
||||||
|
needsOwnerDecision: false,
|
||||||
|
silentLaunchOk: false,
|
||||||
|
requiredNextAction: 'dispatch_follow_up_subagent',
|
||||||
|
handoff: { mode: 'direct_reply' },
|
||||||
|
}), async () => runScenario(forceRecall, requestText));
|
||||||
|
assert.match(failInjected, /gateStatus=fail/, 'hook fail-path should fail when wrapper claims progression without progressEvidence');
|
||||||
|
assert.match(failInjected, /reason=claimed progression without concrete progress evidence is forbidden/, 'hook fail-path should mention missing progress evidence');
|
||||||
|
|
||||||
|
const neutralInjected = await withPatchedWrapper(buildWrapperScript({
|
||||||
|
classification: 'long_task',
|
||||||
|
silentCandidate: false,
|
||||||
|
needsCheckpoint: false,
|
||||||
|
needsSubagent: false,
|
||||||
|
needsOwnerDecision: false,
|
||||||
|
silentLaunchOk: false,
|
||||||
|
requiredNextAction: 'summarize_findings_for_reply',
|
||||||
|
handoff: { mode: 'direct_reply' },
|
||||||
|
}), async () => runScenario(forceRecall, requestText));
|
||||||
|
assert.match(neutralInjected, /gateStatus=pass/, 'hook neutral-path should pass when wrapper does not claim progression');
|
||||||
|
assert.doesNotMatch(neutralInjected, /reason=claimed progression without concrete progress evidence is forbidden/, 'hook neutral-path should not fail on missing progress evidence without a progression claim');
|
||||||
|
|
||||||
const originalGateLock = await fs.readFile(gateLockPath, 'utf8');
|
const originalGateLock = await fs.readFile(gateLockPath, 'utf8');
|
||||||
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'force-recall-gate-lock-'));
|
const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'force-recall-gate-lock-'));
|
||||||
@@ -109,14 +197,15 @@ async function main() {
|
|||||||
assert.match(degradedInjected, new RegExp(escapeRegex(snippet)), `missing degraded snippet: ${snippet}`);
|
assert.match(degradedInjected, new RegExp(escapeRegex(snippet)), `missing degraded snippet: ${snippet}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
const summary = {
|
process.stdout.write(JSON.stringify({
|
||||||
ok: true,
|
ok: true,
|
||||||
checked: expectedSnippets,
|
gatePaths: {
|
||||||
degradedChecked: degradedExpectedSnippets,
|
pass: passResult.gateStatus,
|
||||||
|
fail: failResult.gateStatus,
|
||||||
|
neutral: neutralResult.gateStatus,
|
||||||
|
},
|
||||||
bodyPreview: injected.split('\n').slice(0, 35),
|
bodyPreview: injected.split('\n').slice(0, 35),
|
||||||
};
|
}, null, 2) + '\n');
|
||||||
|
|
||||||
process.stdout.write(JSON.stringify(summary, null, 2) + '\n');
|
|
||||||
}
|
}
|
||||||
|
|
||||||
main().catch((error) => {
|
main().catch((error) => {
|
||||||
|
|||||||
Reference in New Issue
Block a user