Files
reporting-governance-plugin/scripts/test_force_recall_long_task_preflight.mjs

297 lines
14 KiB
JavaScript
Executable File

#!/usr/bin/env node
import assert from 'node:assert/strict';
import fs from 'node:fs/promises';
import { stripTypeScriptTypes } from 'node:module';
import os from 'node:os';
import path from 'node:path';
import { fileURLToPath, pathToFileURL } from 'node:url';
// Directory containing this script. fileURLToPath is used instead of reading
// URL.pathname directly: pathname stays percent-encoded (a space in the
// checkout path becomes "%20") and keeps a leading slash before the drive
// letter on Windows, so it is not a reliable filesystem path.
const __dirname = path.dirname(fileURLToPath(import.meta.url));
// Parent of the script directory; every path below is resolved against it.
const repoRoot = path.resolve(__dirname, '..');
// TypeScript source of the force-recall hook handler under test.
const handlerPath = path.join(repoRoot, 'hooks', 'force-recall', 'handler.ts');
// Scripts that the test checks for, temporarily overwrites, or imports directly.
const wrapperPath = path.join(repoRoot, 'scripts', 'long_task_governor_wrapper.mjs');
const gateLockPath = path.join(repoRoot, 'scripts', 'long_task_gate_lock.mjs');
const plannerPath = path.join(repoRoot, 'scripts', 'plan_long_task_auto_chain.mjs');
async function importTsModule(tsPath) {
const source = await fs.readFile(tsPath, 'utf8');
const jsSource = stripTypeScriptTypes(source, { mode: 'strip' });
const dataUrl = `data:text/javascript;charset=utf-8,${encodeURIComponent(jsSource)}\n//# sourceURL=${encodeURIComponent(pathToFileURL(tsPath).href)}`;
return import(dataUrl);
}
/**
 * Escape every regular-expression metacharacter in a string so it can be
 * passed to `new RegExp(...)` and match itself literally.
 *
 * @param {string} snippet - Literal text to escape.
 * @returns {string} The text with each metacharacter prefixed by a backslash.
 */
function escapeRegex(snippet) {
  const regexMetaCharacters = /[.*+?^${}()|[\]\\]/g;
  return snippet.replace(regexMetaCharacters, (metaCharacter) => `\\${metaCharacter}`);
}
/**
 * Run the hook handler against a synthetic "message / preprocessed" event and
 * return the agent-facing body left on the event afterwards.
 *
 * The event context starts with `body` and `bodyForAgent` both set to
 * `requestText` and `workspaceDir` set to `repoRoot`.
 *
 * @param {(event: object) => unknown} forceRecall - Handler under test; it is
 *   awaited and is expected to leave a string in `event.context.bodyForAgent`.
 * @param {string} requestText - Message text placed on the event.
 * @returns {Promise<string>} `event.context.bodyForAgent` after the handler ran.
 * @throws {assert.AssertionError} If that value is not a string.
 */
async function runScenario(forceRecall, requestText) {
  const context = {
    workspaceDir: repoRoot,
    body: requestText,
    bodyForAgent: requestText,
  };
  const event = { type: 'message', action: 'preprocessed', context };

  await forceRecall(event);

  // Read through the event rather than the local `context`: the handler may
  // have replaced (or removed) the context object.
  const rewrittenBody = event.context?.bodyForAgent;
  assert.equal(
    typeof rewrittenBody,
    'string',
    'event.context.bodyForAgent should be a string after handler runs',
  );
  return rewrittenBody;
}
/**
 * Temporarily overwrite the file at `wrapperPath` while a callback runs.
 *
 * The current file content is read first and written back afterwards, whether
 * the callback resolves or rejects.
 *
 * @param {string} tempContent - Content written to the wrapper file for the
 *   duration of the callback.
 * @param {() => unknown} callback - Work to run while the file is patched.
 * @returns {Promise<unknown>} Whatever the callback resolves to.
 */
async function withPatchedWrapper(tempContent, callback) {
  const pristineWrapper = await fs.readFile(wrapperPath, 'utf8');
  const restoreWrapper = () => fs.writeFile(wrapperPath, pristineWrapper, 'utf8');

  await fs.writeFile(wrapperPath, tempContent, 'utf8');
  try {
    const outcome = await callback();
    return outcome;
  } finally {
    await restoreWrapper();
  }
}
/**
 * Build the source of a stand-in Node script that prints a fixed result.
 *
 * The generated script has a shebang line and one statement that writes the
 * compact JSON form of `wrapperResult`, followed by a newline, to stdout.
 *
 * @param {unknown} wrapperResult - Value embedded in the script as a JSON literal.
 * @returns {string} Source text of the generated script.
 */
function buildWrapperScript(wrapperResult) {
  const payloadLiteral = JSON.stringify(wrapperResult);
  const shebangLine = '#!/usr/bin/env node';
  const emitLine = `process.stdout.write(JSON.stringify(${payloadLiteral}, null, 0) + "\\n");`;
  return `${shebangLine}\n${emitLine}\n`;
}
/**
 * End-to-end check of the force-recall hook's long-task preflight output.
 *
 * Phases, in order:
 *  1. Verify the wrapper, gate-lock and planner scripts exist and load the
 *     handler's default export.
 *  2. Run the handler with the scripts as they are on disk and check which
 *     snippets are (and are not) present in the injected body.
 *  3. Import `evaluateGate` from the gate-lock script and exercise it directly
 *     with pass, fail and neutral inputs.
 *  4. Re-run the handler three times with the wrapper script replaced by a
 *     stub that prints a fixed JSON result (pass / fail / neutral).
 *  5. Re-run the handler with the gate-lock script replaced by a stub that
 *     exits with status 1 and check the "degraded" output.
 *  6. Print a JSON summary to stdout.
 *
 * Phases 4 and 5 temporarily overwrite files inside the repository and
 * restore them afterwards.
 *
 * @returns {Promise<void>} Resolves when every assertion has passed.
 * @throws {assert.AssertionError} On the first failed expectation.
 */
async function main() {
  // Phase 1: fail fast if a required script is missing, then load the handler.
  await Promise.all([fs.access(wrapperPath), fs.access(gateLockPath), fs.access(plannerPath)]);
  const { default: forceRecall } = await importTsModule(handlerPath);
  assert.equal(typeof forceRecall, 'function', 'force-recall handler should export default function');

  // The same request text is reused by every handler scenario below.
  const requestText = [
    'Please inspect the workspace files and verify the hook injection path.',
    'I need you to review the behavior, choose the final accept/reject decision,',
    'and continue in background with a follow-up later.',
  ].join(' ');

  // Phase 2: baseline run against the unmodified scripts.
  const injected = await runScenario(forceRecall, requestText);
  const expectedSnippets = [
    '[LONG_TASK_GOVERNOR_PREFLIGHT]',
    'classification=long_task',
    'silentLaunchOk=false',
    'handoff.mode=button_path',
    '[LONG_TASK_GATE_LOCK]',
    'gateStatus=fail',
    '[LONG_TASK_AUTO_CHAIN_PLAN]',
    'plannerStatus=blocked_by_gate',
    'derivedAction=none',
    'dispatchMode=no_dispatch',
    'autoChainAllowed=false',
    'reason=gateStatus must pass before auto-chain planning can proceed',
    'requiredEvidence=gateStatus=pass',
    'requiredEvidence=externalizedCheckpoint',
    'requiredEvidence=concreteNextAction',
    'requiredEvidence=buttonPathMode',
    'reason=silent long-task cannot continue without externalized checkpoint path',
    'reason=claimed execution requires evidence of a concrete next action',
    'reason=owner decision flow must end in button-path, not plain text',
    'ENFORCEMENT: Hook input should include progressEvidence (or equivalent concrete fields) whenever a progression claim is present.',
    'HARD_GATE: Block any plain-text handoff or silent-continuation claim when externalized checkpoint evidence is missing.',
    'HARD_GATE: If owner decision is involved, do not replace button-path closure with plain-text handoff.',
    'ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.',
    'ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.',
  ];
  const unexpectedSnippets = [
    'reason=claimed progression without concrete progress evidence is forbidden',
    'requiredEvidence=progressEvidence',
  ];
  // Snippets are matched literally, so regex metacharacters are escaped first.
  for (const snippet of expectedSnippets) {
    assert.match(injected, new RegExp(escapeRegex(snippet)), `missing snippet: ${snippet}`);
  }
  for (const snippet of unexpectedSnippets) {
    assert.doesNotMatch(injected, new RegExp(escapeRegex(snippet)), `unexpected snippet present: ${snippet}`);
  }

  // Phase 3: direct evaluator tests. The timestamp query string makes the
  // specifier unique, so this import is not served from an earlier cached
  // module instance.
  const { evaluateGate } = await import(`${pathToFileURL(gateLockPath).href}?t=${Date.now()}`);
  assert.equal(typeof evaluateGate, 'function', 'long_task_gate_lock should export evaluateGate for direct tests');

  // Auto-chain action with matching dispatch evidence and progress evidence.
  const passResult = evaluateGate({
    classification: 'long_task',
    claimedExecution: true,
    concreteNextAction: 'dispatch_follow_up_subagent',
    autoChainNextAction: 'dispatch_follow_up_subagent',
    autoChainDispatchEvidence: {
      action: 'dispatch_follow_up_subagent',
      dispatched: true,
      event: 'dispatch',
    },
    progressionClaim: 'already progressing to the next step in background',
    progressEvidence: { sessionKey: 'task-123' },
  });
  assert.equal(passResult.gateStatus, 'pass', 'pass-path should pass with concrete progressEvidence');

  // Auto-chain action with execution evidence but no dispatch evidence.
  const failResult = evaluateGate({
    classification: 'long_task',
    claimedExecution: true,
    concreteNextAction: 'dispatch_follow_up_subagent',
    autoChainNextAction: 'dispatch_follow_up_subagent',
    progressionClaim: 'already progressing to the next step in background',
    executionEvidence: { concreteNextAction: 'dispatch_follow_up_subagent' },
  });
  assert.equal(failResult.gateStatus, 'fail', 'fail-path should fail when explicit auto-chain action lacks dispatch evidence');
  assert.match(JSON.stringify(failResult), /autoChainDispatchEvidence/, 'fail-path should require autoChainDispatchEvidence');

  // No auto-chain action at all.
  const neutralResult = evaluateGate({
    classification: 'long_task',
    claimedExecution: true,
    concreteNextAction: 'summarize findings for reply',
    executionEvidence: { concreteNextAction: 'summarize findings for reply' },
  });
  assert.equal(neutralResult.gateStatus, 'pass', 'neutral-path should pass when there is no explicit auto-chain next action');
  assert.doesNotMatch(JSON.stringify(neutralResult), /autoChainDispatchEvidence/, 'neutral-path should not require auto-chain dispatch evidence');

  // Auto-chain action with no evidence of any kind.
  const directAutoChainFailResult = evaluateGate({
    classification: 'long_task',
    claimedExecution: true,
    concreteNextAction: 'dispatch_follow_up_subagent',
    autoChainNextAction: 'dispatch_follow_up_subagent',
  });
  assert.equal(directAutoChainFailResult.gateStatus, 'fail', 'direct evaluator should fail when explicit auto-chain action has no dispatch evidence');
  assert.match(JSON.stringify(directAutoChainFailResult), /explicit auto-chain next action requires dispatched-action evidence/, 'direct evaluator fail-path should mention missing dispatched-action evidence');

  // Dispatch evidence that names a different action than the one claimed.
  const mismatchedDispatchEvidenceResult = evaluateGate({
    classification: 'long_task',
    claimedExecution: true,
    concreteNextAction: 'dispatch_follow_up_subagent',
    autoChainNextAction: 'dispatch_follow_up_subagent',
    autoChainDispatchEvidence: {
      action: 'dispatch_other_subagent',
      dispatched: true,
      event: 'dispatch',
    },
  });
  assert.equal(mismatchedDispatchEvidenceResult.gateStatus, 'fail', 'mismatched dispatch evidence should fail');
  assert.match(JSON.stringify(mismatchedDispatchEvidenceResult), /autoChainDispatchEvidence/, 'mismatched dispatch evidence should still require matching autoChainDispatchEvidence');

  // "Dispatch evidence" that only carries session/checkpoint metadata.
  const fakeCheckpointDispatchEvidenceResult = evaluateGate({
    classification: 'long_task',
    claimedExecution: true,
    concreteNextAction: 'dispatch_follow_up_subagent',
    autoChainNextAction: 'dispatch_follow_up_subagent',
    autoChainDispatchEvidence: {
      sessionKey: 'task-123',
      checkpointPath: 'checkpoints/task-123.json',
    },
  });
  assert.equal(fakeCheckpointDispatchEvidenceResult.gateStatus, 'fail', 'checkpoint/session-only dispatch evidence should fail');
  assert.match(JSON.stringify(fakeCheckpointDispatchEvidenceResult), /explicit auto-chain next action requires dispatched-action evidence/, 'checkpoint/session-only dispatch evidence should be rejected as fake dispatch evidence');

  // A snake_case auto-chain value that is not a dispatch action.
  const neutralSnakeCaseResult = evaluateGate({
    classification: 'long_task',
    claimedExecution: true,
    concreteNextAction: 'summarize findings for reply',
    autoChainNextAction: 'checkpoint_session_metadata_only',
    executionEvidence: { concreteNextAction: 'summarize findings for reply' },
  });
  assert.equal(neutralSnakeCaseResult.gateStatus, 'pass', 'neutral snake_case non-dispatch action should not trigger dispatch-evidence requirement');
  assert.doesNotMatch(JSON.stringify(neutralSnakeCaseResult), /autoChainDispatchEvidence/, 'neutral snake_case non-dispatch action should not mention dispatch-evidence requirement');

  // Phase 4: handler runs with the wrapper script swapped for a stub that
  // prints the given object as JSON.
  const passInjected = await withPatchedWrapper(buildWrapperScript({
    classification: 'long_task',
    silentCandidate: true,
    needsCheckpoint: true,
    needsSubagent: false,
    needsOwnerDecision: false,
    silentLaunchOk: true,
    silentLaunchReason: 'checkpoint established',
    requiredNextAction: 'dispatch_follow_up_subagent',
    autoChainDispatchEvidence: {
      action: 'dispatch_follow_up_subagent',
      dispatched: true,
      event: 'dispatch',
    },
    taskRecord: { task_name: 'task-123' },
    handoff: { mode: 'direct_reply' },
  }), async () => runScenario(forceRecall, requestText));
  assert.match(passInjected, /gateStatus=pass/, 'hook pass-path should pass when wrapper provides concrete progressEvidence');
  assert.match(passInjected, /\[LONG_TASK_AUTO_CHAIN_PLAN\]/, 'hook pass-path should emit auto-chain plan block');
  assert.match(passInjected, /plannerStatus=pass/, 'hook pass-path should expose planner pass result');
  assert.match(passInjected, /derivedAction=dispatch_spec_review/, 'hook pass-path should derive dry-run spec review dispatch');
  assert.match(passInjected, /dispatchMode=dry_run_dispatch/, 'hook pass-path should stay in dry-run dispatch mode');
  assert.match(passInjected, /autoChainAllowed=true/, 'hook pass-path should allow auto-chain in dry-run planner output');

  const failInjected = await withPatchedWrapper(buildWrapperScript({
    classification: 'long_task',
    silentCandidate: false,
    needsCheckpoint: false,
    needsSubagent: false,
    needsOwnerDecision: false,
    silentLaunchOk: false,
    requiredNextAction: 'dispatch_follow_up_subagent',
    handoff: { mode: 'direct_reply' },
  }), async () => runScenario(forceRecall, requestText));
  assert.match(failInjected, /gateStatus=fail/, 'hook fail-path should fail when wrapper exposes explicit auto-chain action without dispatch evidence');
  assert.match(failInjected, /\[LONG_TASK_AUTO_CHAIN_PLAN\]/, 'hook fail-path should emit auto-chain plan block');
  assert.match(failInjected, /plannerStatus=blocked_by_gate/, 'hook fail-path should report planner blocked by gate');
  assert.match(failInjected, /derivedAction=none/, 'hook fail-path should not derive a dry-run action');
  assert.match(failInjected, /dispatchMode=no_dispatch/, 'hook fail-path should remain no-dispatch');
  assert.match(failInjected, /autoChainAllowed=false/, 'hook fail-path should not allow auto-chain');
  assert.match(failInjected, /reason=explicit auto-chain next action requires dispatched-action evidence/, 'hook fail-path should mention missing dispatched-action evidence');
  assert.match(failInjected, /requiredEvidence=autoChainDispatchEvidence/, 'hook fail-path should require autoChainDispatchEvidence');

  const neutralInjected = await withPatchedWrapper(buildWrapperScript({
    classification: 'long_task',
    silentCandidate: false,
    needsCheckpoint: false,
    needsSubagent: false,
    needsOwnerDecision: false,
    silentLaunchOk: false,
    requiredNextAction: 'summarize findings for reply',
    handoff: { mode: 'direct_reply' },
  }), async () => runScenario(forceRecall, requestText));
  assert.match(neutralInjected, /gateStatus=pass/, 'hook neutral-path should pass when wrapper does not expose an explicit auto-chain action');
  assert.match(neutralInjected, /\[LONG_TASK_AUTO_CHAIN_PLAN\]/, 'hook neutral-path should emit auto-chain plan block');
  assert.match(neutralInjected, /plannerStatus=none/, 'hook neutral-path should report no derived auto-chain action');
  assert.match(neutralInjected, /derivedAction=none/, 'hook neutral-path should keep derivedAction as none');
  assert.match(neutralInjected, /dispatchMode=no_dispatch/, 'hook neutral-path should remain no-dispatch');
  assert.match(neutralInjected, /autoChainAllowed=false/, 'hook neutral-path should keep auto-chain disabled');
  assert.doesNotMatch(neutralInjected, /reason=explicit auto-chain next action requires dispatched-action evidence/, 'hook neutral-path should not fail on auto-chain evidence when no explicit tool action exists');

  // Phase 5: degraded run with the gate-lock script replaced by a stub that
  // exits with status 1. Both the backup write and the stub write happen
  // inside the try block so that any failure still reaches the finally
  // clause, which puts the original script back and removes the temp dir.
  const originalGateLock = await fs.readFile(gateLockPath, 'utf8');
  const tempDir = await fs.mkdtemp(path.join(os.tmpdir(), 'force-recall-gate-lock-'));
  let degradedInjected;
  try {
    // On-disk copy of the original, kept for manual recovery if the process
    // is killed while the stub is in place.
    const backupPath = path.join(tempDir, path.basename(gateLockPath));
    await fs.writeFile(backupPath, originalGateLock, 'utf8');
    await fs.writeFile(gateLockPath, '#!/usr/bin/env node\nprocess.exit(1);\n', 'utf8');
    degradedInjected = await runScenario(forceRecall, requestText);
  } finally {
    // Restore from the in-memory copy so restoration does not depend on the
    // backup file being readable. The temp dir is removed only after the
    // restore succeeded, so a failed restore leaves the backup behind.
    await fs.writeFile(gateLockPath, originalGateLock, 'utf8');
    await fs.rm(tempDir, { recursive: true, force: true });
  }
  const degradedExpectedSnippets = [
    '[LONG_TASK_GATE_LOCK]',
    'gateStatus=degraded',
    'gateRequired=unknown',
    'HARD_GATE: Evaluator unavailable is not permission to claim silent continuation or next-task progression without verifiable progress evidence.',
    'HARD_GATE: Fall back to a non-silent, evidence-preserving follow-up if you cannot prove checkpoint state or concrete execution.',
  ];
  for (const snippet of degradedExpectedSnippets) {
    assert.match(degradedInjected, new RegExp(escapeRegex(snippet)), `missing degraded snippet: ${snippet}`);
  }

  // Phase 6: machine-readable summary, including the first 35 lines of the
  // baseline injected body.
  process.stdout.write(JSON.stringify({
    ok: true,
    gatePaths: {
      pass: passResult.gateStatus,
      fail: failResult.gateStatus,
      neutral: neutralResult.gateStatus,
    },
    bodyPreview: injected.split('\n').slice(0, 35),
  }, null, 2) + '\n');
}
/**
 * Rejection handler for the script entry point: print the error and mark the
 * process as failed. Setting `process.exitCode` (rather than calling
 * `process.exit`) lets pending output flush before the process ends.
 *
 * @param {unknown} error - Rejection reason from `main()`.
 */
function reportFatalError(error) {
  console.error(error);
  process.exitCode = 1;
}

main().catch(reportFatalError);