feat: require concrete evidence for progress claims
This commit is contained in:
@@ -85,6 +85,19 @@ async function runLongTaskWrapper(workspaceDir: string, ctx: any): Promise<any |
|
|||||||
return runJsonScript(wrapperPath, workspaceDir, input, LONG_TASK_WRAPPER_TIMEOUT_MS);
|
return runJsonScript(wrapperPath, workspaceDir, input, LONG_TASK_WRAPPER_TIMEOUT_MS);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Derives the progress-evidence record for a long-task wrapper result.
 *
 * The only evidence produced here is `sessionKey`, which is set to the
 * trimmed `taskRecord.task_name` when the wrapper reports
 * `silentLaunchOk === true` and the trimmed name is non-empty.
 *
 * @param wrapperResult - Wrapper output; may be null/undefined or lack any
 *   of the fields read here.
 * @returns `{ sessionKey }` when the conditions above hold, otherwise `null`.
 */
function buildProgressEvidence(wrapperResult: any): Record<string, unknown> | null {
  // Anything other than a string task name is treated as "no name".
  const rawTaskName: unknown = wrapperResult?.taskRecord?.task_name;
  const trimmedTaskName = typeof rawTaskName === "string" ? rawTaskName.trim() : "";

  // Strict comparison: truthy non-boolean values do not count as a launch.
  const launchedSilently = wrapperResult?.silentLaunchOk === true;
  if (!launchedSilently || trimmedTaskName.length === 0) {
    return null;
  }

  return { sessionKey: trimmedTaskName };
}
|
||||||
|
|
||||||
function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
|
function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
|
||||||
if (!wrapperResult || wrapperResult.classification !== "long_task") {
|
if (!wrapperResult || wrapperResult.classification !== "long_task") {
|
||||||
return { classification: wrapperResult?.classification ?? "general_chat" };
|
return { classification: wrapperResult?.classification ?? "general_chat" };
|
||||||
@@ -92,6 +105,7 @@ function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
|
|||||||
|
|
||||||
const needsOwnerDecision = wrapperResult.needsOwnerDecision === true;
|
const needsOwnerDecision = wrapperResult.needsOwnerDecision === true;
|
||||||
const silentCandidate = wrapperResult.silentCandidate === true;
|
const silentCandidate = wrapperResult.silentCandidate === true;
|
||||||
|
const progressEvidence = buildProgressEvidence(wrapperResult);
|
||||||
const requiredNextAction = typeof wrapperResult.requiredNextAction === "string"
|
const requiredNextAction = typeof wrapperResult.requiredNextAction === "string"
|
||||||
? wrapperResult.requiredNextAction.trim()
|
? wrapperResult.requiredNextAction.trim()
|
||||||
: "";
|
: "";
|
||||||
@@ -130,6 +144,11 @@ function buildGateLockInput(wrapperResult: any): Record<string, unknown> {
|
|||||||
claimedProgression: claimedProgression,
|
claimedProgression: claimedProgression,
|
||||||
statusSummary: claimedProgression,
|
statusSummary: claimedProgression,
|
||||||
executionEvidence,
|
executionEvidence,
|
||||||
|
progressEvidence,
|
||||||
|
sessionKey: typeof progressEvidence?.sessionKey === "string" ? progressEvidence.sessionKey : "",
|
||||||
|
runId: typeof progressEvidence?.runId === "string" ? progressEvidence.runId : "",
|
||||||
|
modified_files: Array.isArray(progressEvidence?.modified_files) ? progressEvidence.modified_files : [],
|
||||||
|
verificationResult: typeof progressEvidence?.verificationResult === "string" ? progressEvidence.verificationResult : "",
|
||||||
toolCallEvidence: "",
|
toolCallEvidence: "",
|
||||||
dispatchEvidence: "",
|
dispatchEvidence: "",
|
||||||
fileChangeEvidence: "",
|
fileChangeEvidence: "",
|
||||||
@@ -201,8 +220,8 @@ function buildGateLockBlock(gateLockResult: GateLockResult | null): string {
|
|||||||
"gateStatus=degraded",
|
"gateStatus=degraded",
|
||||||
"gateRequired=unknown",
|
"gateRequired=unknown",
|
||||||
"- ENFORCEMENT: Gate-lock evaluator unavailable; keep existing long-task safeguards in force.",
|
"- ENFORCEMENT: Gate-lock evaluator unavailable; keep existing long-task safeguards in force.",
|
||||||
"- ENFORCEMENT: Do not claim you have progressed into the next task or are already pushing the next step unless you have concrete evidence such as actual dispatch, tool calls, file changes, or a persisted checkpoint artifact.",
|
"- ENFORCEMENT: Do not claim you have progressed into the next task or are already pushing the next step unless you have concrete progress evidence such as a sessionKey, runId, modified_files record, verification result, actual dispatch, tool calls, file changes, or a persisted checkpoint artifact.",
|
||||||
"- HARD_GATE: Evaluator unavailable is not permission to claim silent continuation or next-task progression without verifiable evidence.",
|
"- HARD_GATE: Evaluator unavailable is not permission to claim silent continuation or next-task progression without verifiable progress evidence.",
|
||||||
"- HARD_GATE: Fall back to a non-silent, evidence-preserving follow-up if you cannot prove checkpoint state or concrete execution.",
|
"- HARD_GATE: Fall back to a non-silent, evidence-preserving follow-up if you cannot prove checkpoint state or concrete execution.",
|
||||||
"[/LONG_TASK_GATE_LOCK]",
|
"[/LONG_TASK_GATE_LOCK]",
|
||||||
"",
|
"",
|
||||||
@@ -219,15 +238,15 @@ function buildGateLockBlock(gateLockResult: GateLockResult | null): string {
|
|||||||
return `requiredEvidence=${requirement.evidenceKey ?? "unknown"};fields=${fields};requiredValue=${requirement.requiredValue ?? "unknown"}`;
|
return `requiredEvidence=${requirement.evidenceKey ?? "unknown"};fields=${fields};requiredValue=${requirement.requiredValue ?? "unknown"}`;
|
||||||
})),
|
})),
|
||||||
...((gateLockResult.allowedResponseModes ?? []).map((mode) => `allowedResponseMode=${mode}`)),
|
...((gateLockResult.allowedResponseModes ?? []).map((mode) => `allowedResponseMode=${mode}`)),
|
||||||
"- ENFORCEMENT: Do not claim you have progressed into the next task or are already pushing the next step unless you have concrete evidence such as actual dispatch, tool calls, file changes, or a persisted checkpoint artifact.",
|
"- ENFORCEMENT: Do not claim you have progressed into the next task or are already pushing the next step unless you have concrete progress evidence such as a sessionKey, runId, modified_files record, verification result, actual dispatch, tool calls, file changes, or a persisted checkpoint artifact.",
|
||||||
"- ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.",
|
"- ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.",
|
||||||
"- ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.",
|
"- ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.",
|
||||||
];
|
];
|
||||||
|
|
||||||
if (gateLockResult.gateStatus === "fail") {
|
if (gateLockResult.gateStatus === "fail") {
|
||||||
lines.push("- HARD_GATE: Block any plain-text handoff or silent-continuation claim when externalized checkpoint evidence is missing.");
|
lines.push("- HARD_GATE: Block any plain-text handoff or silent-continuation claim when externalized checkpoint evidence is missing.");
|
||||||
lines.push("- HARD_GATE: Block any reply path that says you already moved into the next task or are advancing the next step without concrete execution evidence.");
|
lines.push("- HARD_GATE: Block any reply path that says you already moved into the next task or are advancing the next step without concrete progress evidence.");
|
||||||
lines.push("- HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to actual tool execution, file changes, emitted messages, or checkpoint records.");
|
lines.push("- HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to a sessionKey, runId, modified_files record, verification result, actual tool execution, file changes, emitted messages, or checkpoint records.");
|
||||||
lines.push("- HARD_GATE: If required evidence is missing, ask for/produce the checkpoint or downgrade to a non-silent, evidence-preserving follow-up.");
|
lines.push("- HARD_GATE: If required evidence is missing, ask for/produce the checkpoint or downgrade to a non-silent, evidence-preserving follow-up.");
|
||||||
lines.push("- HARD_GATE: If owner decision is involved, do not replace button-path closure with plain-text handoff.");
|
lines.push("- HARD_GATE: If owner decision is involved, do not replace button-path closure with plain-text handoff.");
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -30,6 +30,17 @@ const EVIDENCE_FIELDS = Object.freeze({
|
|||||||
'verificationEvidence',
|
'verificationEvidence',
|
||||||
'checkpointArtifactEvidence',
|
'checkpointArtifactEvidence',
|
||||||
]),
|
]),
|
||||||
|
progressEvidence: Object.freeze([
|
||||||
|
'progressEvidence',
|
||||||
|
'progressEvidence.sessionKey',
|
||||||
|
'progressEvidence.runId',
|
||||||
|
'progressEvidence.modified_files',
|
||||||
|
'progressEvidence.verificationResult',
|
||||||
|
'sessionKey',
|
||||||
|
'runId',
|
||||||
|
'modified_files',
|
||||||
|
'verificationResult',
|
||||||
|
]),
|
||||||
});
|
});
|
||||||
|
|
||||||
const GATE_REQUIREMENTS = Object.freeze({
|
const GATE_REQUIREMENTS = Object.freeze({
|
||||||
@@ -53,6 +64,11 @@ const GATE_REQUIREMENTS = Object.freeze({
|
|||||||
acceptedFields: EVIDENCE_FIELDS.executionEvidence,
|
acceptedFields: EVIDENCE_FIELDS.executionEvidence,
|
||||||
requiredValue: 'tool call, dispatch, file change, verification output, or checkpoint artifact evidence',
|
requiredValue: 'tool call, dispatch, file change, verification output, or checkpoint artifact evidence',
|
||||||
}),
|
}),
|
||||||
|
progressEvidence: Object.freeze({
|
||||||
|
evidenceKey: 'progressEvidence',
|
||||||
|
acceptedFields: EVIDENCE_FIELDS.progressEvidence,
|
||||||
|
requiredValue: 'sessionKey, runId, modified_files, verification result, or equivalent concrete progress evidence',
|
||||||
|
}),
|
||||||
});
|
});
|
||||||
|
|
||||||
function fail(code, message) {
|
function fail(code, message) {
|
||||||
@@ -164,6 +180,16 @@ function hasExecutionEvidence(input) {
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Reports whether `input` carries concrete progress evidence.
 *
 * Each path in `EVIDENCE_FIELDS.progressEvidence` is resolved with
 * `getPathValue`; the function returns true as soon as one resolved value
 * counts as evidence. A value counts when it is:
 *   - a string accepted by `hasNonEmptyString`,
 *   - an array with at least one element, or
 *   - an object with at least one property whose value is not a placeholder
 *     (null, undefined, a string rejected by `hasNonEmptyString`, an empty
 *     array, or an object without keys).
 *
 * The placeholder rule closes a loophole: an object made only of empty
 * fields, such as `{ sessionKey: "", modified_files: [] }`, previously passed
 * merely because it had keys.
 *
 * @param input - Gate input to inspect.
 * @returns true when at least one accepted field holds evidence.
 */
function hasProgressEvidence(input) {
  // Only one level of an evidence object is inspected, so this cannot loop
  // on self-referencing input.
  const isPlaceholder = (entry) => {
    if (entry === null || entry === undefined) return true;
    if (typeof entry === 'string') return !hasNonEmptyString(entry);
    if (Array.isArray(entry)) return entry.length === 0;
    if (typeof entry === 'object') return Object.keys(entry).length === 0;
    // Numbers, booleans, etc. still count, as they did before this check.
    return false;
  };

  return EVIDENCE_FIELDS.progressEvidence.some((fieldPath) => {
    const value = getPathValue(input, fieldPath);
    if (hasNonEmptyString(value)) return true;
    if (Array.isArray(value)) return value.length > 0;
    if (value && typeof value === 'object') {
      return Object.values(value).some((entry) => !isPlaceholder(entry));
    }
    return false;
  });
}
|
||||||
|
|
||||||
function claimsProgressionWithoutEvidence(input) {
|
function claimsProgressionWithoutEvidence(input) {
|
||||||
const progressionClaim = EVIDENCE_FIELDS.progressionClaim
|
const progressionClaim = EVIDENCE_FIELDS.progressionClaim
|
||||||
.map((fieldPath) => getPathValue(input, fieldPath))
|
.map((fieldPath) => getPathValue(input, fieldPath))
|
||||||
@@ -171,7 +197,7 @@ function claimsProgressionWithoutEvidence(input) {
|
|||||||
|
|
||||||
if (!hasNonEmptyString(progressionClaim)) return false;
|
if (!hasNonEmptyString(progressionClaim)) return false;
|
||||||
|
|
||||||
return !hasExecutionEvidence(input);
|
return !(hasProgressEvidence(input) || hasExecutionEvidence(input));
|
||||||
}
|
}
|
||||||
|
|
||||||
function evaluateGate(input) {
|
function evaluateGate(input) {
|
||||||
@@ -215,8 +241,8 @@ function evaluateGate(input) {
|
|||||||
|
|
||||||
if (claimsProgressionWithoutEvidence(input)) {
|
if (claimsProgressionWithoutEvidence(input)) {
|
||||||
failed = true;
|
failed = true;
|
||||||
reasons.push('claimed progression without concrete execution evidence is forbidden');
|
reasons.push('claimed progression without concrete progress evidence is forbidden');
|
||||||
requiredEvidence.push(describeRequirement(GATE_REQUIREMENTS.executionEvidence));
|
requiredEvidence.push(describeRequirement(GATE_REQUIREMENTS.progressEvidence));
|
||||||
allowedResponseModes.push('evidence_preserving_follow_up');
|
allowedResponseModes.push('evidence_preserving_follow_up');
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -66,11 +66,12 @@ async function main() {
|
|||||||
'reason=silent long-task cannot continue without externalized checkpoint path',
|
'reason=silent long-task cannot continue without externalized checkpoint path',
|
||||||
'reason=claimed execution requires evidence of a concrete next action',
|
'reason=claimed execution requires evidence of a concrete next action',
|
||||||
'reason=owner decision flow must end in button-path, not plain text',
|
'reason=owner decision flow must end in button-path, not plain text',
|
||||||
'reason=claimed progression without concrete execution evidence is forbidden',
|
'reason=claimed progression without concrete progress evidence is forbidden',
|
||||||
'requiredEvidence=executionEvidence',
|
'requiredEvidence=progressEvidence',
|
||||||
|
'requiredValue=sessionKey, runId, modified_files, verification result, or equivalent concrete progress evidence',
|
||||||
'HARD_GATE: Block any plain-text handoff or silent-continuation claim when externalized checkpoint evidence is missing.',
|
'HARD_GATE: Block any plain-text handoff or silent-continuation claim when externalized checkpoint evidence is missing.',
|
||||||
'HARD_GATE: Block any reply path that says you already moved into the next task or are advancing the next step without concrete execution evidence.',
|
'HARD_GATE: Block any reply path that says you already moved into the next task or are advancing the next step without concrete progress evidence.',
|
||||||
'HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to actual tool execution, file changes, emitted messages, or checkpoint records.',
|
'HARD_GATE: Do not say you are already on the next task, already dispatched follow-up work, or already progressing in background unless you can point to a sessionKey, runId, modified_files record, verification result, actual tool execution, file changes, emitted messages, or checkpoint records.',
|
||||||
'ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.',
|
'ENFORCEMENT: Forbidden path: plain-text handoff that pretends the long task is already continuing without an externalized checkpoint.',
|
||||||
'ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.',
|
'ENFORCEMENT: Forbidden path: stating you have already entered the next task/step when the record only contains planning language and no concrete execution evidence.',
|
||||||
];
|
];
|
||||||
@@ -98,7 +99,7 @@ async function main() {
|
|||||||
'[LONG_TASK_GATE_LOCK]',
|
'[LONG_TASK_GATE_LOCK]',
|
||||||
'gateStatus=degraded',
|
'gateStatus=degraded',
|
||||||
'gateRequired=unknown',
|
'gateRequired=unknown',
|
||||||
'HARD_GATE: Evaluator unavailable is not permission to claim silent continuation or next-task progression without verifiable evidence.',
|
'HARD_GATE: Evaluator unavailable is not permission to claim silent continuation or next-task progression without verifiable progress evidence.',
|
||||||
'HARD_GATE: Fall back to a non-silent, evidence-preserving follow-up if you cannot prove checkpoint state or concrete execution.',
|
'HARD_GATE: Fall back to a non-silent, evidence-preserving follow-up if you cannot prove checkpoint state or concrete execution.',
|
||||||
];
|
];
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user