v0.2

2026-05-18 14:05:37 +03:00
parent 3da6362c49
commit ed1d44e94a
16 changed files with 1686 additions and 867 deletions
--- a/resources/guardrails.yaml
+++ b/resources/guardrails.yaml
@@ -1,37 +1,61 @@
-# UAPF-IP guardrails policy for the Drone Threat Public Address process.
-# Loaded by the runtime at session start as an immutable policy snapshot;
-# enforced by the host before every capability invocation (see UAPF-IP SPEC §10).
+kind: uapf.guardrails
+version: "0.3.0"

-scope:
-  domain: civil-defence
-  classification: restricted
-  legal_basis: national-civil-protection-law
+# UAPF-IP guardrails enforced at every capability call made by a UAPF-IP host
+# running this package. Guardrails are evaluated before and after capability
+# execution; a violation blocks the call and is logged to the audit trail.

-ai:
-  allowed_models:
-    - claude-opus-4-7
-    - claude-sonnet-4-6
-  forbidden_inputs:
-    - classified-radar-tracks
-    - personal-data-of-residents
-    - source-identifying-osint
-  pii_redactor: required
-  # AI may never autonomously complete a step in a national public-warning
-  # process; a confidence ceiling of 0 forces human confirmation every time.
-  max_confidence_for_autocomplete: 0.0
-  decisions_ai_may_make_unattended: []
-  decisions_ai_may_recommend:
-    - air_threat_severity
-    - disinformation_classification
+principles:
+  - id: GR-1
+    name: AI is advisory, never authoritative
+    rule: >-
+      AI agents and MCP tools (ai.classify, ai.complete) may only produce
+      recommendations. Object classification, threat severity, public-broadcast
+      authorisation, interception authorisation and origin attribution must be
+      confirmed by an accountable human role before any downstream effect.
+    appliesTo: [Decision_ObjectClassification, Decision_AirThreatSeverity,
+                Decision_InterceptionAuthorization, HT_OriginAttribution]

-human_oversight:
-  required_for:
-    - cell_broadcast_dispatch
-    - interception_authorization
-    - public_communication
-    - incident_closure
-  reviewer_role: nbs-joint-staff-duty-officer
+  - id: GR-2
+    name: No autonomous use of force
+    rule: >-
+      No capability may move the process to an interception "engage" outcome
+      without a recorded manual authorisation by NBS Joint Staff. The
+      Decision_InterceptionAuthorization table is decision support only.
+    appliesTo: [Task_AuthorizeInterception, Task_GroundEngage, Task_RequestBAP]

-audit:
-  vc_signature_per_step: required
-  retention_years: 10
+  - id: GR-3
+    name: Notification SLA is a hard limit
+    rule: >-
+      broadcastSlaSeconds from Decision_NotificationUrgency is a maximum, not a
+      target. The SLA-breach boundary timer must remain enabled; disabling it
+      is a safeguard removal and must be recorded.
+    appliesTo: [Task_DispatchBroadcast, Boundary_SlaBreach]
+
+  - id: GR-4
+    name: Public messaging is human-authored
+    rule: >-
+      Cell-broadcast and all-clear messages are selected from pre-approved,
+      human-authored templates (MSG_*). Free-text generation of public alert
+      content by an AI agent is prohibited.
+    appliesTo: [Task_DispatchBroadcast, Task_AllClearBroadcast, HT_PressBriefing]
+
+  - id: GR-5
+    name: Auditability
+    rule: >-
+      Every decision evaluation records its inputs, the matched rule id and the
+      output, with actor identity and timestamp, retained per metadata/policies.yaml.
+    appliesTo: ["*"]
+
+  - id: GR-6
+    name: Data minimisation in disinformation monitoring
+    rule: >-
+      The OSINT monitoring tool processes only public posts; it must not ingest
+      or store personal data of identifiable citizens beyond what is required
+      to classify a post.
+    appliesTo: [HT_DisinfoMonitor]
+
+enforcement:
+  onViolation: block
+  audit: required
+  reviewAuthority: "Ministry of Defence (algorithm governance)"