From ae0c6460218c575b048d3d291f2a759ce92474e3 Mon Sep 17 00:00:00 2001 From: Rihards Date: Sat, 16 May 2026 09:32:55 +0000 Subject: [PATCH] v1.0.0: dev.uapf.semantic-document-analysis MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit UAPF v1.1 SSOT-conformant Level 4 process package — reusable semantic document analysis, shareable across DMS / intake / mailroom systems. Structure: - uapf.yaml (kind: uapf.package, level 4) + manifest.json engine-compat - bpmn/semantic-document-analysis.bpmn.xml — 3 service tasks invoking reserved UAPF-IP capabilities ai.redact@1, ai.extract@1, event.emit@1 - resources/mappings.yaml — task->target bindings with I/O contracts - resources/schemas/vdvc-semantic-summary.schema.json — output contract - resources/guardrails.yaml — GDPR + EU AI Act constraints - metadata/ownership.yaml + metadata/lifecycle.yaml - docs/, fixtures/, tests/eval-set.json Validates clean against UAPFormat/UAPF-specification schemas. --- .gitignore | 3 + README.md | 19 +++++++ bpmn/semantic-document-analysis.bpmn.xml | 55 +++++++++++++++++++ docs/00-overview.md | 32 +++++++++++ docs/01-eu-ai-act.md | 14 +++++ docs/02-integration.md | 21 +++++++ fixtures/child-rights-input.json | 3 + fixtures/discrimination-input.json | 3 + manifest.json | 45 +++++++++++++++ metadata/lifecycle.yaml | 9 +++ metadata/ownership.yaml | 9 +++ resources/guardrails.yaml | 32 +++++++++++ resources/mappings.yaml | 54 ++++++++++++++++++ .../schemas/vdvc-semantic-summary.schema.json | 49 +++++++++++++++++ tests/eval-set.json | 24 ++++++++ uapf.yaml | 50 +++++++++++++++++ 16 files changed, 422 insertions(+) create mode 100644 .gitignore create mode 100644 README.md create mode 100644 bpmn/semantic-document-analysis.bpmn.xml create mode 100644 docs/00-overview.md create mode 100644 docs/01-eu-ai-act.md create mode 100644 docs/02-integration.md create mode 100644 fixtures/child-rights-input.json create mode 100644 fixtures/discrimination-input.json create 
mode 100644 manifest.json create mode 100644 metadata/lifecycle.yaml create mode 100644 metadata/ownership.yaml create mode 100644 resources/guardrails.yaml create mode 100644 resources/mappings.yaml create mode 100644 resources/schemas/vdvc-semantic-summary.schema.json create mode 100644 tests/eval-set.json create mode 100644 uapf.yaml diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..07a0765 --- /dev/null +++ b/.gitignore @@ -0,0 +1,3 @@ +*.uapf +__pycache__/ +.DS_Store diff --git a/README.md b/README.md new file mode 100644 index 0000000..2758e5c --- /dev/null +++ b/README.md @@ -0,0 +1,19 @@ +# dev.uapf.semantic-document-analysis + +UAPF v1.1 SSOT-conformant Level 4 process package providing reusable +semantic document analysis as `Process_SemanticDocumentAnalysis`. + +See `docs/00-overview.md` for what it does, `docs/01-eu-ai-act.md` for +the regulatory analysis, and `docs/02-integration.md` for runtime +integration notes. + +## Validates against + +Run `jsonschema -i <file>` against +the canonical schemas in +`github.com/UAPFormat/UAPF-specification/schemas/`: + +- `uapf-manifest.schema.json` (root manifest) +- `ownership.schema.json` (metadata/ownership.yaml) +- `lifecycle.schema.json` (metadata/lifecycle.yaml) +- `resource-mapping.schema.json` (resources/mappings.yaml) diff --git a/bpmn/semantic-document-analysis.bpmn.xml b/bpmn/semantic-document-analysis.bpmn.xml new file mode 100644 index 0000000..60214ed --- /dev/null +++ b/bpmn/semantic-document-analysis.bpmn.xml @@ -0,0 +1,55 @@ + + + + + + + + + + Calls ai.redact@1 to mask names, identifiers, addresses, financial + and health data before downstream extraction. Required by + resources/guardrails.yaml (GDPR Art. 5 minimisation). + + + + + + Calls ai.extract@1 with the redacted text and the VDVC v1.1 output + schema (resources/schemas/vdvc-semantic-summary.schema.json). The + host's AI agent must produce output that validates against that + schema. 
Output records aiModelVersion + aiConfidenceScore per + EU AI Act Art. 13. + + + + + + Calls event.emit@1 to publish a CloudEvent containing the extracted + semantic summary. Downstream processes consume this event. + + + + + + + + + + + + diff --git a/docs/00-overview.md b/docs/00-overview.md new file mode 100644 index 0000000..421ee3a --- /dev/null +++ b/docs/00-overview.md @@ -0,0 +1,32 @@ +# dev.uapf.semantic-document-analysis — Overview + +**UAPF v1.1 SSOT-conformant** Level 4 process package providing +reusable semantic document analysis. + +## What + +A 3-step BPMN process that, given free-text document content: + +1. Redacts PII via `ai.redact@1` +2. Extracts VDVC v1.1 structured semantic metadata via `ai.extract@1` +3. Emits `document.semantic-analysis.completed.v1` CloudEvent via `event.emit@1` + +## What's portable + +The package ships: +- The BPMN flow (the algorithm shape) +- The VDVC output JSON Schema (the output contract) +- The resource mapping (input/output contracts, timeouts, retries) +- The guardrails policy (GDPR + EU AI Act constraints) + +The host system supplies the actual AI agent that fulfils the three +capabilities. Multiple hosts can implement the same capabilities; +multiple packages can require the same capabilities. + +## How to consume + +Drop this `.uapf` into any UAPF-conformant runtime. The runtime +exposes `uapf.run_process` (per UAPF-specification §6.3.1) targeting +`Process_SemanticDocumentAnalysis`. The runtime resolves the resource +mapping to find a target with the three required capabilities and +invokes them in order per the BPMN flow. diff --git a/docs/01-eu-ai-act.md b/docs/01-eu-ai-act.md new file mode 100644 index 0000000..66aa1b4 --- /dev/null +++ b/docs/01-eu-ai-act.md @@ -0,0 +1,14 @@ +# EU AI Act analysis + +**Classification:** Annex III §5(a) and §8(a) — high-risk AI system +under Regulation 2024/1689 when used for triage in public-service or +justice-administration contexts. 
+ +**Operator obligations** (the deploying organisation, not this package): + +- Art. 13 transparency — output records `summarySource`, `aiConfidenceScore`, `aiModelVersion` +- Art. 14 human oversight — `humanValidationStatus` blocks consequential action +- Art. 9 risk management — see `resources/guardrails.yaml` +- Art. 12 logging — host MUST log invocation with package version + model id + +See `resources/guardrails.yaml` for the machine-readable policy. diff --git a/docs/02-integration.md b/docs/02-integration.md new file mode 100644 index 0000000..8b8851c --- /dev/null +++ b/docs/02-integration.md @@ -0,0 +1,21 @@ +# Integration with UAPF-conformant runtimes + +This package validates against the UAPF v1.1 SSOT schemas: +- `uapf.yaml` → `uapf-manifest.schema.json` +- `metadata/ownership.yaml` → `ownership.schema.json` +- `metadata/lifecycle.yaml` → `lifecycle.schema.json` +- `resources/mappings.yaml` → `resource-mapping.schema.json` + +It conforms to the UAPF Conformance Checklist §"Level 4 requirements": +- ✓ `uapf.yaml` present +- ✓ ≥1 BPMN file (`bpmn/semantic-document-analysis.bpmn.xml`) +- ✓ `resources/mappings.yaml` present +- ✓ `metadata/ownership.yaml` + `metadata/lifecycle.yaml` present +- ✓ Cornerstones declared correctly (bpmn + resources) + +## Engine compatibility notes + +`uapf-engine@1.0.0` only reads `manifest.json` (pre-v1.1 file naming). +This package ships both `uapf.yaml` (SSOT-mandated) and `manifest.json` +(semantically equivalent JSON copy) for compatibility until the upstream +engine supports YAML. diff --git a/fixtures/child-rights-input.json b/fixtures/child-rights-input.json new file mode 100644 index 0000000..7ea1d47 --- /dev/null +++ b/fixtures/child-rights-input.json @@ -0,0 +1,3 @@ +{ + "text": "Vēršos Tiesībsarga birojā par bāriņtiesas lēmumu attiecībā uz manu mazdēlu (vecums 7 gadi). Lēmums pieņemts steidzamā kārtībā 2026. gada martā. Lūdzu izvērtēt rīcības atbilstību Bērnu tiesību aizsardzības likumam." 
+} diff --git a/fixtures/discrimination-input.json b/fixtures/discrimination-input.json new file mode 100644 index 0000000..e768e8f --- /dev/null +++ b/fixtures/discrimination-input.json @@ -0,0 +1,3 @@ +{ + "text": "Iesniedzējs ar otrās grupas invaliditāti norāda, ka valsts iestādes darba intervijā atklāti pateikts: 'nevaram pieņemt cilvēkus ar īpašām vajadzībām'. Lūdz Tiesībsargu izmeklēt." +} diff --git a/manifest.json b/manifest.json new file mode 100644 index 0000000..9733923 --- /dev/null +++ b/manifest.json @@ -0,0 +1,45 @@ +{ + "kind": "uapf.package", + "id": "dev.uapf.semantic-document-analysis", + "name": "Semantic Document Analysis (UAPF reference algorithm)", + "description": "Level-4 UAPF process for extracting VDVC-conformant semantic metadata\n(topic, summary, urgency, risk, sensitivity) from a free-text document.\n\nPortable across document management systems, intake portals, mailroom\nscanners, case-management platforms. Three BPMN service tasks invoke\nthe reserved UAPF-IP capabilities ai.redact@1, ai.extract@1, event.emit@1.\nThe host fulfils each capability with its own AI agent; this package\nsupplies the BPMN flow, the VDVC output JSON Schema, the guardrails,\nand the resource mapping contract.\n", + "level": 4, + "version": "1.0.0", + "includes": [], + "dependencies": {}, + "cornerstones": { + "bpmn": true, + "dmn": false, + "cmmn": false, + "resources": true + }, + "paths": { + "bpmn": "bpmn", + "dmn": "dmn", + "cmmn": "cmmn", + "resources": "resources", + "metadata": "metadata" + }, + "exposure": { + "mcp": { + "enabled": true, + "runnable": true, + "exposedEntrypoints": [ + "Process_SemanticDocumentAnalysis" + ], + "exposedArtifacts": [ + "manifest", + "bpmn", + "docs" + ] + } + }, + "owners": [ + { + "type": "team", + "id": "uapf-stewards", + "contact": "stewards@uapf.dev" + } + ], + "lifecycle": "draft" +} \ No newline at end of file diff --git a/metadata/lifecycle.yaml b/metadata/lifecycle.yaml new file mode 100644 index 
0000000..cd7ce16 --- /dev/null +++ b/metadata/lifecycle.yaml @@ -0,0 +1,9 @@ +kind: uapf.metadata.lifecycle +status: draft +created: "2026-05-15T20:30:00Z" +lastModified: "2026-05-15T20:30:00Z" +changeHistory: + - version: "1.0.0" + date: "2026-05-15" + summary: "Initial release. Reusable UAPF v1.1 Level 4 process for VDVC semantic metadata extraction. Three reserved-namespace capabilities (ai.redact@1, ai.extract@1, event.emit@1). VDVC output schema in resources/schemas/." + author: "uapf-stewards" diff --git a/metadata/ownership.yaml b/metadata/ownership.yaml new file mode 100644 index 0000000..6986f9d --- /dev/null +++ b/metadata/ownership.yaml @@ -0,0 +1,9 @@ +kind: uapf.metadata.ownership +owners: + - type: team + id: uapf-stewards + name: UAPF Stewards + contact: stewards@uapf.dev + role: owner +approvers: + - uapf-stewards diff --git a/resources/guardrails.yaml b/resources/guardrails.yaml new file mode 100644 index 0000000..c7c9365 --- /dev/null +++ b/resources/guardrails.yaml @@ -0,0 +1,32 @@ +# Non-normative supplementary file. UAPF v1.1 does NOT cornerstone guardrails; +# they live under resources/ as a host-readable policy snapshot. +authority: dev.uapf.stewards +version: "1.0.0" + +privacy: + forbidden_in_output: + - personal_name + - personal_id_number + - postal_address + - phone_number + - email_address + - bank_account + - iban + - health_record_value + - biometric_value + pii_handling: + - "Detected PII MUST be listed in sensitivityControl.detectedEntityTypes as TYPE names only, never values." + - "Set personalDataRisk according to detected types: NONE < LOW < MEDIUM < HIGH." 
+ +eu_ai_act: + classification: "Annex III §5(a) and §8(a) — high-risk per Regulation 2024/1689" + required_transparency_fields: + - "semanticSummary.summarySource MUST be \"AI\"" + - "semanticSummary.aiConfidenceScore MUST be 0.0–1.0" + - "semanticSummary.aiModelVersion MUST be the exact model identifier" + human_oversight: "humanValidationStatus MUST be PENDING or REQUIRED on completion; consuming higher-level process MUST surface to a human before any consequential action." + +accuracy: + - "Do not fabricate fields not supported by source text." + - "Set aiConfidenceScore below 0.3 when classification is uncertain." + - "If document is unreadable or too short, set humanValidationStatus to REQUIRED." diff --git a/resources/mappings.yaml b/resources/mappings.yaml new file mode 100644 index 0000000..f210569 --- /dev/null +++ b/resources/mappings.yaml @@ -0,0 +1,54 @@ +kind: uapf.resources.mapping + +targets: + - id: agent.semantic-extractor + type: ai_agent + name: Semantic Extraction AI Agent + description: | + Host-provided AI agent that fulfils ai.redact@1, ai.extract@1, and + event.emit@1 for this process. Implementation is the host's choice + (Claude, GPT, on-prem LLM, etc.); this package supplies the BPMN + flow, the output schema, and the guardrails. + capabilities: + - capability.ai.redact + - capability.ai.extract + - capability.event.emit + +bindings: + - source: { type: bpmn.serviceTask, ref: Task_RedactPii } + targetId: agent.semantic-extractor + mode: autonomous + contract: + input: + - { name: text, type: string, required: true } + - { name: categories, type: array, required: false, description: "Optional PII categories; defaults to host policy." 
} + output: + - { name: redactedText, type: string } + - { name: detections, type: array } + timeout: "10s" + requiredCapabilities: [capability.ai.redact] + + - source: { type: bpmn.serviceTask, ref: Task_ExtractSemantics } + targetId: agent.semantic-extractor + mode: autonomous + contract: + input: + - { name: text, type: string, required: true, description: "Redacted text from previous task." } + - { name: schema, type: object, required: true, description: "VDVC v1.1 output schema. Reference: resources/schemas/vdvc-semantic-summary.schema.json" } + output: + - { name: extracted, type: object, description: "Validates against resources/schemas/vdvc-semantic-summary.schema.json" } + - { name: confidence, type: number } + - { name: modelUsed, type: string } + timeout: "30s" + retries: { maxAttempts: 2, backoffMs: 2000 } + requiredCapabilities: [capability.ai.extract] + + - source: { type: bpmn.serviceTask, ref: Task_EmitResultEvent } + targetId: agent.semantic-extractor + mode: autonomous + contract: + input: + - { name: eventType, type: string, required: true } + - { name: payload, type: object, required: true } + timeout: "5s" + requiredCapabilities: [capability.event.emit] diff --git a/resources/schemas/vdvc-semantic-summary.schema.json b/resources/schemas/vdvc-semantic-summary.schema.json new file mode 100644 index 0000000..7c17a92 --- /dev/null +++ b/resources/schemas/vdvc-semantic-summary.schema.json @@ -0,0 +1,49 @@ +{ + "$schema": "https://json-schema.org/draft/2020-12/schema", + "$id": "https://uapf.dev/schemas/vdvc/semantic-summary-v1.1.json", + "title": "VDVC Semantic Summary v1.1", + "description": "Output contract for ai.extract@1 when invoked by Process_SemanticDocumentAnalysis. 
The host's AI agent MUST produce output validating against this schema.", + "type": "object", + "required": ["semanticSummary", "sensitivityControl"], + "properties": { + "semanticSummary": { + "type": "object", + "required": ["primaryTopic", "summary", "summarySource", "aiConfidenceScore", "aiModelVersion", "humanValidationStatus"], + "properties": { + "primaryTopic": { "type": "string", "maxLength": 200 }, + "subTopics": { "type": "array", "items": { "type": "string" } }, + "summary": { "type": "string", "maxLength": 4000 }, + "documentPurpose": { "type": "string", "maxLength": 200 }, + "requestedAction": { "type": "string", "maxLength": 200 }, + "involvedPartyTypes": { "type": "array", "items": { "type": "string" }, "description": "Party TYPES only, never names." }, + "geographicScope": { "type": "string" }, + "sectorTags": { "type": "array", "items": { "type": "string" } }, + "legalDomain": { "type": "string" }, + "estimatedRiskLevel": { "enum": ["LOW", "MEDIUM", "HIGH", "CRITICAL"] }, + "urgencyLevel": { "enum": ["LOW", "NORMAL", "HIGH", "URGENT"] }, + "keywords": { "type": "array", "maxItems": 20, "items": { "type": "string" } }, + "detectedLanguage": { "type": "string", "pattern": "^[a-z]{2}$" }, + "summarySource": { "const": "AI" }, + "aiConfidenceScore": { "type": "number", "minimum": 0, "maximum": 1 }, + "aiModelVersion": { "type": "string" }, + "humanValidationStatus": { "enum": ["PENDING", "REQUIRED", "VALIDATED", "REJECTED"] }, + "mentions_child": { "type": "boolean" }, + "ongoing_harm": { "type": "boolean" }, + "vulnerable_group": { "type": "boolean" }, + "criminal_indication": { "type": "boolean" } + } + }, + "sensitivityControl": { + "type": "object", + "required": ["personalDataRisk", "allowCentralization", "redactionLevel"], + "properties": { + "personalDataRisk": { "enum": ["NONE", "LOW", "MEDIUM", "HIGH"] }, + "allowCentralization": { "type": "boolean" }, + "redactionLevel": { "enum": ["NONE", "PARTIAL", "FULL"] }, + "accessRestrictionBasis":{ 
"type": "string" }, + "classifiedInformation": { "type": "boolean" }, + "detectedEntityTypes": { "type": "array", "items": { "type": "string" } } + } + } + } +} diff --git a/tests/eval-set.json b/tests/eval-set.json new file mode 100644 index 0000000..1d59843 --- /dev/null +++ b/tests/eval-set.json @@ -0,0 +1,24 @@ +{ + "algorithm": "Process_SemanticDocumentAnalysis", + "package_version": "1.0.0", + "cases": [ + { + "id": "child-rights", + "input_fixture": "fixtures/child-rights-input.json", + "expected_facets": { + "mentions_child": true, + "vulnerable_group": true, + "humanValidationStatus": "PENDING" + } + }, + { + "id": "discrimination-disability", + "input_fixture": "fixtures/discrimination-input.json", + "expected_facets": { + "vulnerable_group": true, + "humanValidationStatus": "PENDING" + } + } + ], + "success_criteria": { "min_pass_rate": 0.95, "max_p95_latency_ms": 15000 } +} diff --git a/uapf.yaml b/uapf.yaml new file mode 100644 index 0000000..bc093ae --- /dev/null +++ b/uapf.yaml @@ -0,0 +1,50 @@ +kind: uapf.package +id: dev.uapf.semantic-document-analysis +name: Semantic Document Analysis (UAPF reference algorithm) +description: | + Level-4 UAPF process for extracting VDVC-conformant semantic metadata + (topic, summary, urgency, risk, sensitivity) from a free-text document. + + Portable across document management systems, intake portals, mailroom + scanners, case-management platforms. Three BPMN service tasks invoke + the reserved UAPF-IP capabilities ai.redact@1, ai.extract@1, event.emit@1. + The host fulfils each capability with its own AI agent; this package + supplies the BPMN flow, the VDVC output JSON Schema, the guardrails, + and the resource mapping contract. 
+ +level: 4 +version: "1.0.0" + +includes: [] +dependencies: {} + +cornerstones: + bpmn: true + dmn: false + cmmn: false + resources: true + +paths: + bpmn: bpmn + dmn: dmn + cmmn: cmmn + resources: resources + metadata: metadata + +exposure: + mcp: + enabled: true + runnable: true + exposedEntrypoints: + - "Process_SemanticDocumentAnalysis" + exposedArtifacts: + - manifest + - bpmn + - docs + +owners: + - type: team + id: uapf-stewards + contact: stewards@uapf.dev + +lifecycle: draft