{
  "$schema": "https://json-schema.org/draft/2020-12/schema",
  "$id": "https://alexanarch.org/api/enrichment-protocol.json",
  "title": "Alexanarch Enrichment Pipeline Protocol",
  "protocol_name": "enrichment",
  "protocol_version": "enrichment/v1",
  "schema_version": "2026-06-22-enrichment-v1",
  "last_updated": "2026-06-22",
  "summary": "Specification of the enrichment operations that run on each deposit, both at mint-time and as ongoing reading-pass work. Defines what a fully-enriched deposit looks like and which surfaces depend on which enrichment stages.",
  "stages": {
    "at_mint_time": {
      "description": "Runs automatically inside .github/workflows/mint-axn.yml when a new deposit is minted. Each step is idempotent and surface-regenerative.",
      "steps": [
        {
          "step": "compute_canonical_bytes",
          "owner": "mint-axn.yml",
          "produces": "hash (SHA-256), axn_canonical",
          "governed_by": ["deposit", "axn"]
        },
        {
          "step": "derive_axn",
          "owner": "mint-axn.yml",
          "produces": "axn, emoji, clusters, reading",
          "governed_by": ["axn"]
        },
        {
          "step": "generate_record_page",
          "owner": "mint-axn.yml or wire_deposit.py",
          "produces": "s/records/<N>/index.html"
        },
        {
          "step": "generate_simple_md",
          "owner": "mint-axn.yml",
          "produces": "data/deposits/AXN-<NNNN>.md"
        },
        {
          "step": "regenerate_derived_surfaces",
          "owner": "scripts/regenerate_surfaces.py (invoked by mint-axn.yml)",
          "produces": "s/browse/, data/browse-index.json, data/chunks/registry/, sitemap.xml, SHA256SUMS.txt"
        },
        {
          "step": "generate_wiki_article_provisional",
          "owner": "mint-axn.yml (Anthropic API call)",
          "produces": "registry entry's wiki_article field",
          "wiki_status": "provisional-generated"
        },
        {
          "step": "extract_entities_provisional",
          "owner": "mint-axn.yml (Anthropic API call)",
          "produces": "registry entry's entities field",
          "entity_status": "provisional-generated"
        }
      ]
    },
    "at_reading_pass": {
      "description": "Manual or scripted reading-pass operations applied to one deposit at a time. Builds the curated terminological surface that the graph and wiki tabs project from.",
      "steps": [
        {
          "step": "read_deposit_text",
          "owner": "scripts/read_pass.py (with API assist) or manual",
          "produces": "concept extraction at line-level"
        },
        {
          "step": "extract_defined_concepts",
          "owner": "scripts/read_pass.py or manual",
          "produces": "defines_concepts array on the registry entry; new entries in data/entity-index.json with type ∈ {minted, specified, developed, founded, revised, positioned, extracted}"
        },
        {
          "step": "extract_entity_triples",
          "owner": "scripts/read_pass.py or manual",
          "produces": "entity_triples on each new concept in entity-index"
        },
        {
          "step": "extract_lexical_terms",
          "owner": "consolidate.py",
          "produces": "new rows in data/lexical-minting-registry.json (the broader pre-curation surface)"
        },
        {
          "step": "extract_citations",
          "owner": "scripts/citation_extractor.py",
          "produces": "new edges in data/citation-graph.json via mechanisms: axn_reference, axn_hex_reference, ea_id_reference, deposit_number_reference, doi_resolution"
        },
        {
          "step": "generate_autonomous_edition",
          "owner": "scholia_generator.py",
          "produces": "data/autonomous/AXN-<HEX>-autonomous.md (optional, not every deposit)"
        }
      ]
    },
    "at_build_time": {
      "description": "Periodic aggregation operations that rebuild derived datasets and exports from the canonical registry + enrichment surfaces.",
      "steps": [
        {"step": "build_ro_crate", "owner": "build.py"},
        {"step": "build_data_package", "owner": "build.py"},
        {"step": "build_dcat", "owner": "build.py", "produces": "build/dcat.jsonld"},
        {"step": "build_csv_export", "owner": "build.py", "produces": "build/catalog-export.csv"},
        {"step": "build_jsonld_graph", "owner": "build.py", "produces": "build/graph.jsonld"},
        {"step": "build_journal_tocs", "owner": "build.py"}
      ]
    }
  },
  "citation_extraction": {
    "canonical_implementation": "scripts/citation_extractor.py",
    "via_types": {
      "axn_reference": {
        "description": "Full canonical AXN reference like AXN:0379.GENERATIVE.⚙️🔍📜🏛️⚡🔄.",
        "pattern": "AXN:[0-9A-F]{2,4}\\.[A-Z]+\\.[^\\s\\.\\'\",;<>\\)]{1,16}",
        "resolves_via": "registry.deposits[].axn and legacy_axn and axn_history[].axn"
      },
      "axn_hex_reference": {
        "description": "Short AXN-<HEX> reference (just the hex label, no emoji).",
        "pattern": "(?<![\\w\\-])AXN-([0-9A-F]{2,4})(?![0-9A-F])",
        "resolves_via": "registry.deposits[].hex"
      },
      "ea_id_reference": {
        "description": "Sovereign EA-* document ID like EA-MPAI-DOI-IMPERMANENCE-01.",
        "pattern": "(?<![\\w\\-])EA(?:-[A-Z]+){2,4}-\\d+(?!\\d)",
        "resolves_via": "registry.deposits[].sovereign_id and document_id; supplemented by EA-* IDs discovered in titles and descriptions"
      },
      "deposit_number_reference": {
        "description": "Bare #N deposit-number reference. Requires word-boundary before the # (excludes URL fragments).",
        "pattern": "(?:^|(?<=[\\s(\\[,;—]))#(\\d{1,4})(?!\\d)",
        "resolves_via": "direct integer match against registry.deposits[].deposit_number"
      },
      "doi_resolution": {
        "description": "Legacy Zenodo DOI reference. Resolved via the dead_doi → sovereign_id → deposit_number chain in /data/doi-resolution-index.json.",
        "pattern": "10\\.5281/zenodo\\.(\\d+)",
        "resolves_via": "data/doi-resolution-index.json mappings"
      }
    },
    "invariants": [
      "Self-references (source_deposit == target_deposit) are never added.",
      "Identical (source_deposit, target_deposit, via) triples are deduplicated.",
      "Existing edges in citation-graph.json are preserved; only new triples are appended.",
      "The script is idempotent — running twice produces the same result."
    ]
  },
  "reading_pass_state_tracking": {
    "description": "How to know which deposits have been read.",
    "signal": "A deposit has been read iff at least one concept in /data/entity-index.json has its `defined_in` field equal to the deposit's deposit_number.",
    "engagement_classification": "Each entity-index concept carries an engagement_type field: minted, specified, developed, founded, revised, positioned, or unclassified. The first six are deliberate engagements (the deposit takes a definite analytical or compositional action toward the concept). 'unclassified' means the concept was extracted but not engaged.",
    "current_progress_as_of": "2026-06-22",
    "current_progress": {
      "total_deposits": 879,
      "deposits_with_extracted_concepts": 535,
      "deposits_in_range_unread": 340,
      "deliberate_engagements": 703,
      "unclassified_extractions": 6470
    }
  },
  "downstream_consumers": {
    "description": "Surfaces that depend on enrichment outputs and must be regenerated when those outputs change.",
    "consumers": [
      {"surface": "/s/graph/", "depends_on": ["entity_triples in entity-index", "citation-graph", "registry"]},
      {"surface": "/s/wiki/", "depends_on": ["registry.wiki_article", "entity-index for back-links"]},
      {"surface": "/s/records/<N>/", "depends_on": ["registry entry", "entity-index", "citation-graph"]},
      {"surface": "/data/semantic-addresses.json", "depends_on": ["entity-index (for refers_to resolution)"]},
      {"surface": "/build/graph.jsonld", "depends_on": ["entity-index", "citation-graph"]}
    ]
  },
  "change_log": [
    {
      "version": "enrichment/v1",
      "date": "2026-06-22",
      "changes": [
        "Initial formalization of the three-stage enrichment pipeline.",
        "Documented the canonical citation extraction mechanism in scripts/citation_extractor.py.",
        "Added axn_reference, axn_hex_reference, ea_id_reference, deposit_number_reference as new via types alongside the legacy doi_resolution.",
        "Documented reading-pass state and engagement classification semantics."
      ]
    }
  ]
}
