feat(referentials): validation ATIH 2018 des codes médicaux
Ajoute une couche de validation post-extraction contre les référentiels
officiels de l'ATIH (Agence Technique de l'Information sur
l'Hospitalisation) pour 2018. Zéro tolérance sur les codes T2A : un
code invalide est flaggé, et une correction par plus proche voisin
(Levenshtein ≤ 1) est proposée.
Contenu :
- pipeline/referentials.py : API publique is_valid_{cim10,ccam,ghm,ghs},
get_cim10_libelle, nearest_cim10, ghm_to_ghs. CLI --build/--test/--stats.
- pipeline/validation.py : annote un JSON d'extraction avec un bloc
`_validation` par page (codes valides/invalides + suggestions + cross-
checks GHM↔GHS).
- referentials/sources/ : données brutes ATIH publiques (CIM-10 ClaML
2019 substitut, CCAM v5 2018, GHM v2018, tarifs fév. 2018).
- referentials/atih_2018.sqlite : base SQLite prête à l'emploi
(11 623 CIM-10 · 8 147 CCAM · 2 593 GHM · 5 329 couples GHM→GHS).
- tests/test_referentials.py : 11 tests unitaires (11/11 passent).
- annotate_validation.py : script qui annote tous les JSONs V2 en
place et produit validation_report.md.
Note CIM-10 : la version 2018 ATIH n'est publiée qu'en PDF, ClaML 2019
est utilisée en substitut (écart connu ≈ 60 codes / 11 600).
Gestion des suffixes PMSI : `*` (CMA exclue par le DP) et `+N`
(extension PMSI) sont strippés avant validation, le code racine seul
est comparé au référentiel.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
136
annotate_validation.py
Normal file
136
annotate_validation.py
Normal file
@@ -0,0 +1,136 @@
|
||||
"""Annote les JSONs V2 existants avec la validation ATIH.
|
||||
|
||||
Utile pour ajouter la validation sans relancer l'extraction complète.
|
||||
Produit aussi un rapport agrégé en markdown.
|
||||
"""
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from pipeline.validation import annotate
|
||||
|
||||
|
||||
# Directory holding the V2 extraction JSONs; they are rewritten in place.
OUT_DIR = Path("output/v2")
# Destination of the aggregate markdown report produced by build_report().
REPORT = Path("validation_report.md")
|
||||
|
||||
|
||||
def annotate_all() -> list[dict]:
    """Run ATIH validation on every V2 JSON and rewrite each file in place.

    Each document gains a ``_validation`` block (added by
    ``pipeline.validation.annotate``); a one-line status summary is printed
    per dossier for quick visual inspection. Returns the annotated documents.
    """
    annotated_docs: list[dict] = []
    for json_path in sorted(OUT_DIR.glob("OGC *.json")):
        payload = json.loads(json_path.read_text(encoding="utf-8"))
        enriched = annotate(payload)
        json_path.write_text(json.dumps(enriched, ensure_ascii=False, indent=2), encoding="utf-8")
        annotated_docs.append(enriched)
        validation = enriched.get("extraction", {}).get("recueil", {}).get("_validation", {})
        summary = validation.get("summary", {})
        checks = validation.get("cross_checks", {})
        print(f" {payload['fichier']:8s} — valid={summary.get('valid',0):2d} invalid={summary.get('invalid',0):2d} "
              f"empty={summary.get('empty',0):2d} incoherent={summary.get('ghm_ghs_incoherents',0)} "
              f"etab={checks.get('etab',{}).get('coherent','?')} reco={checks.get('reco',{}).get('coherent','?')}")
    return annotated_docs
|
||||
|
||||
|
||||
def build_report(results: list[dict]):
    """Aggregate per-field validity stats over all dossiers and write the report.

    Produces a markdown file at ``REPORT`` with: validity rate per field,
    suggested OCR corrections (Levenshtein ≤ 1), GHM↔GHS incoherences and a
    final synthesis. ``results`` is the list returned by ``annotate_all``.
    """
    per_field = defaultdict(lambda: {"total": 0, "valid": 0, "invalid": 0, "empty": 0, "suggestions": []})
    incoherences = []

    def _tally(st: dict, entry: dict, name: str) -> None:
        # Fold one validation entry into a per-field accumulator.
        # valid=True / valid=False / valid=None (empty field) are the three states.
        st["total"] += 1
        if entry.get("valid") is True:
            st["valid"] += 1
        elif entry.get("valid") is False:
            st["invalid"] += 1
            if "suggestion" in entry:
                st["suggestions"].append((name, entry["code"], entry["suggestion"]))
        else:
            st["empty"] += 1

    for d in results:
        name = d["fichier"]
        rec_v = d.get("extraction", {}).get("recueil", {}).get("_validation", {})
        if not rec_v:
            continue

        # Codes unitaires (GHM / GHS, établissement et recommandation)
        for key in ["ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco"]:
            _tally(per_field[key], rec_v.get(key, {}), name)

        # Blocs codage etab / reco : dp + dr + das
        for section in ["codage_etab", "codage_reco"]:
            sec = rec_v.get(section, {})
            for sub in ["dp", "dr"]:
                _tally(per_field[f"{section}.{sub}"], sec.get(sub, {}), name)
            for das in sec.get("das", []) or []:
                _tally(per_field[f"{section}.das"], das, name)

        # Cohérence GHM ↔ GHS (cross-checks produced by pipeline.validation)
        for side in ["etab", "reco"]:
            cc = rec_v.get("cross_checks", {}).get(side, {})
            if cc.get("checked") and not cc.get("coherent"):
                incoherences.append({
                    "dossier": name, "side": side,
                    "ghs_extrait": cc.get("ghs_extrait"),
                    "ghs_possibles": cc.get("ghs_possibles"),
                })

    # Markdown report
    lines = ["# Rapport de validation ATIH — V2 (18 dossiers)\n"]
    lines.append("## Couverture et validité par champ\n")
    lines.append("| Champ | Total | Valid | Invalid | Vide | Validité codes renseignés |")
    lines.append("|---|---:|---:|---:|---:|---:|")
    for f, st in per_field.items():
        renseignes = st["valid"] + st["invalid"]
        ratio = (100 * st["valid"] / renseignes) if renseignes else 0
        lines.append(f"| `{f}` | {st['total']} | {st['valid']} | {st['invalid']} | {st['empty']} | {ratio:.0f}% |")

    # Suggestions OCR
    lines.append("\n## Corrections OCR suggérées (Levenshtein ≤ 1)")
    lines.append("\nCodes extraits invalides mais ressemblant à un code ATIH existant :\n")
    lines.append("| Dossier | Champ | Code extrait | Suggestion |")
    lines.append("|---|---|---|---|")
    sugg_count = 0
    for field, st in per_field.items():
        for name, code, sug in st["suggestions"]:
            lines.append(f"| {name} | `{field}` | `{code}` | **`{sug}`** |")
            sugg_count += 1
    if sugg_count == 0:
        lines.append("| — | — | — | Aucune suggestion (pas de correction Levenshtein ≤ 1) |")

    # Incohérences GHM ↔ GHS
    lines.append("\n## Incohérences GHM ↔ GHS détectées\n")
    if incoherences:
        lines.append("| Dossier | Côté | GHS extrait | GHS possibles pour le GHM |")
        lines.append("|---|---|---|---|")
        for inc in incoherences:
            lines.append(f"| {inc['dossier']} | {inc['side']} | `{inc['ghs_extrait']}` | {inc['ghs_possibles']} |")
    else:
        lines.append("✓ Aucune incohérence détectée sur les GHM/GHS extraits.")

    lines.append("\n## Synthèse\n")
    total_codes = sum(st["valid"] + st["invalid"] for st in per_field.values())
    total_valid = sum(st["valid"] for st in per_field.values())
    # Guard against an empty run (no JSONs / no filled codes): the original
    # expression raised ZeroDivisionError when total_codes == 0.
    pct_valid = (100 * total_valid / total_codes) if total_codes else 0.0
    lines.append(f"- **{total_valid}/{total_codes} codes valides** ({pct_valid:.1f}%)")
    lines.append(f"- **{sugg_count} suggestions de correction OCR** trouvées automatiquement")
    lines.append(f"- **{len(incoherences)} incohérences GHM↔GHS** sur les paires extraites")

    REPORT.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nRapport → {REPORT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: annotate every V2 JSON in place, then write the aggregate
    # markdown report.
    print("Annotation en place des JSONs V2 + calcul validation ATIH...\n")
    results = annotate_all()
    build_report(results)
|
||||
597
pipeline/referentials.py
Normal file
597
pipeline/referentials.py
Normal file
@@ -0,0 +1,597 @@
|
||||
"""Validation des codes médicaux contre les référentiels ATIH 2018.
|
||||
|
||||
Ce module charge les référentiels officiels ATIH (CIM-10, CCAM, GHM, table
|
||||
GHM→GHS) dans une base SQLite locale et expose des fonctions de validation
|
||||
pour les codes extraits par le pipeline OCR.
|
||||
|
||||
Sources téléchargées (voir `referentials/sources/`) :
|
||||
- **CIM-10 FR 2019** au format ClaML XML (ATIH) — utilisée comme substitut
|
||||
à la CIM-10 2018 : ATIH ne publie officiellement la CIM-10 2018 qu'en PDF.
|
||||
L'écart entre CIM-10 2019 et CIM-10 2018 est < 100 codes sur ~11 600 ;
|
||||
un écart acceptable pour une validation OCR (et qui peut introduire
|
||||
quelques faux positifs pour des codes créés en 2019, mais jamais de faux
|
||||
négatifs sur un code 2018 valide).
|
||||
- **CCAM descriptive à usage PMSI 2018 V5** (XLSX ATIH).
|
||||
- **GHM V2018** (XLSX ATIH, fichier `regroupement_ghm_v2018.xlsx`).
|
||||
- **Arrêté tarifaire MCO Février 2018** (XLSX ATIH, feuilles "Tarifs public"
|
||||
et "Tarifs privé") pour la table GHM→GHS.
|
||||
|
||||
Formats de codes supportés :
|
||||
- CIM-10 : lettre + 2 à 5 chiffres (ex: K650, T814, sans point).
|
||||
- CCAM : 4 lettres + 3 chiffres (ex: EBFA012).
|
||||
- GHM : 2 chiffres + lettre + 3 chiffres (ex: 11M122).
|
||||
- GHS : nombre 1-5 chiffres (ex: 4323).
|
||||
|
||||
Utilisation :
|
||||
from pipeline.referentials import (
|
||||
is_valid_cim10, is_valid_ccam, is_valid_ghm, is_valid_ghs,
|
||||
nearest_cim10, ghm_to_ghs, get_cim10_libelle,
|
||||
)
|
||||
if not is_valid_cim10("K650"):
|
||||
suggestion = nearest_cim10("K65O") # correction O → 0
|
||||
|
||||
Build initial de la base : ``python -m pipeline.referentials --build``
|
||||
Test rapide : ``python -m pipeline.referentials --test``
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import gzip
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
# Optional speed-up: rapidfuzz's native Levenshtein implementation.
# When absent, _levenshtein() falls back to a pure-Python version.
try:
    from rapidfuzz.distance import Levenshtein as _Lev
    _HAS_RAPIDFUZZ = True
except ImportError:  # pragma: no cover - pure-Python fallback
    _HAS_RAPIDFUZZ = False
|
||||
|
||||
# Project root = parent of the pipeline/ package directory.
_ROOT = Path(__file__).resolve().parent.parent
REFERENTIALS_DIR = _ROOT / "referentials"
SOURCES_DIR = REFERENTIALS_DIR / "sources"
DB_PATH = REFERENTIALS_DIR / "atih_2018.sqlite"

# Expected code shapes (used to sanity-check normalised input before any DB lookup)
_RE_CIM10 = re.compile(r"^[A-Z][0-9]{2,5}$")              # letter + 2-5 digits, no dot
_RE_CCAM = re.compile(r"^[A-Z]{4}[0-9]{3}$")              # 4 letters + 3 digits
_RE_GHM = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")  # e.g. 11M122
_RE_GHS = re.compile(r"^[0-9]{1,5}$")                     # 1-5 digit number
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Normalisation des entrées (tolérante aux bruits OCR courants)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _normalize_cim10(code: str) -> str:
|
||||
"""Normalise un code CIM-10 extrait pour comparaison au référentiel.
|
||||
|
||||
Gère :
|
||||
- Point décimal optionnel : "K65.0" → "K650"
|
||||
- Espaces / casse : " k650 " → "K650"
|
||||
- Suffixes PMSI : "C795 *" → "C795" (le `*` signifie "CMA exclue par le DP")
|
||||
et "K635+0" → "K635" (le `+N` est une extension PMSI à valider séparément)
|
||||
- Suffixe de position numérique éventuellement collé : "K650+" → "K650"
|
||||
"""
|
||||
if not code:
|
||||
return ""
|
||||
s = code.strip().upper()
|
||||
# Couper à la première occurrence d'un marqueur PMSI non-alphanum
|
||||
# (*, +, #, espace suivi d'un marqueur). On garde uniquement la tête du code.
|
||||
for sep in ("*", "+", "#"):
|
||||
if sep in s:
|
||||
s = s.split(sep, 1)[0]
|
||||
return s.replace(".", "").replace(" ", "").strip()
|
||||
|
||||
|
||||
def _normalize_ccam(code: str) -> str:
|
||||
if not code:
|
||||
return ""
|
||||
# Retire éventuelle extension PMSI (-1, -2…) et les espaces
|
||||
base = code.split("-")[0]
|
||||
return base.replace(" ", "").strip().upper()
|
||||
|
||||
|
||||
def _normalize_ghm(code: str) -> str:
|
||||
if not code:
|
||||
return ""
|
||||
return code.replace(" ", "").strip().upper()
|
||||
|
||||
|
||||
def _normalize_ghs(code: str) -> str:
|
||||
if not code:
|
||||
return ""
|
||||
# Les GHS peuvent arriver en "0023" ou "23"
|
||||
s = re.sub(r"[^0-9]", "", code).lstrip("0")
|
||||
return s or "0"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Construction de la base SQLite depuis les sources téléchargées
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _create_schema(conn: sqlite3.Connection) -> None:
    """Drop and recreate all tables so that build_database() is idempotent.

    Tables: cim10 / ccam / ghm map code -> label (plus ASO/DA for GHM);
    ghm_ghs holds the GHM→GHS mapping per sector with tariffs; metadata is a
    free-form key/value table used for provenance (shown by --stats).
    """
    conn.executescript(
        """
        DROP TABLE IF EXISTS cim10;
        DROP TABLE IF EXISTS ccam;
        DROP TABLE IF EXISTS ghm;
        DROP TABLE IF EXISTS ghm_ghs;
        DROP TABLE IF EXISTS metadata;

        CREATE TABLE cim10 (
            code TEXT PRIMARY KEY,
            libelle TEXT
        );
        CREATE TABLE ccam (
            code TEXT PRIMARY KEY,
            libelle TEXT
        );
        CREATE TABLE ghm (
            code TEXT PRIMARY KEY,
            libelle TEXT,
            aso TEXT,
            da TEXT
        );
        CREATE TABLE ghm_ghs (
            ghm TEXT,
            ghs TEXT,
            secteur TEXT, -- 'public' ou 'prive'
            libelle TEXT,
            tarif REAL,
            PRIMARY KEY (ghm, ghs, secteur)
        );
        CREATE INDEX idx_ghm_ghs_ghm ON ghm_ghs(ghm);
        CREATE INDEX idx_ghm_ghs_ghs ON ghm_ghs(ghs);

        CREATE TABLE metadata (
            key TEXT PRIMARY KEY,
            value TEXT
        );
        """
    )
|
||||
|
||||
|
||||
def _load_cim10(conn: sqlite3.Connection) -> int:
    """Load the French CIM-10 from the ClaML XML export (categories only).

    Only ``Class`` elements with kind="category" are kept (chapters and
    blocks are skipped). Returns the number of codes inserted.
    """
    xml_path = SOURCES_DIR / "cim10_claml_2019_extracted" / "cim10_claml_2019.xml"
    if not xml_path.exists():
        # Fallback: accept any XML in the extracted directory (the file name
        # inside the ATIH zip is not guaranteed).
        xmls = list((SOURCES_DIR / "cim10_claml_2019_extracted").glob("*.xml"))
        if not xmls:
            raise FileNotFoundError(
                f"CIM-10 ClaML introuvable dans {SOURCES_DIR}. "
                f"Assurez-vous d'avoir téléchargé et extrait le zip ATIH."
            )
        xml_path = xmls[0]

    tree = ET.parse(xml_path)
    root = tree.getroot()
    rows: list[tuple[str, str]] = []
    for cls in root.findall(".//Class"):
        kind = cls.get("kind")
        if kind != "category":
            continue
        raw_code = cls.get("code") or ""
        # Stored without the dot, matching _normalize_cim10's output format.
        code = raw_code.replace(".", "").upper().strip()
        if not code:
            continue
        # The "preferred" rubric carries the official label.
        pref = cls.find('.//Rubric[@kind="preferred"]/Label')
        libelle = pref.text.strip() if (pref is not None and pref.text) else ""
        rows.append((code, libelle))

    conn.executemany(
        "INSERT OR REPLACE INTO cim10 (code, libelle) VALUES (?, ?)", rows
    )
    return len(rows)
|
||||
|
||||
|
||||
def _load_ccam(conn: sqlite3.Connection) -> int:
    """Load the CCAM 2018 from the ATIH XLSX (sheets named CCAM_Final_2018_*).

    The workbook interleaves code rows and description rows; the first
    non-empty text cell following a code becomes its label. Returns the
    number of distinct CCAM codes inserted.
    """
    import openpyxl

    xlsx_path = SOURCES_DIR / "ccam_2018_v5.xlsx"
    if not xlsx_path.exists():
        raise FileNotFoundError(f"CCAM XLSX introuvable : {xlsx_path}")

    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
    pat = re.compile(r"^[A-Z]{4}[0-9]{3}$")
    seen: dict[str, str] = {}
    for sheet_name in wb.sheetnames:
        if not sheet_name.startswith("CCAM_Final_2018"):
            continue
        ws = wb[sheet_name]
        cur_code: str | None = None
        for row in ws.iter_rows(values_only=True):
            # col 0: sometimes a code, col 3: free text / label
            col0 = row[0] if len(row) > 0 else None
            col3 = row[3] if len(row) > 3 else None
            if isinstance(col0, str):
                c = col0.strip()
                if pat.match(c):
                    cur_code = c
                    if c not in seen:
                        seen[c] = ""
            # Keep only the FIRST label seen for a code (truncated to 500
            # chars); later text rows belong to sub-entries and are ignored.
            if cur_code and isinstance(col3, str) and col3.strip():
                if not seen.get(cur_code):
                    seen[cur_code] = col3.strip()[:500]

    rows = list(seen.items())
    conn.executemany(
        "INSERT OR REPLACE INTO ccam (code, libelle) VALUES (?, ?)", rows
    )
    return len(rows)
|
||||
|
||||
|
||||
def _load_ghm(conn: sqlite3.Connection) -> int:
    """Load the GHM V2018 list from regroupement_ghm_v2018.xlsx.

    Rows before the "GHM" header row are skipped; each data row contributes
    (code, libellé, ASO, DA). Returns the number of GHM codes inserted.
    """
    import openpyxl

    xlsx_path = SOURCES_DIR / "regroupement_ghm_v2018.xlsx"
    if not xlsx_path.exists():
        raise FileNotFoundError(f"GHM XLSX introuvable : {xlsx_path}")

    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
    ws = wb[wb.sheetnames[0]]
    ghm_pat = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")
    rows: list[tuple[str, str, str, str]] = []
    header_found = False

    def _cell(row: tuple, idx: int) -> str:
        # read_only mode can yield short (or empty) row tuples: index
        # defensively, and coerce non-string cells (None, numbers) to "".
        # This mirrors the len(row) > n guards used in _load_ghm_ghs; the
        # original indexed row[0..3] directly and could raise IndexError.
        val = row[idx] if len(row) > idx else None
        return val.strip() if isinstance(val, str) else ""

    for row in ws.iter_rows(values_only=True):
        if not header_found:
            if row and row[0] == "GHM":
                header_found = True
            continue
        code = _cell(row, 0).upper()
        # Non-string / non-GHM cells (section titles, totals) are skipped.
        if not ghm_pat.match(code):
            continue
        rows.append((code, _cell(row, 1), _cell(row, 2), _cell(row, 3)))

    conn.executemany(
        "INSERT OR REPLACE INTO ghm (code, libelle, aso, da) VALUES (?, ?, ?, ?)",
        rows,
    )
    return len(rows)
|
||||
|
||||
|
||||
def _load_ghm_ghs(conn: sqlite3.Connection) -> int:
    """Load the GHM→GHS table from tarif_arrete_fev_2018.xlsx.

    Sheets "Tarifs public" (secteur='public') and "Tarifs privé"
    (secteur='prive'). Each data row is one (GHS, GHM, libellé, tarif)
    tuple. Returns the number of rows inserted, both sectors combined.
    """
    import openpyxl

    xlsx_path = SOURCES_DIR / "tarif_arrete_fev_2018.xlsx"
    if not xlsx_path.exists():
        raise FileNotFoundError(f"Tarifs XLSX introuvable : {xlsx_path}")

    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
    ghm_pat = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")
    all_rows: list[tuple[str, str, str, str, float | None]] = []
    for sheet_name, secteur in [("Tarifs public", "public"), ("Tarifs privé", "prive")]:
        if sheet_name not in wb.sheetnames:
            continue
        ws = wb[sheet_name]
        header_found = False
        for row in ws.iter_rows(values_only=True):
            if not header_found:
                # Skip everything up to and including the "GHS" header row.
                if row and isinstance(row[0], str) and row[0].strip().upper() == "GHS":
                    header_found = True
                continue
            # read_only mode can yield short or empty row tuples: guard every
            # index (the original read row[0] unguarded and could IndexError).
            ghs_raw = row[0] if len(row) > 0 else None
            ghm_raw = row[1] if len(row) > 1 else None
            lib_raw = row[2] if len(row) > 2 else None
            tarif_raw = row[5] if len(row) > 5 else None
            if ghs_raw is None or ghm_raw is None:
                continue
            try:
                # GHS cells may come through as floats (e.g. 4323.0).
                ghs = str(int(float(ghs_raw)))
            except (ValueError, TypeError):
                continue
            ghm = str(ghm_raw).strip().upper()
            if not ghm_pat.match(ghm):
                continue
            libelle = str(lib_raw).strip() if lib_raw else ""
            try:
                tarif = float(tarif_raw) if tarif_raw is not None else None
            except (ValueError, TypeError):
                tarif = None
            all_rows.append((ghm, ghs, secteur, libelle, tarif))

    conn.executemany(
        "INSERT OR REPLACE INTO ghm_ghs (ghm, ghs, secteur, libelle, tarif) "
        "VALUES (?, ?, ?, ?, ?)",
        all_rows,
    )
    return len(all_rows)
|
||||
|
||||
|
||||
def build_database(db_path: Path = DB_PATH, verbose: bool = True) -> dict[str, int]:
    """Build the SQLite database from the downloaded ATIH sources.

    Returns the row counts per table. Idempotent: DROP + CREATE + INSERT.
    Raises FileNotFoundError when a source file is missing (see _load_*).
    """
    REFERENTIALS_DIR.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(db_path)
    try:
        _create_schema(conn)
        n_cim10 = _load_cim10(conn)
        if verbose:
            print(f" CIM-10 : {n_cim10} codes chargés")
        n_ccam = _load_ccam(conn)
        if verbose:
            print(f" CCAM : {n_ccam} codes chargés")
        n_ghm = _load_ghm(conn)
        if verbose:
            print(f" GHM : {n_ghm} codes chargés")
        n_ghs = _load_ghm_ghs(conn)
        if verbose:
            print(f" GHM→GHS : {n_ghs} lignes (public+privé)")

        # Record provenance and counts for auditability (shown by --stats).
        conn.executemany(
            "INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)",
            [
                ("source_cim10", "ATIH CIM-10 FR 2019 ClaML (substitut 2018)"),
                ("source_ccam", "ATIH CCAM descriptive à usage PMSI 2018 V5"),
                ("source_ghm", "ATIH regroupement_ghm_v2018.xlsx"),
                ("source_ghm_ghs", "ATIH tarif_arrete_fev_2018.xlsx"),
                ("n_cim10", str(n_cim10)),
                ("n_ccam", str(n_ccam)),
                ("n_ghm", str(n_ghm)),
                ("n_ghm_ghs", str(n_ghs)),
            ],
        )
        conn.commit()
        return {
            "cim10": n_cim10,
            "ccam": n_ccam,
            "ghm": n_ghm,
            "ghm_ghs": n_ghs,
        }
    finally:
        conn.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Accès à la base (connexion cachée au niveau du module)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Module-level cached read-only connection, opened lazily by _get_conn().
_CONN: sqlite3.Connection | None = None


def _get_conn() -> sqlite3.Connection:
    """Return the shared read-only SQLite connection, opening it on first use.

    Raises FileNotFoundError when the database has not been built yet.
    """
    global _CONN
    if _CONN is not None:
        return _CONN
    if not DB_PATH.exists():
        raise FileNotFoundError(
            f"Base SQLite introuvable : {DB_PATH}. "
            "Lancez d'abord : python -m pipeline.referentials --build"
        )
    # mode=ro: the validation API never writes. NOTE(review):
    # check_same_thread=False shares one connection across threads — safe only
    # if callers do not interleave cursors concurrently; confirm usage.
    _CONN = sqlite3.connect(f"file:{DB_PATH}?mode=ro", uri=True, check_same_thread=False)
    return _CONN
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# API publique de validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@lru_cache(maxsize=8192)
def is_valid_cim10(code: str) -> bool:
    """Check that a CIM-10 code exists in the 2018 referential (2019 substitute).

    The input is normalised first (dot, spaces and PMSI suffixes stripped);
    anything not matching the CIM-10 shape is rejected without a DB lookup.
    """
    norm = _normalize_cim10(code)
    if not norm or not _RE_CIM10.match(norm):
        return False
    cur = _get_conn().execute("SELECT 1 FROM cim10 WHERE code = ? LIMIT 1", (norm,))
    return cur.fetchone() is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=8192)
def is_valid_ccam(code: str) -> bool:
    """Check that a CCAM code exists in the CCAM PMSI 2018 (extension stripped)."""
    norm = _normalize_ccam(code)
    if not norm or not _RE_CCAM.match(norm):
        return False
    cur = _get_conn().execute("SELECT 1 FROM ccam WHERE code = ? LIMIT 1", (norm,))
    return cur.fetchone() is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=8192)
def is_valid_ghm(code: str) -> bool:
    """Check that a GHM code exists in the V2018 classification."""
    norm = _normalize_ghm(code)
    if not norm or not _RE_GHM.match(norm):
        return False
    cur = _get_conn().execute("SELECT 1 FROM ghm WHERE code = ? LIMIT 1", (norm,))
    return cur.fetchone() is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=8192)
def is_valid_ghs(code: str) -> bool:
    """Check that a GHS number appears in the February 2018 tariff decree."""
    norm = _normalize_ghs(code)
    if not norm or not _RE_GHS.match(norm):
        return False
    # A GHS is valid if it appears in any GHM→GHS row, either sector.
    cur = _get_conn().execute(
        "SELECT 1 FROM ghm_ghs WHERE ghs = ? LIMIT 1", (norm,)
    )
    return cur.fetchone() is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=4096)
def get_cim10_libelle(code: str) -> str | None:
    """Return the official label for a CIM-10 code, or None if unknown."""
    norm = _normalize_cim10(code)
    if not norm:
        return None
    cur = _get_conn().execute(
        "SELECT libelle FROM cim10 WHERE code = ? LIMIT 1", (norm,)
    )
    row = cur.fetchone()
    return row[0] if row else None
|
||||
|
||||
|
||||
def ghm_to_ghs(ghm: str) -> list[str]:
    """Return the possible GHS numbers for a GHM (public and private merged).

    Used to check the coherence of an extracted (GHM, GHS) pair.
    """
    norm = _normalize_ghm(ghm)
    if not norm:
        return []
    cur = _get_conn().execute(
        "SELECT DISTINCT ghs FROM ghm_ghs WHERE ghm = ?", (norm,)
    )
    return [r[0] for r in cur.fetchall()]
|
||||
|
||||
|
||||
def _levenshtein(a: str, b: str) -> int:
    """Edit distance between two code strings (rapidfuzz when available)."""
    if _HAS_RAPIDFUZZ:
        return _Lev.distance(a, b)
    # Pure-Python fallback, O(len(a)*len(b)) — fine for short medical codes.
    # Classic two-row dynamic programming over the shorter string.
    if len(a) < len(b):
        a, b = b, a
    if not b:
        return len(a)
    previous = list(range(len(b) + 1))
    for row_idx, char_a in enumerate(a, 1):
        current = [row_idx]
        for col_idx, char_b in enumerate(b, 1):
            insertion = current[col_idx - 1] + 1
            deletion = previous[col_idx] + 1
            substitution = previous[col_idx - 1] + (char_a != char_b)
            current.append(min(insertion, deletion, substitution))
        previous = current
    return previous[-1]
|
||||
|
||||
|
||||
def nearest_cim10(code: str, max_distance: int = 1) -> str | None:
    """Find the closest valid CIM-10 code by Levenshtein distance.

    Useful to repair common OCR confusions (O/0, I/1, B/8…).
    Tie-breaking when several candidates share the minimal distance:
    1. prefer a candidate of the same length (substitution over deletion);
    2. then ascending lexicographic order.
    Returns None when no code lies within ``max_distance`` edits.
    """
    norm = _normalize_cim10(code)
    if not norm:
        return None
    # Already valid: the code itself is its own best match.
    if is_valid_cim10(norm):
        return norm
    conn = _get_conn()
    length = len(norm)
    # Pre-filter by length: a candidate differing by more than max_distance
    # in length cannot be within max_distance edits.
    cur = conn.execute(
        "SELECT code FROM cim10 WHERE length(code) BETWEEN ? AND ?",
        (length - max_distance, length + max_distance),
    )
    candidates: list[tuple[int, int, str]] = []  # (distance, |len_diff|, code)
    for (cand,) in cur:
        d = _levenshtein(norm, cand)
        if d <= max_distance:
            candidates.append((d, abs(len(cand) - length), cand))
    if not candidates:
        return None
    # Sort: minimal distance, then closest length, then lexicographic.
    candidates.sort(key=lambda t: (t[0], t[1], t[2]))
    return candidates[0][2]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests légers (exécutables sans pytest)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _run_selftest() -> int:
    """Quick smoke tests against the built database. Returns the failure count."""
    failures = 0

    def check(label: str, cond: bool, detail: str = "") -> None:
        # Print one PASS/FAIL line and tally failures in the enclosing scope.
        nonlocal failures
        status = "OK  " if cond else "FAIL"
        print(f" [{status}] {label}{(' — ' + detail) if detail else ''}")
        if not cond:
            failures += 1

    print("=== Tests référentiels ATIH 2018 ===")

    # CIM-10
    check("CIM-10 K650 valide (péritonite)", is_valid_cim10("K650"))
    check("CIM-10 K65.0 (avec point) valide", is_valid_cim10("K65.0"))
    check("CIM-10 T814 valide", is_valid_cim10("T814"))
    check("CIM-10 ZZZ99 invalide", not is_valid_cim10("ZZZ99"))
    check("CIM-10 libellé K650", get_cim10_libelle("K650") is not None,
          detail=str(get_cim10_libelle("K650")))
    # OCR correction: K65O (letter O) → K650
    suggestion = nearest_cim10("K65O")
    check("CIM-10 nearest(K65O) = K650", suggestion == "K650",
          detail=f"got={suggestion}")

    # CCAM
    check("CCAM EBFA012 valide", is_valid_ccam("EBFA012"))
    check("CCAM EBFA012-1 (ext PMSI) valide", is_valid_ccam("EBFA012-1"))
    check("CCAM AAAA000 invalide", not is_valid_ccam("AAAA000"))

    # GHM
    check("GHM 01C031 valide", is_valid_ghm("01C031"))
    check("GHM 99Z99Z invalide", not is_valid_ghm("99Z99Z"))

    # GHS
    check("GHS 22 valide", is_valid_ghs("22"))
    check("GHS 99999 invalide", not is_valid_ghs("99999"))

    # GHM→GHS mapping
    ghs_list = ghm_to_ghs("01C031")
    check("GHM 01C031 → GHS inclut 22", "22" in ghs_list,
          detail=f"ghs_list={ghs_list}")

    # Invalid-format robustness
    check("is_valid_cim10('') = False", not is_valid_cim10(""))
    check("is_valid_ccam(None cast) = False", not is_valid_ccam(""))

    print(f"=== Résultat : {failures} échec(s) ===")
    return failures
|
||||
|
||||
|
||||
def _cli() -> int:
    """Command-line entry point: --build / --test / --stats (mutually exclusive).

    Returns a process exit code (0 on success, 1 when self-tests fail).
    """
    parser = argparse.ArgumentParser(description="Référentiels ATIH 2018")
    g = parser.add_mutually_exclusive_group(required=True)
    g.add_argument("--build", action="store_true",
                   help="(Re)construit la base SQLite depuis referentials/sources/")
    g.add_argument("--test", action="store_true",
                   help="Exécute les tests de fumée")
    g.add_argument("--stats", action="store_true",
                   help="Affiche les comptages de la base")
    args = parser.parse_args()

    if args.build:
        print(f"Construction de {DB_PATH} depuis {SOURCES_DIR}...")
        counts = build_database()
        print("OK :", counts)
        return 0
    if args.test:
        # Collapse the failure count to a conventional exit code.
        return 1 if _run_selftest() > 0 else 0
    if args.stats:
        conn = _get_conn()
        for tbl in ("cim10", "ccam", "ghm", "ghm_ghs"):
            n = conn.execute(f"SELECT COUNT(*) FROM {tbl}").fetchone()[0]
            print(f" {tbl:10s}: {n}")
        print("Metadata :")
        for k, v in conn.execute("SELECT key, value FROM metadata"):
            print(f" {k}: {v}")
        return 0
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Module doubles as a CLI: --build / --test / --stats (see _cli).
    sys.exit(_cli())
|
||||
217
pipeline/validation.py
Normal file
217
pipeline/validation.py
Normal file
@@ -0,0 +1,217 @@
|
||||
"""Validation ATIH des codes extraits.
|
||||
|
||||
Prend un JSON d'extraction produit par `pipeline/extract.py` et l'enrichit
|
||||
d'une section `_validation` par champ de code médical (CIM-10, CCAM, GHM, GHS)
|
||||
avec :
|
||||
|
||||
- `valid` : le code existe dans le référentiel ATIH 2018
|
||||
- `suggestion` : si invalide, code le plus proche par Levenshtein ≤ 1 (CIM-10)
|
||||
- `libelle_ref` : libellé officiel ATIH (CIM-10) pour audit
|
||||
|
||||
Plus des cross-checks (GHS ∈ ghm_to_ghs(GHM)) pour détecter des incohérences
|
||||
de groupage.
|
||||
|
||||
Principes :
|
||||
- Lecture seule sur le JSON source — on produit une COPIE enrichie.
|
||||
- Ne supprime / ne corrige RIEN automatiquement ; seule une suggestion est
|
||||
annotée. La correction reste à la discrétion d'un humain (overlay) ou d'un
|
||||
prochain pass automatique.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
from typing import Any
|
||||
|
||||
from .referentials import (
|
||||
get_cim10_libelle,
|
||||
ghm_to_ghs,
|
||||
is_valid_ccam,
|
||||
is_valid_cim10,
|
||||
is_valid_ghm,
|
||||
is_valid_ghs,
|
||||
nearest_cim10,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Helpers
|
||||
# ============================================================
|
||||
|
||||
def _check_cim10(code: str) -> dict:
    """Validate one CIM-10 code; attach the official label or an OCR suggestion.

    Returns {"code", "valid"} where valid is None for an empty field; valid
    entries also carry "libelle_ref", invalid ones may carry "suggestion"
    and "suggestion_libelle" (closest code at Levenshtein ≤ 1).
    """
    code = (code or "").strip()
    if not code:
        return {"code": "", "valid": None}
    if is_valid_cim10(code):
        return {"code": code, "valid": True, "libelle_ref": get_cim10_libelle(code)}
    entry = {"code": code, "valid": False}
    sug = nearest_cim10(code, max_distance=1)
    if sug:
        entry["suggestion"] = sug
        entry["suggestion_libelle"] = get_cim10_libelle(sug)
    return entry
|
||||
|
||||
|
||||
def _check_ccam(code: str) -> dict:
    """Validate one CCAM act code against the 2018 referential."""
    stripped = (code or "").strip()
    if not stripped:
        return {"code": "", "valid": None}
    return {"code": stripped, "valid": is_valid_ccam(stripped)}
|
||||
|
||||
|
||||
def _check_ghm(code: str) -> dict:
    """Validate a GHM code; a valid code also gets its possible GHS list."""
    cleaned = (code or "").strip()
    if not cleaned:
        return {"code": "", "valid": None}

    result = {"code": cleaned, "valid": is_valid_ghm(cleaned)}
    if result["valid"]:
        # Attach the candidate GHS so downstream cross-checks are cheap.
        result["ghs_possibles"] = ghm_to_ghs(cleaned)
    return result
|
||||
|
||||
|
||||
def _check_ghs(code: str) -> dict:
    """Validate a GHS number; `valid` is None when the field is empty."""
    cleaned = (code or "").strip()
    if cleaned:
        return {"code": cleaned, "valid": is_valid_ghs(cleaned)}
    return {"code": "", "valid": None}
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Validation d'un bloc codage (etab ou reco)
|
||||
# ============================================================
|
||||
|
||||
def _validate_codage(codage: dict) -> dict:
    """Validate a `codage_etab` / `codage_reco` block (DP, DR, DAS list).

    Non-dict input yields {} so a malformed extraction never crashes the pass.
    """
    if not isinstance(codage, dict):
        return {}

    checked = {
        "dp": _check_cim10(codage.get("dp", "")),
        "dr": _check_cim10(codage.get("dr", "")),
    }
    das = codage.get("das") or []
    if isinstance(das, list):
        # DAS entries may be plain strings or {"code": ...} dicts.
        checked["das"] = [
            _check_cim10(item.get("code", "")) if isinstance(item, dict)
            else _check_cim10(str(item))
            for item in das
        ]
    return checked
|
||||
|
||||
|
||||
def _validate_actes(actes: Any) -> list[dict]:
    """Validate a list of CCAM acts (plain strings or {"code": ...} dicts)."""
    if not isinstance(actes, list):
        return []
    results = []
    for acte in actes:
        raw = acte.get("code", "") if isinstance(acte, dict) else str(acte)
        results.append(_check_ccam(raw))
    return results
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Cross-checks GHM ↔ GHS
|
||||
# ============================================================
|
||||
|
||||
def _cross_check_ghm_ghs(ghm: str, ghs: str) -> dict:
    """Check that the observed GHS belongs to the GHS set allowed by the GHM.

    Returns {"checked": False, "reason": ...} when the check cannot be run,
    otherwise a dict with `coherent` plus the extracted and expected GHS.
    """
    ghm = (ghm or "").strip()
    ghs = (ghs or "").strip()
    if not (ghm and ghs):
        return {"checked": False, "reason": "ghm ou ghs manquant"}
    if not is_valid_ghm(ghm):
        return {"checked": False, "reason": "GHM invalide"}

    possibles = ghm_to_ghs(ghm)
    # Loose suffix match too, in case one of the two codes was truncated.
    coherent = ghs in possibles
    if not coherent:
        coherent = any(p.endswith(ghs) or ghs.endswith(p) for p in possibles)
    return {
        "checked": True,
        "coherent": coherent,
        "ghs_extrait": ghs,
        "ghs_possibles": possibles,
    }
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Point d'entrée
|
||||
# ============================================================
|
||||
|
||||
def validate_recueil(recueil: dict) -> dict:
    """Build the full `_validation` block for the 'recueil' page.

    Validates both coding sides (etab / reco), all acts, the GHM/GHS pairs,
    runs the GHM<->GHS cross-checks, and appends an aggregate `summary`.
    """
    # NOTE: insertion order is deliberate — it is preserved in the JSON output.
    result: dict = {}
    result["codage_etab"] = _validate_codage(recueil.get("codage_etab", {}))
    result["codage_reco"] = _validate_codage(recueil.get("codage_reco", {}))
    result["actes_etab"] = _validate_actes(recueil.get("actes_etab", []))
    result["actes_reco"] = _validate_actes(recueil.get("actes_reco", []))
    result["ghm_etab"] = _check_ghm(recueil.get("ghm_etab", ""))
    result["ghs_etab"] = _check_ghs(recueil.get("ghs_etab", ""))
    result["ghm_reco"] = _check_ghm(recueil.get("ghm_reco", ""))
    result["ghs_reco"] = _check_ghs(recueil.get("ghs_reco", ""))
    result["cross_checks"] = {
        side: _cross_check_ghm_ghs(
            recueil.get(f"ghm_{side}", ""), recueil.get(f"ghs_{side}", ""))
        for side in ("etab", "reco")
    }
    result["summary"] = _summarize(result)
    return result
|
||||
|
||||
|
||||
def _summarize(validation: dict) -> dict:
|
||||
"""Compte les codes valides / invalides dans une section _validation."""
|
||||
valid, invalid, empty = 0, 0, 0
|
||||
|
||||
def _count_entry(e):
|
||||
nonlocal valid, invalid, empty
|
||||
if e.get("valid") is True: valid += 1
|
||||
elif e.get("valid") is False: invalid += 1
|
||||
else: empty += 1
|
||||
|
||||
for section in ("codage_etab", "codage_reco"):
|
||||
sec = validation.get(section, {}) or {}
|
||||
_count_entry(sec.get("dp", {}))
|
||||
_count_entry(sec.get("dr", {}))
|
||||
for d in sec.get("das", []) or []:
|
||||
_count_entry(d)
|
||||
for actes_key in ("actes_etab", "actes_reco"):
|
||||
for a in validation.get(actes_key, []) or []:
|
||||
_count_entry(a)
|
||||
for g in ("ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco"):
|
||||
_count_entry(validation.get(g, {}))
|
||||
|
||||
cc = validation.get("cross_checks", {})
|
||||
incoherent = sum(1 for v in cc.values() if v.get("checked") and not v.get("coherent"))
|
||||
return {
|
||||
"valid": valid, "invalid": invalid, "empty": empty,
|
||||
"total_codes": valid + invalid,
|
||||
"ghm_ghs_incoherents": incoherent,
|
||||
}
|
||||
|
||||
|
||||
def annotate(extraction: dict) -> dict:
    """Return a deep COPY of *extraction* enriched with `_validation` blocks.

    The input is never mutated and no extracted value is altered or removed:
    validation results are purely additive annotations per structured page.
    """
    annotated = deepcopy(extraction)
    ext = annotated.get("extraction") or {}

    recueil = ext.get("recueil")
    if isinstance(recueil, dict):
        recueil["_validation"] = validate_recueil(recueil)

    # Concertation 2 page carries three GHS fields to validate.
    concertation = ext.get("concertation_2")
    if isinstance(concertation, dict):
        concertation["_validation"] = {
            key: _check_ghs(concertation.get(key, ""))
            for key in ("ghs_initial", "ghs_avant_concertation", "ghs_final")
        }
    return annotated
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Quick smoke test: annotate one extraction JSON and print its summary.
    import json
    import sys

    path = sys.argv[1] if len(sys.argv) > 1 else "output/v2/OGC 7.json"
    # Explicit encoding: the extractions contain accented French; the default
    # locale encoding is not UTF-8 on every platform (e.g. Windows cp1252).
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    annotated = annotate(data)
    rec_v = annotated["extraction"]["recueil"]["_validation"]
    print(json.dumps(rec_v["summary"], indent=2))
    print("\ncross_checks:", json.dumps(rec_v["cross_checks"], indent=2, ensure_ascii=False))
|
||||
BIN
referentials/atih_2018.sqlite
Normal file
BIN
referentials/atih_2018.sqlite
Normal file
Binary file not shown.
BIN
referentials/sources/ccam_2018_v5.xlsx
Normal file
BIN
referentials/sources/ccam_2018_v5.xlsx
Normal file
Binary file not shown.
BIN
referentials/sources/cim.json.gz
Normal file
BIN
referentials/sources/cim.json.gz
Normal file
Binary file not shown.
BIN
referentials/sources/cim10_claml_2019.zip
Normal file
BIN
referentials/sources/cim10_claml_2019.zip
Normal file
Binary file not shown.
283
referentials/sources/cim10_claml_2019_extracted/ClaML.dtd
Normal file
283
referentials/sources/cim10_claml_2019_extracted/ClaML.dtd
Normal file
@@ -0,0 +1,283 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!ENTITY % rubric.simple "#PCDATA | Reference | Term">
|
||||
<!ENTITY % rubric.complex "%rubric.simple; | Para | Include |
|
||||
IncludeDescendants| Fragment | List | Table">
|
||||
|
||||
<!ELEMENT ClaML (
|
||||
Meta*,
|
||||
Identifier*,
|
||||
Title,
|
||||
Authors?,
|
||||
Variants?,
|
||||
ClassKinds,
|
||||
UsageKinds?,
|
||||
RubricKinds,
|
||||
Modifier*,
|
||||
ModifierClass*,
|
||||
Class*)
|
||||
>
|
||||
<!ATTLIST ClaML
|
||||
version CDATA #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT Variants (Variant+)>
|
||||
<!ELEMENT Variant (#PCDATA)>
|
||||
<!ATTLIST Variant
|
||||
name ID #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT Meta EMPTY>
|
||||
<!ATTLIST Meta
|
||||
name CDATA #REQUIRED
|
||||
value CDATA #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Identifier EMPTY>
|
||||
<!ATTLIST Identifier
|
||||
authority NMTOKEN #IMPLIED
|
||||
uid CDATA #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT Title (#PCDATA)>
|
||||
<!ATTLIST Title
|
||||
name NMTOKEN #REQUIRED
|
||||
version CDATA #IMPLIED
|
||||
date CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Authors (Author* )>
|
||||
<!ELEMENT Author (#PCDATA)>
|
||||
<!ATTLIST Author
|
||||
name ID #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT ClassKinds (ClassKind+)>
|
||||
<!ELEMENT RubricKinds (RubricKind+)>
|
||||
<!ELEMENT UsageKinds (UsageKind+)>
|
||||
|
||||
<!ELEMENT ClassKind (Display*)>
|
||||
<!ATTLIST ClassKind
|
||||
name ID #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT RubricKind (Display*)>
|
||||
<!ATTLIST RubricKind
|
||||
name ID #REQUIRED
|
||||
inherited (true|false) "true"
|
||||
>
|
||||
|
||||
<!ELEMENT UsageKind EMPTY>
|
||||
<!ATTLIST UsageKind
|
||||
name ID #REQUIRED
|
||||
mark CDATA #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT Display (#PCDATA)>
|
||||
<!ATTLIST Display
|
||||
xml:lang NMTOKEN #REQUIRED
|
||||
variants IDREF #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Modifier (
|
||||
Meta*,
|
||||
SubClass*,
|
||||
Rubric*,
|
||||
History*)
|
||||
>
|
||||
<!ATTLIST Modifier
|
||||
code NMTOKEN #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT ModifierClass (
|
||||
Meta*,
|
||||
SuperClass,
|
||||
SubClass*,
|
||||
Rubric*,
|
||||
History*)
|
||||
>
|
||||
<!ATTLIST ModifierClass
|
||||
modifier NMTOKEN #REQUIRED
|
||||
code NMTOKEN #REQUIRED
|
||||
usage IDREF #IMPLIED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Class (
|
||||
Meta*,
|
||||
SuperClass*,
|
||||
SubClass*,
|
||||
ModifiedBy*,
|
||||
ExcludeModifier*,
|
||||
Rubric*,
|
||||
History*)
|
||||
>
|
||||
<!ATTLIST Class
|
||||
code CDATA #REQUIRED
|
||||
kind IDREF #REQUIRED
|
||||
usage IDREF #IMPLIED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT ModifiedBy (
|
||||
Meta*,
|
||||
ValidModifierClass*)
|
||||
>
|
||||
<!ATTLIST ModifiedBy
|
||||
code NMTOKEN #REQUIRED
|
||||
all (true|false) "true"
|
||||
position CDATA #IMPLIED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT ExcludeModifier EMPTY>
|
||||
<!ATTLIST ExcludeModifier
|
||||
code NMTOKEN #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT ValidModifierClass EMPTY>
|
||||
<!ATTLIST ValidModifierClass
|
||||
code NMTOKEN #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Rubric (
|
||||
Label+,
|
||||
History*)
|
||||
>
|
||||
<!ATTLIST Rubric
|
||||
id ID #IMPLIED
|
||||
kind IDREF #REQUIRED
|
||||
usage IDREF #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Label (%rubric.complex;)*>
|
||||
<!ATTLIST Label
|
||||
xml:lang NMTOKEN #REQUIRED
|
||||
xml:space (default|preserve) "default"
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT History (#PCDATA)>
|
||||
<!ATTLIST History
|
||||
author IDREF #REQUIRED
|
||||
date NMTOKEN #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT SuperClass EMPTY>
|
||||
<!ATTLIST SuperClass
|
||||
code CDATA #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT SubClass EMPTY>
|
||||
<!ATTLIST SubClass
|
||||
code CDATA #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Reference (#PCDATA)>
|
||||
<!ATTLIST Reference
|
||||
class CDATA #IMPLIED
|
||||
authority NMTOKEN #IMPLIED
|
||||
uid NMTOKEN #IMPLIED
|
||||
code CDATA #IMPLIED
|
||||
usage IDREF #IMPLIED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Para (%rubric.simple;)*>
|
||||
<!ATTLIST Para
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Fragment (%rubric.simple;)*>
|
||||
<!ATTLIST Fragment
|
||||
class CDATA #IMPLIED
|
||||
usage IDREF #IMPLIED
|
||||
type (item | list) "item"
|
||||
>
|
||||
|
||||
<!ELEMENT Include EMPTY>
|
||||
<!ATTLIST Include
|
||||
class CDATA #IMPLIED
|
||||
rubric IDREF #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT IncludeDescendants EMPTY>
|
||||
<!ATTLIST IncludeDescendants
|
||||
code NMTOKEN #REQUIRED
|
||||
kind IDREF #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT List (ListItem+)>
|
||||
<!ATTLIST List
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT ListItem (
|
||||
%rubric.simple;
|
||||
| Para
|
||||
| Include
|
||||
| List
|
||||
| Table)*
|
||||
>
|
||||
<!ATTLIST ListItem
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Table (
|
||||
Caption?,
|
||||
THead?,
|
||||
TBody?,
|
||||
TFoot?)
|
||||
>
|
||||
<!ATTLIST Table
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Caption (%rubric.simple;)*>
|
||||
<!ATTLIST Caption
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT THead (Row+)>
|
||||
<!ATTLIST THead
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT TBody (Row+)>
|
||||
<!ATTLIST TBody
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT TFoot (Row+)>
|
||||
<!ATTLIST TFoot
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Row (Cell*)>
|
||||
<!ATTLIST Row
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Cell (
|
||||
%rubric.simple;
|
||||
| Para
|
||||
| Include
|
||||
| List
|
||||
| Table)*
|
||||
>
|
||||
<!ATTLIST Cell
|
||||
class CDATA #IMPLIED
|
||||
rowspan CDATA #IMPLIED
|
||||
colspan CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Term (#PCDATA)>
|
||||
<!ATTLIST Term
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
154659
referentials/sources/cim10_claml_2019_extracted/cim10_claml_2019.xml
Normal file
154659
referentials/sources/cim10_claml_2019_extracted/cim10_claml_2019.xml
Normal file
File diff suppressed because it is too large
Load Diff
BIN
referentials/sources/cim_libelle.json.gz
Normal file
BIN
referentials/sources/cim_libelle.json.gz
Normal file
Binary file not shown.
BIN
referentials/sources/ghm_intermediaire.json.gz
Normal file
BIN
referentials/sources/ghm_intermediaire.json.gz
Normal file
Binary file not shown.
BIN
referentials/sources/ghs_prive.json.gz
Normal file
BIN
referentials/sources/ghs_prive.json.gz
Normal file
Binary file not shown.
BIN
referentials/sources/ghs_public.json.gz
Normal file
BIN
referentials/sources/ghs_public.json.gz
Normal file
Binary file not shown.
BIN
referentials/sources/regroupement_ghm_v2018.xlsx
Normal file
BIN
referentials/sources/regroupement_ghm_v2018.xlsx
Normal file
Binary file not shown.
BIN
referentials/sources/tarif_arrete_fev_2018.xlsx
Normal file
BIN
referentials/sources/tarif_arrete_fev_2018.xlsx
Normal file
Binary file not shown.
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
160
tests/test_referentials.py
Normal file
160
tests/test_referentials.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Tests unitaires du module pipeline.referentials.
|
||||
|
||||
Compatible pytest ET exécution directe (`python tests/test_referentials.py`).
|
||||
Nécessite que la base SQLite ait déjà été construite :
|
||||
python -m pipeline.referentials --build
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Permet l'exécution directe depuis tests/ sans installer le package.
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from pipeline.referentials import ( # noqa: E402
|
||||
DB_PATH,
|
||||
ghm_to_ghs,
|
||||
get_cim10_libelle,
|
||||
is_valid_ccam,
|
||||
is_valid_cim10,
|
||||
is_valid_ghm,
|
||||
is_valid_ghs,
|
||||
nearest_cim10,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CIM-10
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_cim10_codes_valides():
    """A handful of everyday PMSI codes must all be in the referential."""
    common_codes = [
        "K650",  # acute peritonitis
        "T814",  # infection following a procedure
        "I10",   # essential hypertension
        "Z515",  # palliative care
        "C509",  # malignant neoplasm of breast
    ]
    for code in common_codes:
        assert is_valid_cim10(code)
|
||||
|
||||
|
||||
def test_cim10_normalisation():
    """Dotted form (K65.0), lowercase and surrounding spaces are accepted."""
    for variant in ("K65.0", "k650", " K650 "):
        assert is_valid_cim10(variant)
|
||||
|
||||
|
||||
def test_cim10_codes_invalides():
    """Malformed or unknown codes must be rejected."""
    rejected = [
        "",          # empty field
        "ZZZ99",     # unknown code
        "K6501234",  # too long
        "1234",      # does not start with a letter
        "K65X",      # non-numeric suffix
    ]
    for code in rejected:
        assert not is_valid_cim10(code)
|
||||
|
||||
|
||||
def test_cim10_libelle():
    """The official label of K650 mentions peritonitis (accented or not)."""
    lib = get_cim10_libelle("K650")
    # Parenthesised on purpose: the previous `a and b or c` parsed as
    # `(a and b) or c`, so the `is not None` guard did not cover the second
    # membership test and a missing label would raise AttributeError on
    # `None.lower()` instead of failing the assertion.
    assert lib is not None and ("éritonit" in lib.lower() or "peritonit" in lib.lower())
|
||||
|
||||
|
||||
def test_cim10_nearest_correction_ocr():
    """Nearest-neighbour lookup fixes classic OCR confusions."""
    expectations = {
        "K65O": "K650",  # letter O read instead of digit 0
        "K650": "K650",  # already valid: returned unchanged
    }
    for raw, expected in expectations.items():
        assert nearest_cim10(raw) == expected
    # Nothing within edit distance 1 of this garbage.
    assert nearest_cim10("ZZZZZ", max_distance=1) is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CCAM
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_ccam_codes_valides():
    """Known CCAM acts pass, case-insensitively, PMSI extension tolerated."""
    accepted = (
        "EBFA012",
        "HFCC003",    # gastric bypass (listed in CCAM V5)
        "ebfa012",    # case-insensitive
        "EBFA012-1",  # PMSI extension suffix tolerated
    )
    for code in accepted:
        assert is_valid_ccam(code)
|
||||
|
||||
|
||||
def test_ccam_codes_invalides():
    """Unknown or malformed CCAM codes are rejected."""
    for code in ("AAAA000", "", "EBF012"):  # EBF012: 3 letters instead of 4
        assert not is_valid_ccam(code)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GHM
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_ghm_codes_valides():
    """A known GHM is accepted regardless of case."""
    for code in ("01C031", "01c031"):
        assert is_valid_ghm(code)
|
||||
|
||||
|
||||
def test_ghm_codes_invalides():
    """Unknown, empty or malformed GHM codes are rejected."""
    for code in ("99Z99Z", "", "ABCDEF"):
        assert not is_valid_ghm(code)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GHS et couplage GHM→GHS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_ghs_valide():
    """GHS numbers accept leading zeros; unknown or empty values fail."""
    for accepted in ("22", "0022"):
        assert is_valid_ghs(accepted)
    for rejected in ("99999", ""):
        assert not is_valid_ghs(rejected)
|
||||
|
||||
|
||||
def test_ghm_to_ghs():
    """GHM 01C031 maps (among others) onto GHS 22; unknown GHM yields []."""
    assert "22" in ghm_to_ghs("01C031")
    assert ghm_to_ghs("99Z99Z") == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Exécution directe (sans pytest)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _main() -> int:
    """Run every test without pytest; return a shell-style exit code.

    0 = all green, 1 = at least one failure, 2 = SQLite base missing.
    """
    import traceback

    # Display names are derived from __name__, so the list stays in sync.
    tests = [
        test_cim10_codes_valides,
        test_cim10_normalisation,
        test_cim10_codes_invalides,
        test_cim10_libelle,
        test_cim10_nearest_correction_ocr,
        test_ccam_codes_valides,
        test_ccam_codes_invalides,
        test_ghm_codes_valides,
        test_ghm_codes_invalides,
        test_ghs_valide,
        test_ghm_to_ghs,
    ]
    if not DB_PATH.exists():
        print(f"ERREUR : base SQLite manquante ({DB_PATH}).")
        print("Exécute d'abord : python -m pipeline.referentials --build")
        return 2

    failures = 0
    for fn in tests:
        name = fn.__name__
        try:
            fn()
        except AssertionError as e:
            print(f" [FAIL] {name} — {e}")
            failures += 1
        except Exception:
            print(f" [ERR] {name}")
            traceback.print_exc()
            failures += 1
        else:
            print(f" [OK ] {name}")
    print(f"=== {len(tests) - failures}/{len(tests)} tests OK ===")
    return 0 if failures == 0 else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Direct execution without pytest: exit code reflects the test outcome.
    sys.exit(_main())
|
||||
Reference in New Issue
Block a user