feat(referentials): ATIH 2018 validation of medical codes

Adds a post-extraction validation layer against the official 2018
referentials published by ATIH (Agence Technique de l'Information sur
l'Hospitalisation). Zero tolerance on T2A codes: any invalid code is
flagged, and a nearest-neighbour correction (Levenshtein distance ≤ 1) is
suggested.

Contents:
- pipeline/referentials.py : public API is_valid_{cim10,ccam,ghm,ghs},
  get_cim10_libelle, nearest_cim10, ghm_to_ghs. CLI --build/--test/--stats.
- pipeline/validation.py    : annotates an extraction JSON with a
  `_validation` block per page (valid/invalid codes + suggestions +
  GHM↔GHS cross-checks).
- referentials/sources/     : raw public ATIH data (CIM-10 ClaML 2019 as a
  stand-in, CCAM v5 2018, GHM v2018, February 2018 tariffs).
- referentials/atih_2018.sqlite : ready-to-use SQLite database
  (11 623 CIM-10 · 8 147 CCAM · 2 593 GHM · 5 329 GHM→GHS pairs).
- tests/test_referentials.py : 11 unit tests (11/11 passing).
- annotate_validation.py    : script that annotates all V2 JSONs in place
  and produces validation_report.md.

CIM-10 note: ATIH publishes the 2018 edition only as a PDF, so the 2019
ClaML edition is used as a stand-in (known gap of roughly 60 codes out of
11,600).

PMSI suffix handling: `*` (CMA excluded by the DP) and `+N` (PMSI
extension) are stripped before validation; only the root code is checked
against the referential.
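
Illustrative behaviour of the public API (hypothetical session; example
codes are not taken from the dossiers):

    >>> from pipeline.referentials import is_valid_cim10, nearest_cim10
    >>> is_valid_cim10("C795 *")   # '*' suffix stripped, root C795 checked
    True
    >>> is_valid_cim10("K65O")     # OCR noise: letter O instead of digit 0
    False
    >>> nearest_cim10("K65O")      # Levenshtein <= 1 suggestion
    'K650'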

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Dom committed 2026-04-24 15:06:01 +02:00
commit 6df590ae95 (parent ed4d9bd765)
17 changed files with 156052 additions and 0 deletions

annotate_validation.py (new file, 136 lines)

@@ -0,0 +1,136 @@
"""Annote les JSONs V2 existants avec la validation ATIH.
Utile pour ajouter la validation sans relancer l'extraction complète.
Produit aussi un rapport agrégé en markdown.
"""
import json
from collections import defaultdict
from pathlib import Path
from pipeline.validation import annotate
OUT_DIR = Path("output/v2")
REPORT = Path("validation_report.md")
def annotate_all() -> list[dict]:
"""Annote chaque JSON et écrit le résultat en place (avec _validation)."""
results = []
for p in sorted(OUT_DIR.glob("OGC *.json")):
data = json.loads(p.read_text(encoding="utf-8"))
annotated = annotate(data)
p.write_text(json.dumps(annotated, ensure_ascii=False, indent=2), encoding="utf-8")
results.append(annotated)
rec_v = annotated.get("extraction", {}).get("recueil", {}).get("_validation", {})
s = rec_v.get("summary", {})
cc = rec_v.get("cross_checks", {})
print(f" {data['fichier']:8s} — valid={s.get('valid',0):2d} invalid={s.get('invalid',0):2d} "
f"empty={s.get('empty',0):2d} incoherent={s.get('ghm_ghs_incoherents',0)} "
f"etab={cc.get('etab',{}).get('coherent','?')} reco={cc.get('reco',{}).get('coherent','?')}")
return results
def build_report(results: list[dict]):
"""Agrégation par champ : taux de validité, suggestions les plus fréquentes."""
per_field = defaultdict(lambda: {"total": 0, "valid": 0, "invalid": 0, "empty": 0, "suggestions": []})
incoherences = []
for d in results:
name = d["fichier"]
rec_v = d.get("extraction", {}).get("recueil", {}).get("_validation", {})
if not rec_v:
continue
# Codes unitaires
for key in ["ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco"]:
entry = rec_v.get(key, {})
st = per_field[key]
st["total"] += 1
if entry.get("valid") is True: st["valid"] += 1
elif entry.get("valid") is False:
st["invalid"] += 1
if "suggestion" in entry:
st["suggestions"].append((name, entry["code"], entry["suggestion"]))
else: st["empty"] += 1
# Codage etab / reco : dp + dr + das
for section in ["codage_etab", "codage_reco"]:
sec = rec_v.get(section, {})
for sub in ["dp", "dr"]:
entry = sec.get(sub, {})
st = per_field[f"{section}.{sub}"]
st["total"] += 1
if entry.get("valid") is True: st["valid"] += 1
elif entry.get("valid") is False:
st["invalid"] += 1
if "suggestion" in entry:
st["suggestions"].append((name, entry["code"], entry["suggestion"]))
else: st["empty"] += 1
for das in sec.get("das", []) or []:
st = per_field[f"{section}.das"]
st["total"] += 1
if das.get("valid") is True: st["valid"] += 1
elif das.get("valid") is False:
st["invalid"] += 1
if "suggestion" in das:
st["suggestions"].append((name, das["code"], das["suggestion"]))
else: st["empty"] += 1
# Cohérence GHM ↔ GHS
for side in ["etab", "reco"]:
cc = rec_v.get("cross_checks", {}).get(side, {})
if cc.get("checked") and not cc.get("coherent"):
incoherences.append({
"dossier": name, "side": side,
"ghs_extrait": cc.get("ghs_extrait"),
"ghs_possibles": cc.get("ghs_possibles"),
})
# Markdown report
lines = ["# Rapport de validation ATIH — V2 (18 dossiers)\n"]
lines.append("## Couverture et validité par champ\n")
lines.append("| Champ | Total | Valid | Invalid | Vide | Validité codes renseignés |")
lines.append("|---|---:|---:|---:|---:|---:|")
for f, st in per_field.items():
renseignes = st["valid"] + st["invalid"]
ratio = (100 * st["valid"] / renseignes) if renseignes else 0
lines.append(f"| `{f}` | {st['total']} | {st['valid']} | {st['invalid']} | {st['empty']} | {ratio:.0f}% |")
# Suggestions OCR
lines.append("\n## Corrections OCR suggérées (Levenshtein ≤ 1)")
lines.append("\nCodes extraits invalides mais ressemblant à un code ATIH existant :\n")
lines.append("| Dossier | Champ | Code extrait | Suggestion |")
lines.append("|---|---|---|---|")
sugg_count = 0
for field, st in per_field.items():
for name, code, sug in st["suggestions"]:
lines.append(f"| {name} | `{field}` | `{code}` | **`{sug}`** |")
sugg_count += 1
if sugg_count == 0:
lines.append("| — | — | — | Aucune suggestion (pas de correction Levenshtein ≤ 1) |")
# Incohérences GHM ↔ GHS
lines.append("\n## Incohérences GHM ↔ GHS détectées\n")
if incoherences:
lines.append("| Dossier | Côté | GHS extrait | GHS possibles pour le GHM |")
lines.append("|---|---|---|---|")
for inc in incoherences:
lines.append(f"| {inc['dossier']} | {inc['side']} | `{inc['ghs_extrait']}` | {inc['ghs_possibles']} |")
else:
lines.append("✓ Aucune incohérence détectée sur les GHM/GHS extraits.")
lines.append(f"\n## Synthèse\n")
total_codes = sum(st["valid"] + st["invalid"] for st in per_field.values())
total_valid = sum(st["valid"] for st in per_field.values())
lines.append(f"- **{total_valid}/{total_codes} codes valides** ({100*total_valid/total_codes:.1f}%)")
lines.append(f"- **{sugg_count} suggestions de correction OCR** trouvées automatiquement")
lines.append(f"- **{len(incoherences)} incohérences GHM↔GHS** sur les paires extraites")
REPORT.write_text("\n".join(lines), encoding="utf-8")
print(f"\nRapport → {REPORT}")
if __name__ == "__main__":
print("Annotation en place des JSONs V2 + calcul validation ATIH...\n")
results = annotate_all()
build_report(results)

pipeline/referentials.py (new file, 597 lines)

@@ -0,0 +1,597 @@
"""Validation des codes médicaux contre les référentiels ATIH 2018.
Ce module charge les référentiels officiels ATIH (CIM-10, CCAM, GHM, table
GHM→GHS) dans une base SQLite locale et expose des fonctions de validation
pour les codes extraits par le pipeline OCR.
Sources téléchargées (voir `referentials/sources/`) :
- **CIM-10 FR 2019** au format ClaML XML (ATIH) — utilisée comme substitut
à la CIM-10 2018 : ATIH ne publie officiellement la CIM-10 2018 qu'en PDF.
L'écart entre CIM-10 2019 et CIM-10 2018 est < 100 codes sur ~11 600 ;
un écart acceptable pour une validation OCR (et qui peut introduire
quelques faux positifs pour des codes créés en 2019, mais jamais de faux
négatifs sur un code 2018 valide).
- **CCAM descriptive à usage PMSI 2018 V5** (XLSX ATIH).
- **GHM V2018** (XLSX ATIH, fichier `regroupement_ghm_v2018.xlsx`).
- **Arrêté tarifaire MCO Février 2018** (XLSX ATIH, feuilles "Tarifs public"
et "Tarifs privé") pour la table GHM→GHS.
Formats de codes supportés :
- CIM-10 : lettre + 2 à 5 chiffres (ex: K650, T814, sans point).
- CCAM : 4 lettres + 3 chiffres (ex: EBFA012).
- GHM : 2 chiffres + lettre + 2 à 3 chiffres, plus lettre de sévérité optionnelle (ex: 11M122).
- GHS : nombre 1-5 chiffres (ex: 4323).
Utilisation :
from pipeline.referentials import (
is_valid_cim10, is_valid_ccam, is_valid_ghm, is_valid_ghs,
nearest_cim10, ghm_to_ghs, get_cim10_libelle,
)
if not is_valid_cim10("K650"):
suggestion = nearest_cim10("K65O") # correction O → 0
Build initial de la base : ``python -m pipeline.referentials --build``
Test rapide : ``python -m pipeline.referentials --test``
"""
from __future__ import annotations
import argparse
import gzip
import json
import re
import sqlite3
import sys
import xml.etree.ElementTree as ET
from functools import lru_cache
from pathlib import Path
from typing import Iterable
try:
from rapidfuzz.distance import Levenshtein as _Lev
_HAS_RAPIDFUZZ = True
except ImportError: # pragma: no cover - fallback pur Python
_HAS_RAPIDFUZZ = False
_ROOT = Path(__file__).resolve().parent.parent
REFERENTIALS_DIR = _ROOT / "referentials"
SOURCES_DIR = REFERENTIALS_DIR / "sources"
DB_PATH = REFERENTIALS_DIR / "atih_2018.sqlite"
# Formats attendus (utilisés pour normaliser l'entrée avant recherche DB)
_RE_CIM10 = re.compile(r"^[A-Z][0-9]{2,5}$")
_RE_CCAM = re.compile(r"^[A-Z]{4}[0-9]{3}$")
_RE_GHM = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")
_RE_GHS = re.compile(r"^[0-9]{1,5}$")
# ---------------------------------------------------------------------------
# Normalisation des entrées (tolérante aux bruits OCR courants)
# ---------------------------------------------------------------------------
def _normalize_cim10(code: str) -> str:
"""Normalise un code CIM-10 extrait pour comparaison au référentiel.
Gère :
    - Point décimal optionnel : "K65.0" → "K650"
    - Espaces / casse : " k650 " → "K650"
    - Suffixes PMSI : "C795 *" → "C795" (le `*` signifie "CMA exclue par le DP")
      et "K635+0" → "K635" (le `+N` est une extension PMSI à valider séparément)
    - Suffixe de position numérique éventuellement collé : "K650+" → "K650"
"""
if not code:
return ""
s = code.strip().upper()
# Couper à la première occurrence d'un marqueur PMSI non-alphanum
# (*, +, #, espace suivi d'un marqueur). On garde uniquement la tête du code.
for sep in ("*", "+", "#"):
if sep in s:
s = s.split(sep, 1)[0]
return s.replace(".", "").replace(" ", "").strip()
def _normalize_ccam(code: str) -> str:
if not code:
return ""
# Retire éventuelle extension PMSI (-1, -2…) et les espaces
base = code.split("-")[0]
return base.replace(" ", "").strip().upper()
def _normalize_ghm(code: str) -> str:
if not code:
return ""
return code.replace(" ", "").strip().upper()
def _normalize_ghs(code: str) -> str:
if not code:
return ""
# Les GHS peuvent arriver en "0023" ou "23"
s = re.sub(r"[^0-9]", "", code).lstrip("0")
return s or "0"
# ---------------------------------------------------------------------------
# Construction de la base SQLite depuis les sources téléchargées
# ---------------------------------------------------------------------------
def _create_schema(conn: sqlite3.Connection) -> None:
conn.executescript(
"""
DROP TABLE IF EXISTS cim10;
DROP TABLE IF EXISTS ccam;
DROP TABLE IF EXISTS ghm;
DROP TABLE IF EXISTS ghm_ghs;
DROP TABLE IF EXISTS metadata;
CREATE TABLE cim10 (
code TEXT PRIMARY KEY,
libelle TEXT
);
CREATE TABLE ccam (
code TEXT PRIMARY KEY,
libelle TEXT
);
CREATE TABLE ghm (
code TEXT PRIMARY KEY,
libelle TEXT,
aso TEXT,
da TEXT
);
CREATE TABLE ghm_ghs (
ghm TEXT,
ghs TEXT,
secteur TEXT, -- 'public' ou 'prive'
libelle TEXT,
tarif REAL,
PRIMARY KEY (ghm, ghs, secteur)
);
CREATE INDEX idx_ghm_ghs_ghm ON ghm_ghs(ghm);
CREATE INDEX idx_ghm_ghs_ghs ON ghm_ghs(ghs);
CREATE TABLE metadata (
key TEXT PRIMARY KEY,
value TEXT
);
"""
)
def _load_cim10(conn: sqlite3.Connection) -> int:
"""Charge la CIM-10 FR depuis le ClaML XML (catégories uniquement)."""
xml_path = SOURCES_DIR / "cim10_claml_2019_extracted" / "cim10_claml_2019.xml"
if not xml_path.exists():
# Fallback : chercher n'importe quel xml dans extracted
xmls = list((SOURCES_DIR / "cim10_claml_2019_extracted").glob("*.xml"))
if not xmls:
raise FileNotFoundError(
f"CIM-10 ClaML introuvable dans {SOURCES_DIR}. "
f"Assurez-vous d'avoir téléchargé et extrait le zip ATIH."
)
xml_path = xmls[0]
tree = ET.parse(xml_path)
root = tree.getroot()
rows: list[tuple[str, str]] = []
for cls in root.findall(".//Class"):
kind = cls.get("kind")
if kind != "category":
continue
raw_code = cls.get("code") or ""
code = raw_code.replace(".", "").upper().strip()
if not code:
continue
pref = cls.find('.//Rubric[@kind="preferred"]/Label')
libelle = pref.text.strip() if (pref is not None and pref.text) else ""
rows.append((code, libelle))
conn.executemany(
"INSERT OR REPLACE INTO cim10 (code, libelle) VALUES (?, ?)", rows
)
return len(rows)
def _load_ccam(conn: sqlite3.Connection) -> int:
"""Charge la CCAM 2018 depuis le XLSX ATIH (feuilles CCAM_Final_2018_*)."""
import openpyxl
xlsx_path = SOURCES_DIR / "ccam_2018_v5.xlsx"
if not xlsx_path.exists():
raise FileNotFoundError(f"CCAM XLSX introuvable : {xlsx_path}")
wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
pat = re.compile(r"^[A-Z]{4}[0-9]{3}$")
seen: dict[str, str] = {}
for sheet_name in wb.sheetnames:
if not sheet_name.startswith("CCAM_Final_2018"):
continue
ws = wb[sheet_name]
cur_code: str | None = None
for row in ws.iter_rows(values_only=True):
# col 0 : parfois un code, col 3 : texte / libellé
col0 = row[0] if len(row) > 0 else None
col3 = row[3] if len(row) > 3 else None
if isinstance(col0, str):
c = col0.strip()
if pat.match(c):
cur_code = c
if c not in seen:
seen[c] = ""
if cur_code and isinstance(col3, str) and col3.strip():
if not seen.get(cur_code):
seen[cur_code] = col3.strip()[:500]
rows = list(seen.items())
conn.executemany(
"INSERT OR REPLACE INTO ccam (code, libelle) VALUES (?, ?)", rows
)
return len(rows)
def _load_ghm(conn: sqlite3.Connection) -> int:
"""Charge les GHM V2018 depuis regroupement_ghm_v2018.xlsx."""
import openpyxl
xlsx_path = SOURCES_DIR / "regroupement_ghm_v2018.xlsx"
if not xlsx_path.exists():
raise FileNotFoundError(f"GHM XLSX introuvable : {xlsx_path}")
wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
ws = wb[wb.sheetnames[0]]
ghm_pat = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")
rows: list[tuple[str, str, str, str]] = []
header_found = False
for row in ws.iter_rows(values_only=True):
if not header_found:
if row and row[0] == "GHM":
header_found = True
continue
code = row[0]
if not isinstance(code, str):
continue
code = code.strip().upper()
if not ghm_pat.match(code):
continue
libelle = (row[1] or "").strip() if isinstance(row[1], str) else ""
aso = (row[2] or "").strip() if isinstance(row[2], str) else ""
da = (row[3] or "").strip() if isinstance(row[3], str) else ""
rows.append((code, libelle, aso, da))
conn.executemany(
"INSERT OR REPLACE INTO ghm (code, libelle, aso, da) VALUES (?, ?, ?, ?)",
rows,
)
return len(rows)
def _load_ghm_ghs(conn: sqlite3.Connection) -> int:
"""Charge la table GHM→GHS depuis tarif_arrete_fev_2018.xlsx.
Feuilles "Tarifs public" (secteur='public') et "Tarifs privé"
(secteur='prive'). Chaque ligne = un couple (GHS, GHM, libellé, tarif).
"""
import openpyxl
xlsx_path = SOURCES_DIR / "tarif_arrete_fev_2018.xlsx"
if not xlsx_path.exists():
raise FileNotFoundError(f"Tarifs XLSX introuvable : {xlsx_path}")
wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
ghm_pat = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")
all_rows: list[tuple[str, str, str, str, float | None]] = []
for sheet_name, secteur in [("Tarifs public", "public"), ("Tarifs privé", "prive")]:
if sheet_name not in wb.sheetnames:
continue
ws = wb[sheet_name]
header_found = False
for row in ws.iter_rows(values_only=True):
if not header_found:
if row and isinstance(row[0], str) and row[0].strip().upper() == "GHS":
header_found = True
continue
ghs_raw = row[0]
ghm_raw = row[1] if len(row) > 1 else None
lib_raw = row[2] if len(row) > 2 else None
tarif_raw = row[5] if len(row) > 5 else None
if ghs_raw is None or ghm_raw is None:
continue
try:
ghs = str(int(float(ghs_raw)))
except (ValueError, TypeError):
continue
ghm = str(ghm_raw).strip().upper()
if not ghm_pat.match(ghm):
continue
libelle = str(lib_raw).strip() if lib_raw else ""
try:
tarif = float(tarif_raw) if tarif_raw is not None else None
except (ValueError, TypeError):
tarif = None
all_rows.append((ghm, ghs, secteur, libelle, tarif))
conn.executemany(
"INSERT OR REPLACE INTO ghm_ghs (ghm, ghs, secteur, libelle, tarif) "
"VALUES (?, ?, ?, ?, ?)",
all_rows,
)
return len(all_rows)
def build_database(db_path: Path = DB_PATH, verbose: bool = True) -> dict[str, int]:
"""Construit la base SQLite à partir des sources.
Retourne les counts par table. Idempotent : DROP + CREATE + INSERT.
"""
REFERENTIALS_DIR.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(db_path)
try:
_create_schema(conn)
n_cim10 = _load_cim10(conn)
if verbose:
print(f" CIM-10 : {n_cim10} codes chargés")
n_ccam = _load_ccam(conn)
if verbose:
print(f" CCAM : {n_ccam} codes chargés")
n_ghm = _load_ghm(conn)
if verbose:
print(f" GHM : {n_ghm} codes chargés")
n_ghs = _load_ghm_ghs(conn)
if verbose:
print(f" GHM→GHS : {n_ghs} lignes (public+privé)")
conn.executemany(
"INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)",
[
("source_cim10", "ATIH CIM-10 FR 2019 ClaML (substitut 2018)"),
("source_ccam", "ATIH CCAM descriptive à usage PMSI 2018 V5"),
("source_ghm", "ATIH regroupement_ghm_v2018.xlsx"),
("source_ghm_ghs", "ATIH tarif_arrete_fev_2018.xlsx"),
("n_cim10", str(n_cim10)),
("n_ccam", str(n_ccam)),
("n_ghm", str(n_ghm)),
("n_ghm_ghs", str(n_ghs)),
],
)
conn.commit()
return {
"cim10": n_cim10,
"ccam": n_ccam,
"ghm": n_ghm,
"ghm_ghs": n_ghs,
}
finally:
conn.close()
# ---------------------------------------------------------------------------
# Accès à la base (connexion cachée au niveau du module)
# ---------------------------------------------------------------------------
_CONN: sqlite3.Connection | None = None
def _get_conn() -> sqlite3.Connection:
global _CONN
if _CONN is not None:
return _CONN
if not DB_PATH.exists():
raise FileNotFoundError(
f"Base SQLite introuvable : {DB_PATH}. "
"Lancez d'abord : python -m pipeline.referentials --build"
)
_CONN = sqlite3.connect(f"file:{DB_PATH}?mode=ro", uri=True, check_same_thread=False)
return _CONN
# ---------------------------------------------------------------------------
# API publique de validation
# ---------------------------------------------------------------------------
@lru_cache(maxsize=8192)
def is_valid_cim10(code: str) -> bool:
"""Vérifie qu'un code CIM-10 existe dans le référentiel 2018 (substitut 2019)."""
norm = _normalize_cim10(code)
if not norm or not _RE_CIM10.match(norm):
return False
cur = _get_conn().execute("SELECT 1 FROM cim10 WHERE code = ? LIMIT 1", (norm,))
return cur.fetchone() is not None
@lru_cache(maxsize=8192)
def is_valid_ccam(code: str) -> bool:
"""Vérifie qu'un code CCAM existe dans la CCAM PMSI 2018."""
norm = _normalize_ccam(code)
if not norm or not _RE_CCAM.match(norm):
return False
cur = _get_conn().execute("SELECT 1 FROM ccam WHERE code = ? LIMIT 1", (norm,))
return cur.fetchone() is not None
@lru_cache(maxsize=8192)
def is_valid_ghm(code: str) -> bool:
"""Vérifie qu'un code GHM existe dans la V2018."""
norm = _normalize_ghm(code)
if not norm or not _RE_GHM.match(norm):
return False
cur = _get_conn().execute("SELECT 1 FROM ghm WHERE code = ? LIMIT 1", (norm,))
return cur.fetchone() is not None
@lru_cache(maxsize=8192)
def is_valid_ghs(code: str) -> bool:
"""Vérifie qu'un code GHS existe dans l'arrêté tarifaire 2018."""
norm = _normalize_ghs(code)
if not norm or not _RE_GHS.match(norm):
return False
cur = _get_conn().execute(
"SELECT 1 FROM ghm_ghs WHERE ghs = ? LIMIT 1", (norm,)
)
return cur.fetchone() is not None
@lru_cache(maxsize=4096)
def get_cim10_libelle(code: str) -> str | None:
"""Renvoie le libellé officiel du code CIM-10, ou None."""
norm = _normalize_cim10(code)
if not norm:
return None
cur = _get_conn().execute(
"SELECT libelle FROM cim10 WHERE code = ? LIMIT 1", (norm,)
)
row = cur.fetchone()
return row[0] if row else None
def ghm_to_ghs(ghm: str) -> list[str]:
"""Renvoie les GHS possibles pour un GHM donné (publics et privés fusionnés).
Utilisé pour vérifier la cohérence du couple (GHM, GHS) extrait.
"""
norm = _normalize_ghm(ghm)
if not norm:
return []
cur = _get_conn().execute(
"SELECT DISTINCT ghs FROM ghm_ghs WHERE ghm = ?", (norm,)
)
return [r[0] for r in cur.fetchall()]
def _levenshtein(a: str, b: str) -> int:
if _HAS_RAPIDFUZZ:
return _Lev.distance(a, b)
# Fallback pur Python (O(n*m)) — suffisant pour des codes courts
if len(a) < len(b):
a, b = b, a
if not b:
return len(a)
prev = list(range(len(b) + 1))
for i, ca in enumerate(a, 1):
cur = [i]
for j, cb in enumerate(b, 1):
ins = cur[j - 1] + 1
dele = prev[j] + 1
sub = prev[j - 1] + (ca != cb)
cur.append(min(ins, dele, sub))
prev = cur
return prev[-1]
def nearest_cim10(code: str, max_distance: int = 1) -> str | None:
"""Trouve le code CIM-10 valide le plus proche (distance de Levenshtein).
Utile pour corriger les erreurs OCR courantes (O/0, I/1, B/8…).
Stratégie de départage en cas d'égalité de distance :
1. Privilégie un candidat de même longueur (substitution >> suppression)
2. Sinon tri lexicographique croissant.
Retourne None si aucun code n'est à ≤ max_distance.
"""
norm = _normalize_cim10(code)
if not norm:
return None
if is_valid_cim10(norm):
return norm
conn = _get_conn()
length = len(norm)
cur = conn.execute(
"SELECT code FROM cim10 WHERE length(code) BETWEEN ? AND ?",
(length - max_distance, length + max_distance),
)
candidates: list[tuple[int, int, str]] = [] # (distance, |len_diff|, code)
for (cand,) in cur:
d = _levenshtein(norm, cand)
if d <= max_distance:
candidates.append((d, abs(len(cand) - length), cand))
if not candidates:
return None
# Tri : distance min, puis longueur la plus proche, puis lexicographique
candidates.sort(key=lambda t: (t[0], t[1], t[2]))
return candidates[0][2]
# ---------------------------------------------------------------------------
# Tests légers (exécutables sans pytest)
# ---------------------------------------------------------------------------
def _run_selftest() -> int:
"""Tests de fumée rapides. Retourne le nombre d'échecs."""
failures = 0
def check(label: str, cond: bool, detail: str = "") -> None:
nonlocal failures
status = "OK " if cond else "FAIL"
print(f" [{status}] {label}{('' + detail) if detail else ''}")
if not cond:
failures += 1
print("=== Tests référentiels ATIH 2018 ===")
# CIM-10
check("CIM-10 K650 valide (péritonite)", is_valid_cim10("K650"))
check("CIM-10 K65.0 (avec point) valide", is_valid_cim10("K65.0"))
check("CIM-10 T814 valide", is_valid_cim10("T814"))
check("CIM-10 ZZZ99 invalide", not is_valid_cim10("ZZZ99"))
check("CIM-10 libellé K650", get_cim10_libelle("K650") is not None,
detail=str(get_cim10_libelle("K650")))
# Correction OCR : K65O (lettre O) → K650
suggestion = nearest_cim10("K65O")
check("CIM-10 nearest(K65O) = K650", suggestion == "K650",
detail=f"got={suggestion}")
# CCAM
check("CCAM EBFA012 valide", is_valid_ccam("EBFA012"))
check("CCAM EBFA012-1 (ext PMSI) valide", is_valid_ccam("EBFA012-1"))
check("CCAM AAAA000 invalide", not is_valid_ccam("AAAA000"))
# GHM
check("GHM 01C031 valide", is_valid_ghm("01C031"))
check("GHM 99Z99Z invalide", not is_valid_ghm("99Z99Z"))
# GHS
check("GHS 22 valide", is_valid_ghs("22"))
check("GHS 99999 invalide", not is_valid_ghs("99999"))
# GHM→GHS
ghs_list = ghm_to_ghs("01C031")
check("GHM 01C031 → GHS inclut 22", "22" in ghs_list,
detail=f"ghs_list={ghs_list}")
# Format invalide (robustesse)
check("is_valid_cim10('') = False", not is_valid_cim10(""))
check("is_valid_ccam(None cast) = False", not is_valid_ccam(""))
print(f"=== Résultat : {failures} échec(s) ===")
return failures
def _cli() -> int:
parser = argparse.ArgumentParser(description="Référentiels ATIH 2018")
g = parser.add_mutually_exclusive_group(required=True)
g.add_argument("--build", action="store_true",
help="(Re)construit la base SQLite depuis referentials/sources/")
g.add_argument("--test", action="store_true",
help="Exécute les tests de fumée")
g.add_argument("--stats", action="store_true",
help="Affiche les comptages de la base")
args = parser.parse_args()
if args.build:
print(f"Construction de {DB_PATH} depuis {SOURCES_DIR}...")
counts = build_database()
print("OK :", counts)
return 0
if args.test:
return 1 if _run_selftest() > 0 else 0
if args.stats:
conn = _get_conn()
for tbl in ("cim10", "ccam", "ghm", "ghm_ghs"):
n = conn.execute(f"SELECT COUNT(*) FROM {tbl}").fetchone()[0]
print(f" {tbl:10s}: {n}")
print("Metadata :")
for k, v in conn.execute("SELECT key, value FROM metadata"):
print(f" {k}: {v}")
return 0
return 0
if __name__ == "__main__":
sys.exit(_cli())

pipeline/validation.py (new file, 217 lines)

@@ -0,0 +1,217 @@
"""Validation ATIH des codes extraits.
Prend un JSON d'extraction produit par `pipeline/extract.py` et l'enrichit
d'une section `_validation` par champ de code médical (CIM-10, CCAM, GHM, GHS)
avec :
- `valid` : le code existe dans le référentiel ATIH 2018
- `suggestion` : si invalide, code le plus proche par Levenshtein ≤ 1 (CIM-10)
- `libelle_ref` : libellé officiel ATIH (CIM-10) pour audit
Plus des cross-checks (GHS ∈ ghm_to_ghs(GHM)) pour détecter des incohérences
de groupage.
Principes :
- Lecture seule sur le JSON source — on produit une COPIE enrichie.
- Ne supprime / ne corrige RIEN automatiquement ; seule une suggestion est
annotée. La correction reste à la discrétion d'un humain (overlay) ou d'un
prochain pass automatique.
"""
from __future__ import annotations
from copy import deepcopy
from typing import Any
from .referentials import (
get_cim10_libelle,
ghm_to_ghs,
is_valid_ccam,
is_valid_cim10,
is_valid_ghm,
is_valid_ghs,
nearest_cim10,
)
# ============================================================
# Helpers
# ============================================================
def _check_cim10(code: str) -> dict:
"""Valide un code CIM-10 et suggère une correction si invalide."""
code = (code or "").strip()
if not code:
return {"code": "", "valid": None}
valid = is_valid_cim10(code)
entry = {"code": code, "valid": valid}
if valid:
entry["libelle_ref"] = get_cim10_libelle(code)
else:
sug = nearest_cim10(code, max_distance=1)
if sug:
entry["suggestion"] = sug
entry["suggestion_libelle"] = get_cim10_libelle(sug)
return entry
def _check_ccam(code: str) -> dict:
code = (code or "").strip()
if not code:
return {"code": "", "valid": None}
return {"code": code, "valid": is_valid_ccam(code)}
def _check_ghm(code: str) -> dict:
code = (code or "").strip()
if not code:
return {"code": "", "valid": None}
entry = {"code": code, "valid": is_valid_ghm(code)}
if entry["valid"]:
entry["ghs_possibles"] = ghm_to_ghs(code)
return entry
def _check_ghs(code: str) -> dict:
code = (code or "").strip()
if not code:
return {"code": "", "valid": None}
return {"code": code, "valid": is_valid_ghs(code)}
# ============================================================
# Validation d'un bloc codage (etab ou reco)
# ============================================================
def _validate_codage(codage: dict) -> dict:
"""Valide un bloc codage_etab ou codage_reco."""
if not isinstance(codage, dict):
return {}
out = {
"dp": _check_cim10(codage.get("dp", "")),
"dr": _check_cim10(codage.get("dr", "")),
}
das_list = codage.get("das") or []
if isinstance(das_list, list):
out["das"] = [_check_cim10(d.get("code", "")) if isinstance(d, dict) else _check_cim10(str(d))
for d in das_list]
return out
def _validate_actes(actes: Any) -> list[dict]:
if not isinstance(actes, list):
return []
return [_check_ccam(a.get("code", "")) if isinstance(a, dict) else _check_ccam(str(a))
for a in actes]
# ============================================================
# Cross-checks GHM ↔ GHS
# ============================================================
def _cross_check_ghm_ghs(ghm: str, ghs: str) -> dict:
"""Vérifie qu'un GHS observé est listé parmi les GHS possibles du GHM."""
ghm = (ghm or "").strip()
ghs = (ghs or "").strip()
if not ghm or not ghs:
return {"checked": False, "reason": "ghm ou ghs manquant"}
if not is_valid_ghm(ghm):
return {"checked": False, "reason": "GHM invalide"}
possibles = ghm_to_ghs(ghm)
# Normalisation simple : on compare la fin (au cas où l'un est tronqué)
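    # Exemple (hypothétique) : un GHS extrait "0432" est jugé cohérent si "432"
    # figure parmi les GHS possibles du GHM.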
ok = ghs in possibles or any(p.endswith(ghs) or ghs.endswith(p) for p in possibles)
return {
"checked": True,
"coherent": ok,
"ghs_extrait": ghs,
"ghs_possibles": possibles,
}
# ============================================================
# Point d'entrée
# ============================================================
def validate_recueil(recueil: dict) -> dict:
"""Retourne un dict résumé des validations pour la page recueil."""
v = {
"codage_etab": _validate_codage(recueil.get("codage_etab", {})),
"codage_reco": _validate_codage(recueil.get("codage_reco", {})),
"actes_etab": _validate_actes(recueil.get("actes_etab", [])),
"actes_reco": _validate_actes(recueil.get("actes_reco", [])),
"ghm_etab": _check_ghm(recueil.get("ghm_etab", "")),
"ghs_etab": _check_ghs(recueil.get("ghs_etab", "")),
"ghm_reco": _check_ghm(recueil.get("ghm_reco", "")),
"ghs_reco": _check_ghs(recueil.get("ghs_reco", "")),
"cross_checks": {
"etab": _cross_check_ghm_ghs(
recueil.get("ghm_etab", ""), recueil.get("ghs_etab", "")),
"reco": _cross_check_ghm_ghs(
recueil.get("ghm_reco", ""), recueil.get("ghs_reco", "")),
},
}
v["summary"] = _summarize(v)
return v
def _summarize(validation: dict) -> dict:
"""Compte les codes valides / invalides dans une section _validation."""
valid, invalid, empty = 0, 0, 0
def _count_entry(e):
nonlocal valid, invalid, empty
if e.get("valid") is True: valid += 1
elif e.get("valid") is False: invalid += 1
else: empty += 1
for section in ("codage_etab", "codage_reco"):
sec = validation.get(section, {}) or {}
_count_entry(sec.get("dp", {}))
_count_entry(sec.get("dr", {}))
for d in sec.get("das", []) or []:
_count_entry(d)
for actes_key in ("actes_etab", "actes_reco"):
for a in validation.get(actes_key, []) or []:
_count_entry(a)
for g in ("ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco"):
_count_entry(validation.get(g, {}))
cc = validation.get("cross_checks", {})
incoherent = sum(1 for v in cc.values() if v.get("checked") and not v.get("coherent"))
return {
"valid": valid, "invalid": invalid, "empty": empty,
"total_codes": valid + invalid,
"ghm_ghs_incoherents": incoherent,
}
def annotate(extraction: dict) -> dict:
"""Annote un JSON d'extraction complet avec validation ATIH.
Retourne une COPIE enrichie d'un bloc `_validation` à la racine de chaque
page structurée. N'efface / ne corrige aucune valeur.
"""
out = deepcopy(extraction)
ext = out.get("extraction") or {}
if "recueil" in ext and isinstance(ext["recueil"], dict):
ext["recueil"]["_validation"] = validate_recueil(ext["recueil"])
# Concertation 2 : valider les 3 GHS
if "concertation_2" in ext and isinstance(ext["concertation_2"], dict):
c2 = ext["concertation_2"]
c2["_validation"] = {
"ghs_initial": _check_ghs(c2.get("ghs_initial", "")),
"ghs_avant_concertation": _check_ghs(c2.get("ghs_avant_concertation", "")),
"ghs_final": _check_ghs(c2.get("ghs_final", "")),
}
return out
if __name__ == "__main__":
# Test rapide sur OGC 7
import json, sys
path = sys.argv[1] if len(sys.argv) > 1 else "output/v2/OGC 7.json"
with open(path) as f:
data = json.load(f)
annotated = annotate(data)
rec_v = annotated["extraction"]["recueil"]["_validation"]
print(json.dumps(rec_v["summary"], indent=2))
print("\ncross_checks:", json.dumps(rec_v["cross_checks"], indent=2, ensure_ascii=False))

(4 binary files not shown)

(file name not shown; content is the ClaML DTD, new file, 283 lines)

@@ -0,0 +1,283 @@
<?xml version="1.0" encoding="UTF-8"?>
<!ENTITY % rubric.simple "#PCDATA | Reference | Term">
<!ENTITY % rubric.complex "%rubric.simple; | Para | Include |
IncludeDescendants| Fragment | List | Table">
<!ELEMENT ClaML (
Meta*,
Identifier*,
Title,
Authors?,
Variants?,
ClassKinds,
UsageKinds?,
RubricKinds,
Modifier*,
ModifierClass*,
Class*)
>
<!ATTLIST ClaML
version CDATA #REQUIRED
>
<!ELEMENT Variants (Variant+)>
<!ELEMENT Variant (#PCDATA)>
<!ATTLIST Variant
name ID #REQUIRED
>
<!ELEMENT Meta EMPTY>
<!ATTLIST Meta
name CDATA #REQUIRED
value CDATA #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT Identifier EMPTY>
<!ATTLIST Identifier
authority NMTOKEN #IMPLIED
uid CDATA #REQUIRED
>
<!ELEMENT Title (#PCDATA)>
<!ATTLIST Title
name NMTOKEN #REQUIRED
version CDATA #IMPLIED
date CDATA #IMPLIED
>
<!ELEMENT Authors (Author* )>
<!ELEMENT Author (#PCDATA)>
<!ATTLIST Author
name ID #REQUIRED
>
<!ELEMENT ClassKinds (ClassKind+)>
<!ELEMENT RubricKinds (RubricKind+)>
<!ELEMENT UsageKinds (UsageKind+)>
<!ELEMENT ClassKind (Display*)>
<!ATTLIST ClassKind
name ID #REQUIRED
>
<!ELEMENT RubricKind (Display*)>
<!ATTLIST RubricKind
name ID #REQUIRED
inherited (true|false) "true"
>
<!ELEMENT UsageKind EMPTY>
<!ATTLIST UsageKind
name ID #REQUIRED
mark CDATA #REQUIRED
>
<!ELEMENT Display (#PCDATA)>
<!ATTLIST Display
xml:lang NMTOKEN #REQUIRED
variants IDREF #IMPLIED
>
<!ELEMENT Modifier (
Meta*,
SubClass*,
Rubric*,
History*)
>
<!ATTLIST Modifier
code NMTOKEN #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT ModifierClass (
Meta*,
SuperClass,
SubClass*,
Rubric*,
History*)
>
<!ATTLIST ModifierClass
modifier NMTOKEN #REQUIRED
code NMTOKEN #REQUIRED
usage IDREF #IMPLIED
variants IDREFS #IMPLIED
>
<!ELEMENT Class (
Meta*,
SuperClass*,
SubClass*,
ModifiedBy*,
ExcludeModifier*,
Rubric*,
History*)
>
<!ATTLIST Class
code CDATA #REQUIRED
kind IDREF #REQUIRED
usage IDREF #IMPLIED
variants IDREFS #IMPLIED
>
<!ELEMENT ModifiedBy (
Meta*,
ValidModifierClass*)
>
<!ATTLIST ModifiedBy
code NMTOKEN #REQUIRED
all (true|false) "true"
position CDATA #IMPLIED
variants IDREFS #IMPLIED
>
<!ELEMENT ExcludeModifier EMPTY>
<!ATTLIST ExcludeModifier
code NMTOKEN #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT ValidModifierClass EMPTY>
<!ATTLIST ValidModifierClass
code NMTOKEN #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT Rubric (
Label+,
History*)
>
<!ATTLIST Rubric
id ID #IMPLIED
kind IDREF #REQUIRED
usage IDREF #IMPLIED
>
<!ELEMENT Label (%rubric.complex;)*>
<!ATTLIST Label
xml:lang NMTOKEN #REQUIRED
xml:space (default|preserve) "default"
variants IDREFS #IMPLIED
>
<!ELEMENT History (#PCDATA)>
<!ATTLIST History
author IDREF #REQUIRED
date NMTOKEN #REQUIRED
>
<!ELEMENT SuperClass EMPTY>
<!ATTLIST SuperClass
code CDATA #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT SubClass EMPTY>
<!ATTLIST SubClass
code CDATA #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT Reference (#PCDATA)>
<!ATTLIST Reference
class CDATA #IMPLIED
authority NMTOKEN #IMPLIED
uid NMTOKEN #IMPLIED
code CDATA #IMPLIED
usage IDREF #IMPLIED
variants IDREFS #IMPLIED
>
<!ELEMENT Para (%rubric.simple;)*>
<!ATTLIST Para
class CDATA #IMPLIED
>
<!ELEMENT Fragment (%rubric.simple;)*>
<!ATTLIST Fragment
class CDATA #IMPLIED
usage IDREF #IMPLIED
type (item | list) "item"
>
<!ELEMENT Include EMPTY>
<!ATTLIST Include
class CDATA #IMPLIED
rubric IDREF #REQUIRED
>
<!ELEMENT IncludeDescendants EMPTY>
<!ATTLIST IncludeDescendants
code NMTOKEN #REQUIRED
kind IDREF #REQUIRED
>
<!ELEMENT List (ListItem+)>
<!ATTLIST List
class CDATA #IMPLIED
>
<!ELEMENT ListItem (
%rubric.simple;
| Para
| Include
| List
| Table)*
>
<!ATTLIST ListItem
class CDATA #IMPLIED
>
<!ELEMENT Table (
Caption?,
THead?,
TBody?,
TFoot?)
>
<!ATTLIST Table
class CDATA #IMPLIED
>
<!ELEMENT Caption (%rubric.simple;)*>
<!ATTLIST Caption
class CDATA #IMPLIED
>
<!ELEMENT THead (Row+)>
<!ATTLIST THead
class CDATA #IMPLIED
>
<!ELEMENT TBody (Row+)>
<!ATTLIST TBody
class CDATA #IMPLIED
>
<!ELEMENT TFoot (Row+)>
<!ATTLIST TFoot
class CDATA #IMPLIED
>
<!ELEMENT Row (Cell*)>
<!ATTLIST Row
class CDATA #IMPLIED
>
<!ELEMENT Cell (
%rubric.simple;
| Para
| Include
| List
| Table)*
>
<!ATTLIST Cell
class CDATA #IMPLIED
rowspan CDATA #IMPLIED
colspan CDATA #IMPLIED
>
<!ELEMENT Term (#PCDATA)>
<!ATTLIST Term
class CDATA #IMPLIED
>

(1 file diff suppressed because it is too large; 6 binary files not shown)

tests/__init__.py (new file, empty)

tests/test_referentials.py (new file, 160 lines)

@@ -0,0 +1,160 @@
"""Tests unitaires du module pipeline.referentials.
Compatible pytest ET exécution directe (`python tests/test_referentials.py`).
Nécessite que la base SQLite ait déjà été construite :
python -m pipeline.referentials --build
"""
from __future__ import annotations
import sys
from pathlib import Path
# Permet l'exécution directe depuis tests/ sans installer le package.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from pipeline.referentials import ( # noqa: E402
DB_PATH,
ghm_to_ghs,
get_cim10_libelle,
is_valid_ccam,
is_valid_cim10,
is_valid_ghm,
is_valid_ghs,
nearest_cim10,
)
# ---------------------------------------------------------------------------
# CIM-10
# ---------------------------------------------------------------------------
def test_cim10_codes_valides():
"""Quelques codes courants du référentiel PMSI."""
assert is_valid_cim10("K650") # Péritonite aigüe
assert is_valid_cim10("T814") # Infection après acte
assert is_valid_cim10("I10") # Hypertension essentielle
assert is_valid_cim10("Z515") # Soins palliatifs
assert is_valid_cim10("C509") # Tumeur maligne du sein
def test_cim10_normalisation():
"""Tolère le point décimal (K65.0) et la casse (k650)."""
assert is_valid_cim10("K65.0")
assert is_valid_cim10("k650")
assert is_valid_cim10(" K650 ")
def test_cim10_codes_invalides():
assert not is_valid_cim10("")
assert not is_valid_cim10("ZZZ99")
assert not is_valid_cim10("K6501234") # trop long
assert not is_valid_cim10("1234") # ne commence pas par lettre
assert not is_valid_cim10("K65X") # suffixe non numérique
def test_cim10_libelle():
lib = get_cim10_libelle("K650")
    assert lib is not None and ("éritonit" in lib.lower() or "peritonit" in lib.lower())
def test_cim10_nearest_correction_ocr():
# O (lettre) lu au lieu de 0 (chiffre)
assert nearest_cim10("K65O") == "K650"
# Code déjà valide : renvoyé tel quel
assert nearest_cim10("K650") == "K650"
# Aucune correspondance à distance <= 1
assert nearest_cim10("ZZZZZ", max_distance=1) is None
# ---------------------------------------------------------------------------
# CCAM
# ---------------------------------------------------------------------------
def test_ccam_codes_valides():
assert is_valid_ccam("EBFA012")
assert is_valid_ccam("HFCC003") # Bypass gastrique (mentionné dans la V5)
assert is_valid_ccam("ebfa012") # casse insensible
assert is_valid_ccam("EBFA012-1") # extension PMSI tolérée
def test_ccam_codes_invalides():
assert not is_valid_ccam("AAAA000")
assert not is_valid_ccam("")
assert not is_valid_ccam("EBF012") # 3 lettres au lieu de 4
# ---------------------------------------------------------------------------
# GHM
# ---------------------------------------------------------------------------
def test_ghm_codes_valides():
assert is_valid_ghm("01C031")
assert is_valid_ghm("01c031")
def test_ghm_codes_invalides():
assert not is_valid_ghm("99Z99Z")
assert not is_valid_ghm("")
assert not is_valid_ghm("ABCDEF")
# ---------------------------------------------------------------------------
# GHS et couplage GHM→GHS
# ---------------------------------------------------------------------------
def test_ghs_valide():
assert is_valid_ghs("22")
assert is_valid_ghs("0022") # zéros de tête tolérés
assert not is_valid_ghs("99999")
assert not is_valid_ghs("")
def test_ghm_to_ghs():
ghs = ghm_to_ghs("01C031")
assert "22" in ghs
# GHM inexistant → liste vide
assert ghm_to_ghs("99Z99Z") == []
# ---------------------------------------------------------------------------
# Exécution directe (sans pytest)
# ---------------------------------------------------------------------------
def _main() -> int:
import traceback
tests = [
("test_cim10_codes_valides", test_cim10_codes_valides),
("test_cim10_normalisation", test_cim10_normalisation),
("test_cim10_codes_invalides", test_cim10_codes_invalides),
("test_cim10_libelle", test_cim10_libelle),
("test_cim10_nearest_correction_ocr", test_cim10_nearest_correction_ocr),
("test_ccam_codes_valides", test_ccam_codes_valides),
("test_ccam_codes_invalides", test_ccam_codes_invalides),
("test_ghm_codes_valides", test_ghm_codes_valides),
("test_ghm_codes_invalides", test_ghm_codes_invalides),
("test_ghs_valide", test_ghs_valide),
("test_ghm_to_ghs", test_ghm_to_ghs),
]
if not DB_PATH.exists():
print(f"ERREUR : base SQLite manquante ({DB_PATH}).")
print("Exécute d'abord : python -m pipeline.referentials --build")
return 2
failures = 0
for name, fn in tests:
try:
fn()
print(f" [OK ] {name}")
except AssertionError as e:
print(f" [FAIL] {name}{e}")
failures += 1
except Exception:
print(f" [ERR] {name}")
traceback.print_exc()
failures += 1
print(f"=== {len(tests) - failures}/{len(tests)} tests OK ===")
return 0 if failures == 0 else 1
if __name__ == "__main__":
sys.exit(_main())