feat(referentials): ATIH 2018 validation of medical codes

Adds a post-extraction validation layer against the official 2018
referentials published by ATIH (Agence Technique de l'Information sur
l'Hospitalisation). Zero tolerance on T2A codes: any invalid code is
flagged, and a nearest-neighbour correction (Levenshtein distance ≤ 1) is
suggested.

Contents:
- pipeline/referentials.py : public API is_valid_{cim10,ccam,ghm,ghs},
  get_cim10_libelle, nearest_cim10, ghm_to_ghs. CLI --build/--test/--stats.
- pipeline/validation.py    : annotates an extraction JSON with a
  `_validation` block per page (valid/invalid codes + suggestions +
  GHM↔GHS cross-checks).
- referentials/sources/     : raw public ATIH data (CIM-10 ClaML 2019 as a
  stand-in, CCAM v5 2018, GHM v2018, February 2018 tariffs).
- referentials/atih_2018.sqlite : ready-to-use SQLite database
  (11 623 CIM-10 · 8 147 CCAM · 2 593 GHM · 5 329 GHM→GHS pairs).
- tests/test_referentials.py : 11 unit tests (11/11 passing).
- annotate_validation.py    : script that annotates all V2 JSONs in place
  and produces validation_report.md.

CIM-10 note: ATIH publishes the 2018 edition only as a PDF, so the 2019
ClaML edition is used as a stand-in (known gap of roughly 60 codes out of
11,600).

PMSI suffix handling: `*` (CMA excluded by the DP) and `+N` (PMSI
extension) are stripped before validation; only the root code is checked
against the referential.
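
Illustrative behaviour of the public API (hypothetical session; example
codes are not taken from the dossiers):

    >>> from pipeline.referentials import is_valid_cim10, nearest_cim10
    >>> is_valid_cim10("C795 *")   # '*' suffix stripped, root C795 checked
    True
    >>> is_valid_cim10("K65O")     # OCR noise: letter O instead of digit 0
    False
    >>> nearest_cim10("K65O")      # Levenshtein <= 1 suggestion
    'K650'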

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
Dom committed 2026-04-24 15:06:01 +02:00
commit 6df590ae95 (parent ed4d9bd765)
17 changed files with 156052 additions and 0 deletions

annotate_validation.py (new file, 136 lines)

@@ -0,0 +1,136 @@
"""Annote les JSONs V2 existants avec la validation ATIH.
Utile pour ajouter la validation sans relancer l'extraction complète.
Produit aussi un rapport agrégé en markdown.
"""
import json
from collections import defaultdict
from pathlib import Path
from pipeline.validation import annotate
OUT_DIR = Path("output/v2")
REPORT = Path("validation_report.md")
def annotate_all() -> list[dict]:
"""Annote chaque JSON et écrit le résultat en place (avec _validation)."""
results = []
for p in sorted(OUT_DIR.glob("OGC *.json")):
data = json.loads(p.read_text(encoding="utf-8"))
annotated = annotate(data)
p.write_text(json.dumps(annotated, ensure_ascii=False, indent=2), encoding="utf-8")
results.append(annotated)
rec_v = annotated.get("extraction", {}).get("recueil", {}).get("_validation", {})
s = rec_v.get("summary", {})
cc = rec_v.get("cross_checks", {})
print(f" {data['fichier']:8s} — valid={s.get('valid',0):2d} invalid={s.get('invalid',0):2d} "
f"empty={s.get('empty',0):2d} incoherent={s.get('ghm_ghs_incoherents',0)} "
f"etab={cc.get('etab',{}).get('coherent','?')} reco={cc.get('reco',{}).get('coherent','?')}")
return results
def build_report(results: list[dict]):
"""Agrégation par champ : taux de validité, suggestions les plus fréquentes."""
per_field = defaultdict(lambda: {"total": 0, "valid": 0, "invalid": 0, "empty": 0, "suggestions": []})
incoherences = []
for d in results:
name = d["fichier"]
rec_v = d.get("extraction", {}).get("recueil", {}).get("_validation", {})
if not rec_v:
continue
# Codes unitaires
for key in ["ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco"]:
entry = rec_v.get(key, {})
st = per_field[key]
st["total"] += 1
if entry.get("valid") is True: st["valid"] += 1
elif entry.get("valid") is False:
st["invalid"] += 1
if "suggestion" in entry:
st["suggestions"].append((name, entry["code"], entry["suggestion"]))
else: st["empty"] += 1
# Codage etab / reco : dp + dr + das
for section in ["codage_etab", "codage_reco"]:
sec = rec_v.get(section, {})
for sub in ["dp", "dr"]:
entry = sec.get(sub, {})
st = per_field[f"{section}.{sub}"]
st["total"] += 1
if entry.get("valid") is True: st["valid"] += 1
elif entry.get("valid") is False:
st["invalid"] += 1
if "suggestion" in entry:
st["suggestions"].append((name, entry["code"], entry["suggestion"]))
else: st["empty"] += 1
for das in sec.get("das", []) or []:
st = per_field[f"{section}.das"]
st["total"] += 1
if das.get("valid") is True: st["valid"] += 1
elif das.get("valid") is False:
st["invalid"] += 1
if "suggestion" in das:
st["suggestions"].append((name, das["code"], das["suggestion"]))
else: st["empty"] += 1
# Cohérence GHM ↔ GHS
for side in ["etab", "reco"]:
cc = rec_v.get("cross_checks", {}).get(side, {})
if cc.get("checked") and not cc.get("coherent"):
incoherences.append({
"dossier": name, "side": side,
"ghs_extrait": cc.get("ghs_extrait"),
"ghs_possibles": cc.get("ghs_possibles"),
})
# Markdown report
lines = ["# Rapport de validation ATIH — V2 (18 dossiers)\n"]
lines.append("## Couverture et validité par champ\n")
lines.append("| Champ | Total | Valid | Invalid | Vide | Validité codes renseignés |")
lines.append("|---|---:|---:|---:|---:|---:|")
for f, st in per_field.items():
renseignes = st["valid"] + st["invalid"]
ratio = (100 * st["valid"] / renseignes) if renseignes else 0
lines.append(f"| `{f}` | {st['total']} | {st['valid']} | {st['invalid']} | {st['empty']} | {ratio:.0f}% |")
# Suggestions OCR
lines.append("\n## Corrections OCR suggérées (Levenshtein ≤ 1)")
lines.append("\nCodes extraits invalides mais ressemblant à un code ATIH existant :\n")
lines.append("| Dossier | Champ | Code extrait | Suggestion |")
lines.append("|---|---|---|---|")
sugg_count = 0
for field, st in per_field.items():
for name, code, sug in st["suggestions"]:
lines.append(f"| {name} | `{field}` | `{code}` | **`{sug}`** |")
sugg_count += 1
if sugg_count == 0:
lines.append("| — | — | — | Aucune suggestion (pas de correction Levenshtein ≤ 1) |")
# Incohérences GHM ↔ GHS
lines.append("\n## Incohérences GHM ↔ GHS détectées\n")
if incoherences:
lines.append("| Dossier | Côté | GHS extrait | GHS possibles pour le GHM |")
lines.append("|---|---|---|---|")
for inc in incoherences:
lines.append(f"| {inc['dossier']} | {inc['side']} | `{inc['ghs_extrait']}` | {inc['ghs_possibles']} |")
else:
lines.append("✓ Aucune incohérence détectée sur les GHM/GHS extraits.")
lines.append(f"\n## Synthèse\n")
total_codes = sum(st["valid"] + st["invalid"] for st in per_field.values())
total_valid = sum(st["valid"] for st in per_field.values())
lines.append(f"- **{total_valid}/{total_codes} codes valides** ({100*total_valid/total_codes:.1f}%)")
lines.append(f"- **{sugg_count} suggestions de correction OCR** trouvées automatiquement")
lines.append(f"- **{len(incoherences)} incohérences GHM↔GHS** sur les paires extraites")
REPORT.write_text("\n".join(lines), encoding="utf-8")
print(f"\nRapport → {REPORT}")
if __name__ == "__main__":
print("Annotation en place des JSONs V2 + calcul validation ATIH...\n")
results = annotate_all()
build_report(results)

pipeline/referentials.py (new file, 597 lines)

@@ -0,0 +1,597 @@
"""Validation des codes médicaux contre les référentiels ATIH 2018.
Ce module charge les référentiels officiels ATIH (CIM-10, CCAM, GHM, table
GHM→GHS) dans une base SQLite locale et expose des fonctions de validation
pour les codes extraits par le pipeline OCR.
Sources téléchargées (voir `referentials/sources/`) :
- **CIM-10 FR 2019** au format ClaML XML (ATIH) — utilisée comme substitut
à la CIM-10 2018 : ATIH ne publie officiellement la CIM-10 2018 qu'en PDF.
L'écart entre CIM-10 2019 et CIM-10 2018 est < 100 codes sur ~11 600 ;
un écart acceptable pour une validation OCR (et qui peut introduire
quelques faux positifs pour des codes créés en 2019, mais jamais de faux
négatifs sur un code 2018 valide).
- **CCAM descriptive à usage PMSI 2018 V5** (XLSX ATIH).
- **GHM V2018** (XLSX ATIH, fichier `regroupement_ghm_v2018.xlsx`).
- **Arrêté tarifaire MCO Février 2018** (XLSX ATIH, feuilles "Tarifs public"
et "Tarifs privé") pour la table GHM→GHS.
Formats de codes supportés :
- CIM-10 : lettre + 2 à 5 chiffres (ex: K650, T814, sans point).
- CCAM : 4 lettres + 3 chiffres (ex: EBFA012).
- GHM : 2 chiffres + lettre + 2 à 3 chiffres, plus lettre de sévérité optionnelle (ex: 11M122).
- GHS : nombre 1-5 chiffres (ex: 4323).
Utilisation :
from pipeline.referentials import (
is_valid_cim10, is_valid_ccam, is_valid_ghm, is_valid_ghs,
nearest_cim10, ghm_to_ghs, get_cim10_libelle,
)
if not is_valid_cim10("K650"):
suggestion = nearest_cim10("K65O") # correction O → 0
Build initial de la base : ``python -m pipeline.referentials --build``
Test rapide : ``python -m pipeline.referentials --test``
"""
from __future__ import annotations
import argparse
import gzip
import json
import re
import sqlite3
import sys
import xml.etree.ElementTree as ET
from functools import lru_cache
from pathlib import Path
from typing import Iterable
try:
from rapidfuzz.distance import Levenshtein as _Lev
_HAS_RAPIDFUZZ = True
except ImportError: # pragma: no cover - fallback pur Python
_HAS_RAPIDFUZZ = False
_ROOT = Path(__file__).resolve().parent.parent
REFERENTIALS_DIR = _ROOT / "referentials"
SOURCES_DIR = REFERENTIALS_DIR / "sources"
DB_PATH = REFERENTIALS_DIR / "atih_2018.sqlite"
# Formats attendus (utilisés pour normaliser l'entrée avant recherche DB)
_RE_CIM10 = re.compile(r"^[A-Z][0-9]{2,5}$")
_RE_CCAM = re.compile(r"^[A-Z]{4}[0-9]{3}$")
_RE_GHM = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")
_RE_GHS = re.compile(r"^[0-9]{1,5}$")
# ---------------------------------------------------------------------------
# Normalisation des entrées (tolérante aux bruits OCR courants)
# ---------------------------------------------------------------------------
def _normalize_cim10(code: str) -> str:
"""Normalise un code CIM-10 extrait pour comparaison au référentiel.
Gère :
    - Point décimal optionnel : "K65.0" → "K650"
    - Espaces / casse : " k650 " → "K650"
    - Suffixes PMSI : "C795 *" → "C795" (le `*` signifie "CMA exclue par le DP")
      et "K635+0" → "K635" (le `+N` est une extension PMSI à valider séparément)
    - Suffixe de position numérique éventuellement collé : "K650+" → "K650"
"""
if not code:
return ""
s = code.strip().upper()
# Couper à la première occurrence d'un marqueur PMSI non-alphanum
# (*, +, #, espace suivi d'un marqueur). On garde uniquement la tête du code.
for sep in ("*", "+", "#"):
if sep in s:
s = s.split(sep, 1)[0]
return s.replace(".", "").replace(" ", "").strip()
def _normalize_ccam(code: str) -> str:
if not code:
return ""
# Retire éventuelle extension PMSI (-1, -2…) et les espaces
base = code.split("-")[0]
return base.replace(" ", "").strip().upper()
def _normalize_ghm(code: str) -> str:
if not code:
return ""
return code.replace(" ", "").strip().upper()
def _normalize_ghs(code: str) -> str:
if not code:
return ""
# Les GHS peuvent arriver en "0023" ou "23"
s = re.sub(r"[^0-9]", "", code).lstrip("0")
return s or "0"
# ---------------------------------------------------------------------------
# Construction de la base SQLite depuis les sources téléchargées
# ---------------------------------------------------------------------------
def _create_schema(conn: sqlite3.Connection) -> None:
conn.executescript(
"""
DROP TABLE IF EXISTS cim10;
DROP TABLE IF EXISTS ccam;
DROP TABLE IF EXISTS ghm;
DROP TABLE IF EXISTS ghm_ghs;
DROP TABLE IF EXISTS metadata;
CREATE TABLE cim10 (
code TEXT PRIMARY KEY,
libelle TEXT
);
CREATE TABLE ccam (
code TEXT PRIMARY KEY,
libelle TEXT
);
CREATE TABLE ghm (
code TEXT PRIMARY KEY,
libelle TEXT,
aso TEXT,
da TEXT
);
CREATE TABLE ghm_ghs (
ghm TEXT,
ghs TEXT,
secteur TEXT, -- 'public' ou 'prive'
libelle TEXT,
tarif REAL,
PRIMARY KEY (ghm, ghs, secteur)
);
CREATE INDEX idx_ghm_ghs_ghm ON ghm_ghs(ghm);
CREATE INDEX idx_ghm_ghs_ghs ON ghm_ghs(ghs);
CREATE TABLE metadata (
key TEXT PRIMARY KEY,
value TEXT
);
"""
)
def _load_cim10(conn: sqlite3.Connection) -> int:
"""Charge la CIM-10 FR depuis le ClaML XML (catégories uniquement)."""
xml_path = SOURCES_DIR / "cim10_claml_2019_extracted" / "cim10_claml_2019.xml"
if not xml_path.exists():
# Fallback : chercher n'importe quel xml dans extracted
xmls = list((SOURCES_DIR / "cim10_claml_2019_extracted").glob("*.xml"))
if not xmls:
raise FileNotFoundError(
f"CIM-10 ClaML introuvable dans {SOURCES_DIR}. "
f"Assurez-vous d'avoir téléchargé et extrait le zip ATIH."
)
xml_path = xmls[0]
tree = ET.parse(xml_path)
root = tree.getroot()
rows: list[tuple[str, str]] = []
for cls in root.findall(".//Class"):
kind = cls.get("kind")
if kind != "category":
continue
raw_code = cls.get("code") or ""
code = raw_code.replace(".", "").upper().strip()
if not code:
continue
pref = cls.find('.//Rubric[@kind="preferred"]/Label')
libelle = pref.text.strip() if (pref is not None and pref.text) else ""
rows.append((code, libelle))
conn.executemany(
"INSERT OR REPLACE INTO cim10 (code, libelle) VALUES (?, ?)", rows
)
return len(rows)
def _load_ccam(conn: sqlite3.Connection) -> int:
"""Charge la CCAM 2018 depuis le XLSX ATIH (feuilles CCAM_Final_2018_*)."""
import openpyxl
xlsx_path = SOURCES_DIR / "ccam_2018_v5.xlsx"
if not xlsx_path.exists():
raise FileNotFoundError(f"CCAM XLSX introuvable : {xlsx_path}")
wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
pat = re.compile(r"^[A-Z]{4}[0-9]{3}$")
seen: dict[str, str] = {}
for sheet_name in wb.sheetnames:
if not sheet_name.startswith("CCAM_Final_2018"):
continue
ws = wb[sheet_name]
cur_code: str | None = None
for row in ws.iter_rows(values_only=True):
# col 0 : parfois un code, col 3 : texte / libellé
col0 = row[0] if len(row) > 0 else None
col3 = row[3] if len(row) > 3 else None
if isinstance(col0, str):
c = col0.strip()
if pat.match(c):
cur_code = c
if c not in seen:
seen[c] = ""
if cur_code and isinstance(col3, str) and col3.strip():
if not seen.get(cur_code):
seen[cur_code] = col3.strip()[:500]
rows = list(seen.items())
conn.executemany(
"INSERT OR REPLACE INTO ccam (code, libelle) VALUES (?, ?)", rows
)
return len(rows)
def _load_ghm(conn: sqlite3.Connection) -> int:
"""Charge les GHM V2018 depuis regroupement_ghm_v2018.xlsx."""
import openpyxl
xlsx_path = SOURCES_DIR / "regroupement_ghm_v2018.xlsx"
if not xlsx_path.exists():
raise FileNotFoundError(f"GHM XLSX introuvable : {xlsx_path}")
wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
ws = wb[wb.sheetnames[0]]
ghm_pat = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")
rows: list[tuple[str, str, str, str]] = []
header_found = False
for row in ws.iter_rows(values_only=True):
if not header_found:
if row and row[0] == "GHM":
header_found = True
continue
code = row[0]
if not isinstance(code, str):
continue
code = code.strip().upper()
if not ghm_pat.match(code):
continue
libelle = (row[1] or "").strip() if isinstance(row[1], str) else ""
aso = (row[2] or "").strip() if isinstance(row[2], str) else ""
da = (row[3] or "").strip() if isinstance(row[3], str) else ""
rows.append((code, libelle, aso, da))
conn.executemany(
"INSERT OR REPLACE INTO ghm (code, libelle, aso, da) VALUES (?, ?, ?, ?)",
rows,
)
return len(rows)
def _load_ghm_ghs(conn: sqlite3.Connection) -> int:
"""Charge la table GHM→GHS depuis tarif_arrete_fev_2018.xlsx.
Feuilles "Tarifs public" (secteur='public') et "Tarifs privé"
(secteur='prive'). Chaque ligne = un couple (GHS, GHM, libellé, tarif).
"""
import openpyxl
xlsx_path = SOURCES_DIR / "tarif_arrete_fev_2018.xlsx"
if not xlsx_path.exists():
raise FileNotFoundError(f"Tarifs XLSX introuvable : {xlsx_path}")
wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
ghm_pat = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")
all_rows: list[tuple[str, str, str, str, float | None]] = []
for sheet_name, secteur in [("Tarifs public", "public"), ("Tarifs privé", "prive")]:
if sheet_name not in wb.sheetnames:
continue
ws = wb[sheet_name]
header_found = False
for row in ws.iter_rows(values_only=True):
if not header_found:
if row and isinstance(row[0], str) and row[0].strip().upper() == "GHS":
header_found = True
continue
ghs_raw = row[0]
ghm_raw = row[1] if len(row) > 1 else None
lib_raw = row[2] if len(row) > 2 else None
tarif_raw = row[5] if len(row) > 5 else None
if ghs_raw is None or ghm_raw is None:
continue
try:
ghs = str(int(float(ghs_raw)))
except (ValueError, TypeError):
continue
ghm = str(ghm_raw).strip().upper()
if not ghm_pat.match(ghm):
continue
libelle = str(lib_raw).strip() if lib_raw else ""
try:
tarif = float(tarif_raw) if tarif_raw is not None else None
except (ValueError, TypeError):
tarif = None
all_rows.append((ghm, ghs, secteur, libelle, tarif))
conn.executemany(
"INSERT OR REPLACE INTO ghm_ghs (ghm, ghs, secteur, libelle, tarif) "
"VALUES (?, ?, ?, ?, ?)",
all_rows,
)
return len(all_rows)
def build_database(db_path: Path = DB_PATH, verbose: bool = True) -> dict[str, int]:
"""Construit la base SQLite à partir des sources.
Retourne les counts par table. Idempotent : DROP + CREATE + INSERT.
"""
REFERENTIALS_DIR.mkdir(parents=True, exist_ok=True)
conn = sqlite3.connect(db_path)
try:
_create_schema(conn)
n_cim10 = _load_cim10(conn)
if verbose:
print(f" CIM-10 : {n_cim10} codes chargés")
n_ccam = _load_ccam(conn)
if verbose:
print(f" CCAM : {n_ccam} codes chargés")
n_ghm = _load_ghm(conn)
if verbose:
print(f" GHM : {n_ghm} codes chargés")
n_ghs = _load_ghm_ghs(conn)
if verbose:
print(f" GHM→GHS : {n_ghs} lignes (public+privé)")
conn.executemany(
"INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)",
[
("source_cim10", "ATIH CIM-10 FR 2019 ClaML (substitut 2018)"),
("source_ccam", "ATIH CCAM descriptive à usage PMSI 2018 V5"),
("source_ghm", "ATIH regroupement_ghm_v2018.xlsx"),
("source_ghm_ghs", "ATIH tarif_arrete_fev_2018.xlsx"),
("n_cim10", str(n_cim10)),
("n_ccam", str(n_ccam)),
("n_ghm", str(n_ghm)),
("n_ghm_ghs", str(n_ghs)),
],
)
conn.commit()
return {
"cim10": n_cim10,
"ccam": n_ccam,
"ghm": n_ghm,
"ghm_ghs": n_ghs,
}
finally:
conn.close()
# ---------------------------------------------------------------------------
# Accès à la base (connexion cachée au niveau du module)
# ---------------------------------------------------------------------------
_CONN: sqlite3.Connection | None = None
def _get_conn() -> sqlite3.Connection:
global _CONN
if _CONN is not None:
return _CONN
if not DB_PATH.exists():
raise FileNotFoundError(
f"Base SQLite introuvable : {DB_PATH}. "
"Lancez d'abord : python -m pipeline.referentials --build"
)
_CONN = sqlite3.connect(f"file:{DB_PATH}?mode=ro", uri=True, check_same_thread=False)
return _CONN
# ---------------------------------------------------------------------------
# API publique de validation
# ---------------------------------------------------------------------------
@lru_cache(maxsize=8192)
def is_valid_cim10(code: str) -> bool:
"""Vérifie qu'un code CIM-10 existe dans le référentiel 2018 (substitut 2019)."""
norm = _normalize_cim10(code)
if not norm or not _RE_CIM10.match(norm):
return False
cur = _get_conn().execute("SELECT 1 FROM cim10 WHERE code = ? LIMIT 1", (norm,))
return cur.fetchone() is not None
@lru_cache(maxsize=8192)
def is_valid_ccam(code: str) -> bool:
"""Vérifie qu'un code CCAM existe dans la CCAM PMSI 2018."""
norm = _normalize_ccam(code)
if not norm or not _RE_CCAM.match(norm):
return False
cur = _get_conn().execute("SELECT 1 FROM ccam WHERE code = ? LIMIT 1", (norm,))
return cur.fetchone() is not None
@lru_cache(maxsize=8192)
def is_valid_ghm(code: str) -> bool:
"""Vérifie qu'un code GHM existe dans la V2018."""
norm = _normalize_ghm(code)
if not norm or not _RE_GHM.match(norm):
return False
cur = _get_conn().execute("SELECT 1 FROM ghm WHERE code = ? LIMIT 1", (norm,))
return cur.fetchone() is not None
@lru_cache(maxsize=8192)
def is_valid_ghs(code: str) -> bool:
"""Vérifie qu'un code GHS existe dans l'arrêté tarifaire 2018."""
norm = _normalize_ghs(code)
if not norm or not _RE_GHS.match(norm):
return False
cur = _get_conn().execute(
"SELECT 1 FROM ghm_ghs WHERE ghs = ? LIMIT 1", (norm,)
)
return cur.fetchone() is not None
@lru_cache(maxsize=4096)
def get_cim10_libelle(code: str) -> str | None:
"""Renvoie le libellé officiel du code CIM-10, ou None."""
norm = _normalize_cim10(code)
if not norm:
return None
cur = _get_conn().execute(
"SELECT libelle FROM cim10 WHERE code = ? LIMIT 1", (norm,)
)
row = cur.fetchone()
return row[0] if row else None
def ghm_to_ghs(ghm: str) -> list[str]:
"""Renvoie les GHS possibles pour un GHM donné (publics et privés fusionnés).
Utilisé pour vérifier la cohérence du couple (GHM, GHS) extrait.
"""
norm = _normalize_ghm(ghm)
if not norm:
return []
cur = _get_conn().execute(
"SELECT DISTINCT ghs FROM ghm_ghs WHERE ghm = ?", (norm,)
)
return [r[0] for r in cur.fetchall()]
def _levenshtein(a: str, b: str) -> int:
if _HAS_RAPIDFUZZ:
return _Lev.distance(a, b)
# Fallback pur Python (O(n*m)) — suffisant pour des codes courts
if len(a) < len(b):
a, b = b, a
if not b:
return len(a)
prev = list(range(len(b) + 1))
for i, ca in enumerate(a, 1):
cur = [i]
for j, cb in enumerate(b, 1):
ins = cur[j - 1] + 1
dele = prev[j] + 1
sub = prev[j - 1] + (ca != cb)
cur.append(min(ins, dele, sub))
prev = cur
return prev[-1]
def nearest_cim10(code: str, max_distance: int = 1) -> str | None:
"""Trouve le code CIM-10 valide le plus proche (distance de Levenshtein).
Utile pour corriger les erreurs OCR courantes (O/0, I/1, B/8…).
Stratégie de départage en cas d'égalité de distance :
1. Privilégie un candidat de même longueur (substitution >> suppression)
2. Sinon tri lexicographique croissant.
Retourne None si aucun code n'est à ≤ max_distance.
"""
norm = _normalize_cim10(code)
if not norm:
return None
if is_valid_cim10(norm):
return norm
conn = _get_conn()
length = len(norm)
cur = conn.execute(
"SELECT code FROM cim10 WHERE length(code) BETWEEN ? AND ?",
(length - max_distance, length + max_distance),
)
candidates: list[tuple[int, int, str]] = [] # (distance, |len_diff|, code)
for (cand,) in cur:
d = _levenshtein(norm, cand)
if d <= max_distance:
candidates.append((d, abs(len(cand) - length), cand))
if not candidates:
return None
# Tri : distance min, puis longueur la plus proche, puis lexicographique
candidates.sort(key=lambda t: (t[0], t[1], t[2]))
return candidates[0][2]
# ---------------------------------------------------------------------------
# Tests légers (exécutables sans pytest)
# ---------------------------------------------------------------------------
def _run_selftest() -> int:
"""Tests de fumée rapides. Retourne le nombre d'échecs."""
failures = 0
def check(label: str, cond: bool, detail: str = "") -> None:
nonlocal failures
status = "OK " if cond else "FAIL"
print(f" [{status}] {label}{('' + detail) if detail else ''}")
if not cond:
failures += 1
print("=== Tests référentiels ATIH 2018 ===")
# CIM-10
check("CIM-10 K650 valide (péritonite)", is_valid_cim10("K650"))
check("CIM-10 K65.0 (avec point) valide", is_valid_cim10("K65.0"))
check("CIM-10 T814 valide", is_valid_cim10("T814"))
check("CIM-10 ZZZ99 invalide", not is_valid_cim10("ZZZ99"))
check("CIM-10 libellé K650", get_cim10_libelle("K650") is not None,
detail=str(get_cim10_libelle("K650")))
# Correction OCR : K65O (lettre O) → K650
suggestion = nearest_cim10("K65O")
check("CIM-10 nearest(K65O) = K650", suggestion == "K650",
detail=f"got={suggestion}")
# CCAM
check("CCAM EBFA012 valide", is_valid_ccam("EBFA012"))
check("CCAM EBFA012-1 (ext PMSI) valide", is_valid_ccam("EBFA012-1"))
check("CCAM AAAA000 invalide", not is_valid_ccam("AAAA000"))
# GHM
check("GHM 01C031 valide", is_valid_ghm("01C031"))
check("GHM 99Z99Z invalide", not is_valid_ghm("99Z99Z"))
# GHS
check("GHS 22 valide", is_valid_ghs("22"))
check("GHS 99999 invalide", not is_valid_ghs("99999"))
# GHM→GHS
ghs_list = ghm_to_ghs("01C031")
check("GHM 01C031 → GHS inclut 22", "22" in ghs_list,
detail=f"ghs_list={ghs_list}")
# Format invalide (robustesse)
check("is_valid_cim10('') = False", not is_valid_cim10(""))
check("is_valid_ccam(None cast) = False", not is_valid_ccam(""))
print(f"=== Résultat : {failures} échec(s) ===")
return failures
def _cli() -> int:
parser = argparse.ArgumentParser(description="Référentiels ATIH 2018")
g = parser.add_mutually_exclusive_group(required=True)
g.add_argument("--build", action="store_true",
help="(Re)construit la base SQLite depuis referentials/sources/")
g.add_argument("--test", action="store_true",
help="Exécute les tests de fumée")
g.add_argument("--stats", action="store_true",
help="Affiche les comptages de la base")
args = parser.parse_args()
if args.build:
print(f"Construction de {DB_PATH} depuis {SOURCES_DIR}...")
counts = build_database()
print("OK :", counts)
return 0
if args.test:
return 1 if _run_selftest() > 0 else 0
if args.stats:
conn = _get_conn()
for tbl in ("cim10", "ccam", "ghm", "ghm_ghs"):
n = conn.execute(f"SELECT COUNT(*) FROM {tbl}").fetchone()[0]
print(f" {tbl:10s}: {n}")
print("Metadata :")
for k, v in conn.execute("SELECT key, value FROM metadata"):
print(f" {k}: {v}")
return 0
return 0
if __name__ == "__main__":
sys.exit(_cli())

pipeline/validation.py (new file, 217 lines)

@@ -0,0 +1,217 @@
"""Validation ATIH des codes extraits.
Prend un JSON d'extraction produit par `pipeline/extract.py` et l'enrichit
d'une section `_validation` par champ de code médical (CIM-10, CCAM, GHM, GHS)
avec :
- `valid` : le code existe dans le référentiel ATIH 2018
- `suggestion` : si invalide, code le plus proche par Levenshtein ≤ 1 (CIM-10)
- `libelle_ref` : libellé officiel ATIH (CIM-10) pour audit
Plus des cross-checks (GHS ∈ ghm_to_ghs(GHM)) pour détecter des incohérences
de groupage.
Principes :
- Lecture seule sur le JSON source — on produit une COPIE enrichie.
- Ne supprime / ne corrige RIEN automatiquement ; seule une suggestion est
annotée. La correction reste à la discrétion d'un humain (overlay) ou d'un
prochain pass automatique.
"""
from __future__ import annotations
from copy import deepcopy
from typing import Any
from .referentials import (
get_cim10_libelle,
ghm_to_ghs,
is_valid_ccam,
is_valid_cim10,
is_valid_ghm,
is_valid_ghs,
nearest_cim10,
)
# ============================================================
# Helpers
# ============================================================
def _check_cim10(code: str) -> dict:
"""Valide un code CIM-10 et suggère une correction si invalide."""
code = (code or "").strip()
if not code:
return {"code": "", "valid": None}
valid = is_valid_cim10(code)
entry = {"code": code, "valid": valid}
if valid:
entry["libelle_ref"] = get_cim10_libelle(code)
else:
sug = nearest_cim10(code, max_distance=1)
if sug:
entry["suggestion"] = sug
entry["suggestion_libelle"] = get_cim10_libelle(sug)
return entry
def _check_ccam(code: str) -> dict:
code = (code or "").strip()
if not code:
return {"code": "", "valid": None}
return {"code": code, "valid": is_valid_ccam(code)}
def _check_ghm(code: str) -> dict:
code = (code or "").strip()
if not code:
return {"code": "", "valid": None}
entry = {"code": code, "valid": is_valid_ghm(code)}
if entry["valid"]:
entry["ghs_possibles"] = ghm_to_ghs(code)
return entry
def _check_ghs(code: str) -> dict:
code = (code or "").strip()
if not code:
return {"code": "", "valid": None}
return {"code": code, "valid": is_valid_ghs(code)}
# ============================================================
# Validation d'un bloc codage (etab ou reco)
# ============================================================
def _validate_codage(codage: dict) -> dict:
"""Valide un bloc codage_etab ou codage_reco."""
if not isinstance(codage, dict):
return {}
out = {
"dp": _check_cim10(codage.get("dp", "")),
"dr": _check_cim10(codage.get("dr", "")),
}
das_list = codage.get("das") or []
if isinstance(das_list, list):
out["das"] = [_check_cim10(d.get("code", "")) if isinstance(d, dict) else _check_cim10(str(d))
for d in das_list]
return out
def _validate_actes(actes: Any) -> list[dict]:
if not isinstance(actes, list):
return []
return [_check_ccam(a.get("code", "")) if isinstance(a, dict) else _check_ccam(str(a))
for a in actes]
# ============================================================
# Cross-checks GHM ↔ GHS
# ============================================================
def _cross_check_ghm_ghs(ghm: str, ghs: str) -> dict:
"""Vérifie qu'un GHS observé est listé parmi les GHS possibles du GHM."""
ghm = (ghm or "").strip()
ghs = (ghs or "").strip()
if not ghm or not ghs:
return {"checked": False, "reason": "ghm ou ghs manquant"}
if not is_valid_ghm(ghm):
return {"checked": False, "reason": "GHM invalide"}
possibles = ghm_to_ghs(ghm)
# Normalisation simple : on compare la fin (au cas où l'un est tronqué)
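    # Exemple (hypothétique) : un GHS extrait "0432" est jugé cohérent si "432"
    # figure parmi les GHS possibles du GHM.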
ok = ghs in possibles or any(p.endswith(ghs) or ghs.endswith(p) for p in possibles)
return {
"checked": True,
"coherent": ok,
"ghs_extrait": ghs,
"ghs_possibles": possibles,
}
# ============================================================
# Point d'entrée
# ============================================================
def validate_recueil(recueil: dict) -> dict:
"""Retourne un dict résumé des validations pour la page recueil."""
v = {
"codage_etab": _validate_codage(recueil.get("codage_etab", {})),
"codage_reco": _validate_codage(recueil.get("codage_reco", {})),
"actes_etab": _validate_actes(recueil.get("actes_etab", [])),
"actes_reco": _validate_actes(recueil.get("actes_reco", [])),
"ghm_etab": _check_ghm(recueil.get("ghm_etab", "")),
"ghs_etab": _check_ghs(recueil.get("ghs_etab", "")),
"ghm_reco": _check_ghm(recueil.get("ghm_reco", "")),
"ghs_reco": _check_ghs(recueil.get("ghs_reco", "")),
"cross_checks": {
"etab": _cross_check_ghm_ghs(
recueil.get("ghm_etab", ""), recueil.get("ghs_etab", "")),
"reco": _cross_check_ghm_ghs(
recueil.get("ghm_reco", ""), recueil.get("ghs_reco", "")),
},
}
v["summary"] = _summarize(v)
return v
def _summarize(validation: dict) -> dict:
"""Compte les codes valides / invalides dans une section _validation."""
valid, invalid, empty = 0, 0, 0
def _count_entry(e):
nonlocal valid, invalid, empty
if e.get("valid") is True: valid += 1
elif e.get("valid") is False: invalid += 1
else: empty += 1
for section in ("codage_etab", "codage_reco"):
sec = validation.get(section, {}) or {}
_count_entry(sec.get("dp", {}))
_count_entry(sec.get("dr", {}))
for d in sec.get("das", []) or []:
_count_entry(d)
for actes_key in ("actes_etab", "actes_reco"):
for a in validation.get(actes_key, []) or []:
_count_entry(a)
for g in ("ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco"):
_count_entry(validation.get(g, {}))
cc = validation.get("cross_checks", {})
incoherent = sum(1 for v in cc.values() if v.get("checked") and not v.get("coherent"))
return {
"valid": valid, "invalid": invalid, "empty": empty,
"total_codes": valid + invalid,
"ghm_ghs_incoherents": incoherent,
}
def annotate(extraction: dict) -> dict:
"""Annote un JSON d'extraction complet avec validation ATIH.
Retourne une COPIE enrichie d'un bloc `_validation` à la racine de chaque
page structurée. N'efface / ne corrige aucune valeur.
"""
out = deepcopy(extraction)
ext = out.get("extraction") or {}
if "recueil" in ext and isinstance(ext["recueil"], dict):
ext["recueil"]["_validation"] = validate_recueil(ext["recueil"])
# Concertation 2 : valider les 3 GHS
if "concertation_2" in ext and isinstance(ext["concertation_2"], dict):
c2 = ext["concertation_2"]
c2["_validation"] = {
"ghs_initial": _check_ghs(c2.get("ghs_initial", "")),
"ghs_avant_concertation": _check_ghs(c2.get("ghs_avant_concertation", "")),
"ghs_final": _check_ghs(c2.get("ghs_final", "")),
}
return out
if __name__ == "__main__":
# Test rapide sur OGC 7
import json, sys
path = sys.argv[1] if len(sys.argv) > 1 else "output/v2/OGC 7.json"
with open(path) as f:
data = json.load(f)
annotated = annotate(data)
rec_v = annotated["extraction"]["recueil"]["_validation"]
print(json.dumps(rec_v["summary"], indent=2))
print("\ncross_checks:", json.dumps(rec_v["cross_checks"], indent=2, ensure_ascii=False))

(4 binary files not shown)

(file name not shown; content is the ClaML DTD, new file, 283 lines)

@@ -0,0 +1,283 @@
<?xml version="1.0" encoding="UTF-8"?>
<!ENTITY % rubric.simple "#PCDATA | Reference | Term">
<!ENTITY % rubric.complex "%rubric.simple; | Para | Include |
IncludeDescendants| Fragment | List | Table">
<!ELEMENT ClaML (
Meta*,
Identifier*,
Title,
Authors?,
Variants?,
ClassKinds,
UsageKinds?,
RubricKinds,
Modifier*,
ModifierClass*,
Class*)
>
<!ATTLIST ClaML
version CDATA #REQUIRED
>
<!ELEMENT Variants (Variant+)>
<!ELEMENT Variant (#PCDATA)>
<!ATTLIST Variant
name ID #REQUIRED
>
<!ELEMENT Meta EMPTY>
<!ATTLIST Meta
name CDATA #REQUIRED
value CDATA #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT Identifier EMPTY>
<!ATTLIST Identifier
authority NMTOKEN #IMPLIED
uid CDATA #REQUIRED
>
<!ELEMENT Title (#PCDATA)>
<!ATTLIST Title
name NMTOKEN #REQUIRED
version CDATA #IMPLIED
date CDATA #IMPLIED
>
<!ELEMENT Authors (Author* )>
<!ELEMENT Author (#PCDATA)>
<!ATTLIST Author
name ID #REQUIRED
>
<!ELEMENT ClassKinds (ClassKind+)>
<!ELEMENT RubricKinds (RubricKind+)>
<!ELEMENT UsageKinds (UsageKind+)>
<!ELEMENT ClassKind (Display*)>
<!ATTLIST ClassKind
name ID #REQUIRED
>
<!ELEMENT RubricKind (Display*)>
<!ATTLIST RubricKind
name ID #REQUIRED
inherited (true|false) "true"
>
<!ELEMENT UsageKind EMPTY>
<!ATTLIST UsageKind
name ID #REQUIRED
mark CDATA #REQUIRED
>
<!ELEMENT Display (#PCDATA)>
<!ATTLIST Display
xml:lang NMTOKEN #REQUIRED
variants IDREF #IMPLIED
>
<!ELEMENT Modifier (
Meta*,
SubClass*,
Rubric*,
History*)
>
<!ATTLIST Modifier
code NMTOKEN #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT ModifierClass (
Meta*,
SuperClass,
SubClass*,
Rubric*,
History*)
>
<!ATTLIST ModifierClass
modifier NMTOKEN #REQUIRED
code NMTOKEN #REQUIRED
usage IDREF #IMPLIED
variants IDREFS #IMPLIED
>
<!ELEMENT Class (
Meta*,
SuperClass*,
SubClass*,
ModifiedBy*,
ExcludeModifier*,
Rubric*,
History*)
>
<!ATTLIST Class
code CDATA #REQUIRED
kind IDREF #REQUIRED
usage IDREF #IMPLIED
variants IDREFS #IMPLIED
>
<!ELEMENT ModifiedBy (
Meta*,
ValidModifierClass*)
>
<!ATTLIST ModifiedBy
code NMTOKEN #REQUIRED
all (true|false) "true"
position CDATA #IMPLIED
variants IDREFS #IMPLIED
>
<!ELEMENT ExcludeModifier EMPTY>
<!ATTLIST ExcludeModifier
code NMTOKEN #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT ValidModifierClass EMPTY>
<!ATTLIST ValidModifierClass
code NMTOKEN #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT Rubric (
Label+,
History*)
>
<!ATTLIST Rubric
id ID #IMPLIED
kind IDREF #REQUIRED
usage IDREF #IMPLIED
>
<!ELEMENT Label (%rubric.complex;)*>
<!ATTLIST Label
xml:lang NMTOKEN #REQUIRED
xml:space (default|preserve) "default"
variants IDREFS #IMPLIED
>
<!ELEMENT History (#PCDATA)>
<!ATTLIST History
author IDREF #REQUIRED
date NMTOKEN #REQUIRED
>
<!ELEMENT SuperClass EMPTY>
<!ATTLIST SuperClass
code CDATA #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT SubClass EMPTY>
<!ATTLIST SubClass
code CDATA #REQUIRED
variants IDREFS #IMPLIED
>
<!ELEMENT Reference (#PCDATA)>
<!ATTLIST Reference
class CDATA #IMPLIED
authority NMTOKEN #IMPLIED
uid NMTOKEN #IMPLIED
code CDATA #IMPLIED
usage IDREF #IMPLIED
variants IDREFS #IMPLIED
>
<!ELEMENT Para (%rubric.simple;)*>
<!ATTLIST Para
class CDATA #IMPLIED
>
<!ELEMENT Fragment (%rubric.simple;)*>
<!ATTLIST Fragment
class CDATA #IMPLIED
usage IDREF #IMPLIED
type (item | list) "item"
>
<!ELEMENT Include EMPTY>
<!ATTLIST Include
class CDATA #IMPLIED
rubric IDREF #REQUIRED
>
<!ELEMENT IncludeDescendants EMPTY>
<!ATTLIST IncludeDescendants
code NMTOKEN #REQUIRED
kind IDREF #REQUIRED
>
<!ELEMENT List (ListItem+)>
<!ATTLIST List
class CDATA #IMPLIED
>
<!ELEMENT ListItem (
%rubric.simple;
| Para
| Include
| List
| Table)*
>
<!ATTLIST ListItem
class CDATA #IMPLIED
>
<!ELEMENT Table (
Caption?,
THead?,
TBody?,
TFoot?)
>
<!ATTLIST Table
class CDATA #IMPLIED
>
<!ELEMENT Caption (%rubric.simple;)*>
<!ATTLIST Caption
class CDATA #IMPLIED
>
<!ELEMENT THead (Row+)>
<!ATTLIST THead
class CDATA #IMPLIED
>
<!ELEMENT TBody (Row+)>
<!ATTLIST TBody
class CDATA #IMPLIED
>
<!ELEMENT TFoot (Row+)>
<!ATTLIST TFoot
class CDATA #IMPLIED
>
<!ELEMENT Row (Cell*)>
<!ATTLIST Row
class CDATA #IMPLIED
>
<!ELEMENT Cell (
%rubric.simple;
| Para
| Include
| List
| Table)*
>
<!ATTLIST Cell
class CDATA #IMPLIED
rowspan CDATA #IMPLIED
colspan CDATA #IMPLIED
>
<!ELEMENT Term (#PCDATA)>
<!ATTLIST Term
class CDATA #IMPLIED
>

(1 file diff suppressed because it is too large; 6 binary files not shown)

tests/__init__.py (new file, empty)

tests/test_referentials.py (new file, 160 lines)

@@ -0,0 +1,160 @@
"""Tests unitaires du module pipeline.referentials.
Compatible pytest ET exécution directe (`python tests/test_referentials.py`).
Nécessite que la base SQLite ait déjà été construite :
python -m pipeline.referentials --build
"""
from __future__ import annotations
import sys
from pathlib import Path
# Permet l'exécution directe depuis tests/ sans installer le package.
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
from pipeline.referentials import ( # noqa: E402
DB_PATH,
ghm_to_ghs,
get_cim10_libelle,
is_valid_ccam,
is_valid_cim10,
is_valid_ghm,
is_valid_ghs,
nearest_cim10,
)
# ---------------------------------------------------------------------------
# CIM-10
# ---------------------------------------------------------------------------
def test_cim10_codes_valides():
"""Quelques codes courants du référentiel PMSI."""
assert is_valid_cim10("K650") # Péritonite aigüe
assert is_valid_cim10("T814") # Infection après acte
assert is_valid_cim10("I10") # Hypertension essentielle
assert is_valid_cim10("Z515") # Soins palliatifs
assert is_valid_cim10("C509") # Tumeur maligne du sein
def test_cim10_normalisation():
"""Tolère le point décimal (K65.0) et la casse (k650)."""
assert is_valid_cim10("K65.0")
assert is_valid_cim10("k650")
assert is_valid_cim10(" K650 ")
def test_cim10_codes_invalides():
assert not is_valid_cim10("")
assert not is_valid_cim10("ZZZ99")
assert not is_valid_cim10("K6501234") # trop long
assert not is_valid_cim10("1234") # ne commence pas par lettre
assert not is_valid_cim10("K65X") # suffixe non numérique
def test_cim10_libelle():
lib = get_cim10_libelle("K650")
    assert lib is not None and ("éritonit" in lib.lower() or "peritonit" in lib.lower())
def test_cim10_nearest_correction_ocr():
# O (lettre) lu au lieu de 0 (chiffre)
assert nearest_cim10("K65O") == "K650"
# Code déjà valide : renvoyé tel quel
assert nearest_cim10("K650") == "K650"
# Aucune correspondance à distance <= 1
assert nearest_cim10("ZZZZZ", max_distance=1) is None
# ---------------------------------------------------------------------------
# CCAM
# ---------------------------------------------------------------------------
def test_ccam_codes_valides():
assert is_valid_ccam("EBFA012")
assert is_valid_ccam("HFCC003") # Bypass gastrique (mentionné dans la V5)
assert is_valid_ccam("ebfa012") # casse insensible
assert is_valid_ccam("EBFA012-1") # extension PMSI tolérée
def test_ccam_codes_invalides():
assert not is_valid_ccam("AAAA000")
assert not is_valid_ccam("")
assert not is_valid_ccam("EBF012") # 3 lettres au lieu de 4
# ---------------------------------------------------------------------------
# GHM
# ---------------------------------------------------------------------------
def test_ghm_codes_valides():
assert is_valid_ghm("01C031")
assert is_valid_ghm("01c031")
def test_ghm_codes_invalides():
assert not is_valid_ghm("99Z99Z")
assert not is_valid_ghm("")
assert not is_valid_ghm("ABCDEF")
# ---------------------------------------------------------------------------
# GHS et couplage GHM→GHS
# ---------------------------------------------------------------------------
def test_ghs_valide():
assert is_valid_ghs("22")
assert is_valid_ghs("0022") # zéros de tête tolérés
assert not is_valid_ghs("99999")
assert not is_valid_ghs("")
def test_ghm_to_ghs():
ghs = ghm_to_ghs("01C031")
assert "22" in ghs
# GHM inexistant → liste vide
assert ghm_to_ghs("99Z99Z") == []
# ---------------------------------------------------------------------------
# Exécution directe (sans pytest)
# ---------------------------------------------------------------------------
def _main() -> int:
import traceback
tests = [
("test_cim10_codes_valides", test_cim10_codes_valides),
("test_cim10_normalisation", test_cim10_normalisation),
("test_cim10_codes_invalides", test_cim10_codes_invalides),
("test_cim10_libelle", test_cim10_libelle),
("test_cim10_nearest_correction_ocr", test_cim10_nearest_correction_ocr),
("test_ccam_codes_valides", test_ccam_codes_valides),
("test_ccam_codes_invalides", test_ccam_codes_invalides),
("test_ghm_codes_valides", test_ghm_codes_valides),
("test_ghm_codes_invalides", test_ghm_codes_invalides),
("test_ghs_valide", test_ghs_valide),
("test_ghm_to_ghs", test_ghm_to_ghs),
]
if not DB_PATH.exists():
print(f"ERREUR : base SQLite manquante ({DB_PATH}).")
print("Exécute d'abord : python -m pipeline.referentials --build")
return 2
failures = 0
for name, fn in tests:
try:
fn()
print(f" [OK ] {name}")
except AssertionError as e:
print(f" [FAIL] {name}{e}")
failures += 1
except Exception:
print(f" [ERR] {name}")
traceback.print_exc()
failures += 1
print(f"=== {len(tests) - failures}/{len(tests)} tests OK ===")
return 0 if failures == 0 else 1
if __name__ == "__main__":
sys.exit(_main())