feat(referentials): validation ATIH 2018 des codes médicaux
Ajoute une couche de validation post-extraction contre les référentiels
officiels de l'ATIH (Agence Technique de l'Information sur
l'Hospitalisation) pour 2018. Zéro tolérance sur les codes T2A : un
code invalide est flaggé, et une correction par plus proche voisin
(Levenshtein ≤ 1) est proposée.
Contenu :
- pipeline/referentials.py : API publique is_valid_{cim10,ccam,ghm,ghs},
get_cim10_libelle, nearest_cim10, ghm_to_ghs. CLI --build/--test/--stats.
- pipeline/validation.py : annote un JSON d'extraction avec un bloc
`_validation` par page (codes valides/invalides + suggestions + cross-
checks GHM↔GHS).
- referentials/sources/ : données brutes ATIH publiques (CIM-10 ClaML
2019 substitut, CCAM v5 2018, GHM v2018, tarifs fév. 2018).
- referentials/atih_2018.sqlite : base SQLite prête à l'emploi
(11 623 CIM-10 · 8 147 CCAM · 2 593 GHM · 5 329 couples GHM→GHS).
- tests/test_referentials.py : 11 tests unitaires (11/11 passent).
- annotate_validation.py : script qui annote tous les JSONs V2 en
place et produit validation_report.md.
Note CIM-10 : la version 2018 ATIH n'est publiée qu'en PDF, ClaML 2019
est utilisée en substitut (écart connu ≈ 60 codes / 11 600).
Gestion des suffixes PMSI : `*` (CMA exclue par le DP) et `+N`
(extension PMSI) sont strippés avant validation, le code racine seul
est comparé au référentiel.
Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
This commit is contained in:
136
annotate_validation.py
Normal file
136
annotate_validation.py
Normal file
@@ -0,0 +1,136 @@
|
||||
"""Annote les JSONs V2 existants avec la validation ATIH.
|
||||
|
||||
Utile pour ajouter la validation sans relancer l'extraction complète.
|
||||
Produit aussi un rapport agrégé en markdown.
|
||||
"""
|
||||
import json
|
||||
from collections import defaultdict
|
||||
from pathlib import Path
|
||||
from pipeline.validation import annotate
|
||||
|
||||
|
||||
# Directory holding the V2 extraction JSONs; they are rewritten in place.
OUT_DIR = Path("output/v2")
# Destination of the aggregate markdown report produced by build_report().
REPORT = Path("validation_report.md")
|
||||
|
||||
|
||||
def annotate_all() -> list[dict]:
    """Run ATIH validation on every V2 JSON and rewrite each file in place.

    Each document gains a ``_validation`` block (added by
    ``pipeline.validation.annotate``); a one-line status summary is printed
    per dossier for quick visual inspection. Returns the annotated documents.
    """
    annotated_docs: list[dict] = []
    for json_path in sorted(OUT_DIR.glob("OGC *.json")):
        payload = json.loads(json_path.read_text(encoding="utf-8"))
        enriched = annotate(payload)
        json_path.write_text(json.dumps(enriched, ensure_ascii=False, indent=2), encoding="utf-8")
        annotated_docs.append(enriched)
        validation = enriched.get("extraction", {}).get("recueil", {}).get("_validation", {})
        summary = validation.get("summary", {})
        checks = validation.get("cross_checks", {})
        print(f" {payload['fichier']:8s} — valid={summary.get('valid',0):2d} invalid={summary.get('invalid',0):2d} "
              f"empty={summary.get('empty',0):2d} incoherent={summary.get('ghm_ghs_incoherents',0)} "
              f"etab={checks.get('etab',{}).get('coherent','?')} reco={checks.get('reco',{}).get('coherent','?')}")
    return annotated_docs
|
||||
|
||||
|
||||
def build_report(results: list[dict]):
    """Aggregate per-field validity stats over all dossiers and write the report.

    Produces a markdown file at ``REPORT`` with: validity rate per field,
    suggested OCR corrections (Levenshtein ≤ 1), GHM↔GHS incoherences and a
    final synthesis. ``results`` is the list returned by ``annotate_all``.
    """
    per_field = defaultdict(lambda: {"total": 0, "valid": 0, "invalid": 0, "empty": 0, "suggestions": []})
    incoherences = []

    def _tally(st: dict, entry: dict, name: str) -> None:
        # Fold one validation entry into a per-field accumulator.
        # valid=True / valid=False / valid=None (empty field) are the three states.
        st["total"] += 1
        if entry.get("valid") is True:
            st["valid"] += 1
        elif entry.get("valid") is False:
            st["invalid"] += 1
            if "suggestion" in entry:
                st["suggestions"].append((name, entry["code"], entry["suggestion"]))
        else:
            st["empty"] += 1

    for d in results:
        name = d["fichier"]
        rec_v = d.get("extraction", {}).get("recueil", {}).get("_validation", {})
        if not rec_v:
            continue

        # Codes unitaires (GHM / GHS, établissement et recommandation)
        for key in ["ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco"]:
            _tally(per_field[key], rec_v.get(key, {}), name)

        # Blocs codage etab / reco : dp + dr + das
        for section in ["codage_etab", "codage_reco"]:
            sec = rec_v.get(section, {})
            for sub in ["dp", "dr"]:
                _tally(per_field[f"{section}.{sub}"], sec.get(sub, {}), name)
            for das in sec.get("das", []) or []:
                _tally(per_field[f"{section}.das"], das, name)

        # Cohérence GHM ↔ GHS (cross-checks produced by pipeline.validation)
        for side in ["etab", "reco"]:
            cc = rec_v.get("cross_checks", {}).get(side, {})
            if cc.get("checked") and not cc.get("coherent"):
                incoherences.append({
                    "dossier": name, "side": side,
                    "ghs_extrait": cc.get("ghs_extrait"),
                    "ghs_possibles": cc.get("ghs_possibles"),
                })

    # Markdown report
    lines = ["# Rapport de validation ATIH — V2 (18 dossiers)\n"]
    lines.append("## Couverture et validité par champ\n")
    lines.append("| Champ | Total | Valid | Invalid | Vide | Validité codes renseignés |")
    lines.append("|---|---:|---:|---:|---:|---:|")
    for f, st in per_field.items():
        renseignes = st["valid"] + st["invalid"]
        ratio = (100 * st["valid"] / renseignes) if renseignes else 0
        lines.append(f"| `{f}` | {st['total']} | {st['valid']} | {st['invalid']} | {st['empty']} | {ratio:.0f}% |")

    # Suggestions OCR
    lines.append("\n## Corrections OCR suggérées (Levenshtein ≤ 1)")
    lines.append("\nCodes extraits invalides mais ressemblant à un code ATIH existant :\n")
    lines.append("| Dossier | Champ | Code extrait | Suggestion |")
    lines.append("|---|---|---|---|")
    sugg_count = 0
    for field, st in per_field.items():
        for name, code, sug in st["suggestions"]:
            lines.append(f"| {name} | `{field}` | `{code}` | **`{sug}`** |")
            sugg_count += 1
    if sugg_count == 0:
        lines.append("| — | — | — | Aucune suggestion (pas de correction Levenshtein ≤ 1) |")

    # Incohérences GHM ↔ GHS
    lines.append("\n## Incohérences GHM ↔ GHS détectées\n")
    if incoherences:
        lines.append("| Dossier | Côté | GHS extrait | GHS possibles pour le GHM |")
        lines.append("|---|---|---|---|")
        for inc in incoherences:
            lines.append(f"| {inc['dossier']} | {inc['side']} | `{inc['ghs_extrait']}` | {inc['ghs_possibles']} |")
    else:
        lines.append("✓ Aucune incohérence détectée sur les GHM/GHS extraits.")

    lines.append("\n## Synthèse\n")
    total_codes = sum(st["valid"] + st["invalid"] for st in per_field.values())
    total_valid = sum(st["valid"] for st in per_field.values())
    # Guard against an empty run (no JSONs / no filled codes): the original
    # expression raised ZeroDivisionError when total_codes == 0.
    pct_valid = (100 * total_valid / total_codes) if total_codes else 0.0
    lines.append(f"- **{total_valid}/{total_codes} codes valides** ({pct_valid:.1f}%)")
    lines.append(f"- **{sugg_count} suggestions de correction OCR** trouvées automatiquement")
    lines.append(f"- **{len(incoherences)} incohérences GHM↔GHS** sur les paires extraites")

    REPORT.write_text("\n".join(lines), encoding="utf-8")
    print(f"\nRapport → {REPORT}")
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Entry point: annotate every V2 JSON in place, then write the aggregate
    # markdown report.
    print("Annotation en place des JSONs V2 + calcul validation ATIH...\n")
    results = annotate_all()
    build_report(results)
|
||||
597
pipeline/referentials.py
Normal file
597
pipeline/referentials.py
Normal file
@@ -0,0 +1,597 @@
|
||||
"""Validation des codes médicaux contre les référentiels ATIH 2018.
|
||||
|
||||
Ce module charge les référentiels officiels ATIH (CIM-10, CCAM, GHM, table
|
||||
GHM→GHS) dans une base SQLite locale et expose des fonctions de validation
|
||||
pour les codes extraits par le pipeline OCR.
|
||||
|
||||
Sources téléchargées (voir `referentials/sources/`) :
|
||||
- **CIM-10 FR 2019** au format ClaML XML (ATIH) — utilisée comme substitut
|
||||
à la CIM-10 2018 : ATIH ne publie officiellement la CIM-10 2018 qu'en PDF.
|
||||
L'écart entre CIM-10 2019 et CIM-10 2018 est < 100 codes sur ~11 600 ;
|
||||
un écart acceptable pour une validation OCR (et qui peut introduire
|
||||
quelques faux positifs pour des codes créés en 2019, mais jamais de faux
|
||||
négatifs sur un code 2018 valide).
|
||||
- **CCAM descriptive à usage PMSI 2018 V5** (XLSX ATIH).
|
||||
- **GHM V2018** (XLSX ATIH, fichier `regroupement_ghm_v2018.xlsx`).
|
||||
- **Arrêté tarifaire MCO Février 2018** (XLSX ATIH, feuilles "Tarifs public"
|
||||
et "Tarifs privé") pour la table GHM→GHS.
|
||||
|
||||
Formats de codes supportés :
|
||||
- CIM-10 : lettre + 2 à 5 chiffres (ex: K650, T814, sans point).
|
||||
- CCAM : 4 lettres + 3 chiffres (ex: EBFA012).
|
||||
- GHM : 2 chiffres + lettre + 3 chiffres (ex: 11M122).
|
||||
- GHS : nombre 1-5 chiffres (ex: 4323).
|
||||
|
||||
Utilisation :
|
||||
from pipeline.referentials import (
|
||||
is_valid_cim10, is_valid_ccam, is_valid_ghm, is_valid_ghs,
|
||||
nearest_cim10, ghm_to_ghs, get_cim10_libelle,
|
||||
)
|
||||
if not is_valid_cim10("K650"):
|
||||
suggestion = nearest_cim10("K65O") # correction O → 0
|
||||
|
||||
Build initial de la base : ``python -m pipeline.referentials --build``
|
||||
Test rapide : ``python -m pipeline.referentials --test``
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import argparse
|
||||
import gzip
|
||||
import json
|
||||
import re
|
||||
import sqlite3
|
||||
import sys
|
||||
import xml.etree.ElementTree as ET
|
||||
from functools import lru_cache
|
||||
from pathlib import Path
|
||||
from typing import Iterable
|
||||
|
||||
# Optional speed-up: rapidfuzz's native Levenshtein implementation.
# When absent, _levenshtein() falls back to a pure-Python version.
try:
    from rapidfuzz.distance import Levenshtein as _Lev
    _HAS_RAPIDFUZZ = True
except ImportError:  # pragma: no cover - pure-Python fallback
    _HAS_RAPIDFUZZ = False
|
||||
|
||||
# Project root = parent of the pipeline/ package directory.
_ROOT = Path(__file__).resolve().parent.parent
REFERENTIALS_DIR = _ROOT / "referentials"
SOURCES_DIR = REFERENTIALS_DIR / "sources"
DB_PATH = REFERENTIALS_DIR / "atih_2018.sqlite"

# Expected code shapes (used to sanity-check normalised input before any DB lookup)
_RE_CIM10 = re.compile(r"^[A-Z][0-9]{2,5}$")              # letter + 2-5 digits, no dot
_RE_CCAM = re.compile(r"^[A-Z]{4}[0-9]{3}$")              # 4 letters + 3 digits
_RE_GHM = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")  # e.g. 11M122
_RE_GHS = re.compile(r"^[0-9]{1,5}$")                     # 1-5 digit number
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Normalisation des entrées (tolérante aux bruits OCR courants)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _normalize_cim10(code: str) -> str:
|
||||
"""Normalise un code CIM-10 extrait pour comparaison au référentiel.
|
||||
|
||||
Gère :
|
||||
- Point décimal optionnel : "K65.0" → "K650"
|
||||
- Espaces / casse : " k650 " → "K650"
|
||||
- Suffixes PMSI : "C795 *" → "C795" (le `*` signifie "CMA exclue par le DP")
|
||||
et "K635+0" → "K635" (le `+N` est une extension PMSI à valider séparément)
|
||||
- Suffixe de position numérique éventuellement collé : "K650+" → "K650"
|
||||
"""
|
||||
if not code:
|
||||
return ""
|
||||
s = code.strip().upper()
|
||||
# Couper à la première occurrence d'un marqueur PMSI non-alphanum
|
||||
# (*, +, #, espace suivi d'un marqueur). On garde uniquement la tête du code.
|
||||
for sep in ("*", "+", "#"):
|
||||
if sep in s:
|
||||
s = s.split(sep, 1)[0]
|
||||
return s.replace(".", "").replace(" ", "").strip()
|
||||
|
||||
|
||||
def _normalize_ccam(code: str) -> str:
|
||||
if not code:
|
||||
return ""
|
||||
# Retire éventuelle extension PMSI (-1, -2…) et les espaces
|
||||
base = code.split("-")[0]
|
||||
return base.replace(" ", "").strip().upper()
|
||||
|
||||
|
||||
def _normalize_ghm(code: str) -> str:
|
||||
if not code:
|
||||
return ""
|
||||
return code.replace(" ", "").strip().upper()
|
||||
|
||||
|
||||
def _normalize_ghs(code: str) -> str:
|
||||
if not code:
|
||||
return ""
|
||||
# Les GHS peuvent arriver en "0023" ou "23"
|
||||
s = re.sub(r"[^0-9]", "", code).lstrip("0")
|
||||
return s or "0"
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Construction de la base SQLite depuis les sources téléchargées
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _create_schema(conn: sqlite3.Connection) -> None:
    """Drop and recreate all tables so that build_database() is idempotent.

    Tables: cim10 / ccam / ghm map code -> label (plus ASO/DA for GHM);
    ghm_ghs holds the GHM→GHS mapping per sector with tariffs; metadata is a
    free-form key/value table used for provenance (shown by --stats).
    """
    conn.executescript(
        """
        DROP TABLE IF EXISTS cim10;
        DROP TABLE IF EXISTS ccam;
        DROP TABLE IF EXISTS ghm;
        DROP TABLE IF EXISTS ghm_ghs;
        DROP TABLE IF EXISTS metadata;

        CREATE TABLE cim10 (
            code TEXT PRIMARY KEY,
            libelle TEXT
        );
        CREATE TABLE ccam (
            code TEXT PRIMARY KEY,
            libelle TEXT
        );
        CREATE TABLE ghm (
            code TEXT PRIMARY KEY,
            libelle TEXT,
            aso TEXT,
            da TEXT
        );
        CREATE TABLE ghm_ghs (
            ghm TEXT,
            ghs TEXT,
            secteur TEXT, -- 'public' ou 'prive'
            libelle TEXT,
            tarif REAL,
            PRIMARY KEY (ghm, ghs, secteur)
        );
        CREATE INDEX idx_ghm_ghs_ghm ON ghm_ghs(ghm);
        CREATE INDEX idx_ghm_ghs_ghs ON ghm_ghs(ghs);

        CREATE TABLE metadata (
            key TEXT PRIMARY KEY,
            value TEXT
        );
        """
    )
|
||||
|
||||
|
||||
def _load_cim10(conn: sqlite3.Connection) -> int:
    """Load the French CIM-10 from the ClaML XML export (categories only).

    Only ``Class`` elements with kind="category" are kept (chapters and
    blocks are skipped). Returns the number of codes inserted.
    """
    xml_path = SOURCES_DIR / "cim10_claml_2019_extracted" / "cim10_claml_2019.xml"
    if not xml_path.exists():
        # Fallback: accept any XML in the extracted directory (the file name
        # inside the ATIH zip is not guaranteed).
        xmls = list((SOURCES_DIR / "cim10_claml_2019_extracted").glob("*.xml"))
        if not xmls:
            raise FileNotFoundError(
                f"CIM-10 ClaML introuvable dans {SOURCES_DIR}. "
                f"Assurez-vous d'avoir téléchargé et extrait le zip ATIH."
            )
        xml_path = xmls[0]

    tree = ET.parse(xml_path)
    root = tree.getroot()
    rows: list[tuple[str, str]] = []
    for cls in root.findall(".//Class"):
        kind = cls.get("kind")
        if kind != "category":
            continue
        raw_code = cls.get("code") or ""
        # Stored without the dot, matching _normalize_cim10's output format.
        code = raw_code.replace(".", "").upper().strip()
        if not code:
            continue
        # The "preferred" rubric carries the official label.
        pref = cls.find('.//Rubric[@kind="preferred"]/Label')
        libelle = pref.text.strip() if (pref is not None and pref.text) else ""
        rows.append((code, libelle))

    conn.executemany(
        "INSERT OR REPLACE INTO cim10 (code, libelle) VALUES (?, ?)", rows
    )
    return len(rows)
|
||||
|
||||
|
||||
def _load_ccam(conn: sqlite3.Connection) -> int:
    """Load the CCAM 2018 from the ATIH XLSX (sheets named CCAM_Final_2018_*).

    The workbook interleaves code rows and description rows; the first
    non-empty text cell following a code becomes its label. Returns the
    number of distinct CCAM codes inserted.
    """
    import openpyxl

    xlsx_path = SOURCES_DIR / "ccam_2018_v5.xlsx"
    if not xlsx_path.exists():
        raise FileNotFoundError(f"CCAM XLSX introuvable : {xlsx_path}")

    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
    pat = re.compile(r"^[A-Z]{4}[0-9]{3}$")
    seen: dict[str, str] = {}
    for sheet_name in wb.sheetnames:
        if not sheet_name.startswith("CCAM_Final_2018"):
            continue
        ws = wb[sheet_name]
        cur_code: str | None = None
        for row in ws.iter_rows(values_only=True):
            # col 0: sometimes a code, col 3: free text / label
            col0 = row[0] if len(row) > 0 else None
            col3 = row[3] if len(row) > 3 else None
            if isinstance(col0, str):
                c = col0.strip()
                if pat.match(c):
                    cur_code = c
                    if c not in seen:
                        seen[c] = ""
            # Keep only the FIRST label seen for a code (truncated to 500
            # chars); later text rows belong to sub-entries and are ignored.
            if cur_code and isinstance(col3, str) and col3.strip():
                if not seen.get(cur_code):
                    seen[cur_code] = col3.strip()[:500]

    rows = list(seen.items())
    conn.executemany(
        "INSERT OR REPLACE INTO ccam (code, libelle) VALUES (?, ?)", rows
    )
    return len(rows)
|
||||
|
||||
|
||||
def _load_ghm(conn: sqlite3.Connection) -> int:
    """Load the GHM V2018 list from regroupement_ghm_v2018.xlsx.

    Rows before the "GHM" header row are skipped; each data row contributes
    (code, libellé, ASO, DA). Returns the number of GHM codes inserted.
    """
    import openpyxl

    xlsx_path = SOURCES_DIR / "regroupement_ghm_v2018.xlsx"
    if not xlsx_path.exists():
        raise FileNotFoundError(f"GHM XLSX introuvable : {xlsx_path}")

    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
    ws = wb[wb.sheetnames[0]]
    ghm_pat = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")
    rows: list[tuple[str, str, str, str]] = []
    header_found = False

    def _cell(row: tuple, idx: int) -> str:
        # read_only mode can yield short (or empty) row tuples: index
        # defensively, and coerce non-string cells (None, numbers) to "".
        # This mirrors the len(row) > n guards used in _load_ghm_ghs; the
        # original indexed row[0..3] directly and could raise IndexError.
        val = row[idx] if len(row) > idx else None
        return val.strip() if isinstance(val, str) else ""

    for row in ws.iter_rows(values_only=True):
        if not header_found:
            if row and row[0] == "GHM":
                header_found = True
            continue
        code = _cell(row, 0).upper()
        # Non-string / non-GHM cells (section titles, totals) are skipped.
        if not ghm_pat.match(code):
            continue
        rows.append((code, _cell(row, 1), _cell(row, 2), _cell(row, 3)))

    conn.executemany(
        "INSERT OR REPLACE INTO ghm (code, libelle, aso, da) VALUES (?, ?, ?, ?)",
        rows,
    )
    return len(rows)
|
||||
|
||||
|
||||
def _load_ghm_ghs(conn: sqlite3.Connection) -> int:
    """Load the GHM→GHS table from tarif_arrete_fev_2018.xlsx.

    Sheets "Tarifs public" (secteur='public') and "Tarifs privé"
    (secteur='prive'). Each data row is one (GHS, GHM, libellé, tarif)
    tuple. Returns the number of rows inserted, both sectors combined.
    """
    import openpyxl

    xlsx_path = SOURCES_DIR / "tarif_arrete_fev_2018.xlsx"
    if not xlsx_path.exists():
        raise FileNotFoundError(f"Tarifs XLSX introuvable : {xlsx_path}")

    wb = openpyxl.load_workbook(xlsx_path, read_only=True, data_only=True)
    ghm_pat = re.compile(r"^[0-9]{2}[A-Z][0-9]{2,3}[A-Z]?$")
    all_rows: list[tuple[str, str, str, str, float | None]] = []
    for sheet_name, secteur in [("Tarifs public", "public"), ("Tarifs privé", "prive")]:
        if sheet_name not in wb.sheetnames:
            continue
        ws = wb[sheet_name]
        header_found = False
        for row in ws.iter_rows(values_only=True):
            if not header_found:
                # Skip everything up to and including the "GHS" header row.
                if row and isinstance(row[0], str) and row[0].strip().upper() == "GHS":
                    header_found = True
                continue
            # read_only mode can yield short or empty row tuples: guard every
            # index (the original read row[0] unguarded and could IndexError).
            ghs_raw = row[0] if len(row) > 0 else None
            ghm_raw = row[1] if len(row) > 1 else None
            lib_raw = row[2] if len(row) > 2 else None
            tarif_raw = row[5] if len(row) > 5 else None
            if ghs_raw is None or ghm_raw is None:
                continue
            try:
                # GHS cells may come through as floats (e.g. 4323.0).
                ghs = str(int(float(ghs_raw)))
            except (ValueError, TypeError):
                continue
            ghm = str(ghm_raw).strip().upper()
            if not ghm_pat.match(ghm):
                continue
            libelle = str(lib_raw).strip() if lib_raw else ""
            try:
                tarif = float(tarif_raw) if tarif_raw is not None else None
            except (ValueError, TypeError):
                tarif = None
            all_rows.append((ghm, ghs, secteur, libelle, tarif))

    conn.executemany(
        "INSERT OR REPLACE INTO ghm_ghs (ghm, ghs, secteur, libelle, tarif) "
        "VALUES (?, ?, ?, ?, ?)",
        all_rows,
    )
    return len(all_rows)
|
||||
|
||||
|
||||
def build_database(db_path: Path = DB_PATH, verbose: bool = True) -> dict[str, int]:
    """Build the SQLite database from the downloaded ATIH sources.

    Returns the row counts per table. Idempotent: DROP + CREATE + INSERT.
    Raises FileNotFoundError when a source file is missing (see _load_*).
    """
    REFERENTIALS_DIR.mkdir(parents=True, exist_ok=True)
    conn = sqlite3.connect(db_path)
    try:
        _create_schema(conn)
        n_cim10 = _load_cim10(conn)
        if verbose:
            print(f" CIM-10 : {n_cim10} codes chargés")
        n_ccam = _load_ccam(conn)
        if verbose:
            print(f" CCAM : {n_ccam} codes chargés")
        n_ghm = _load_ghm(conn)
        if verbose:
            print(f" GHM : {n_ghm} codes chargés")
        n_ghs = _load_ghm_ghs(conn)
        if verbose:
            print(f" GHM→GHS : {n_ghs} lignes (public+privé)")

        # Record provenance and counts for auditability (shown by --stats).
        conn.executemany(
            "INSERT OR REPLACE INTO metadata (key, value) VALUES (?, ?)",
            [
                ("source_cim10", "ATIH CIM-10 FR 2019 ClaML (substitut 2018)"),
                ("source_ccam", "ATIH CCAM descriptive à usage PMSI 2018 V5"),
                ("source_ghm", "ATIH regroupement_ghm_v2018.xlsx"),
                ("source_ghm_ghs", "ATIH tarif_arrete_fev_2018.xlsx"),
                ("n_cim10", str(n_cim10)),
                ("n_ccam", str(n_ccam)),
                ("n_ghm", str(n_ghm)),
                ("n_ghm_ghs", str(n_ghs)),
            ],
        )
        conn.commit()
        return {
            "cim10": n_cim10,
            "ccam": n_ccam,
            "ghm": n_ghm,
            "ghm_ghs": n_ghs,
        }
    finally:
        conn.close()
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Accès à la base (connexion cachée au niveau du module)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
# Module-level cached read-only connection, opened lazily by _get_conn().
_CONN: sqlite3.Connection | None = None


def _get_conn() -> sqlite3.Connection:
    """Return the shared read-only SQLite connection, opening it on first use.

    Raises FileNotFoundError when the database has not been built yet.
    """
    global _CONN
    if _CONN is not None:
        return _CONN
    if not DB_PATH.exists():
        raise FileNotFoundError(
            f"Base SQLite introuvable : {DB_PATH}. "
            "Lancez d'abord : python -m pipeline.referentials --build"
        )
    # mode=ro: the validation API never writes. NOTE(review):
    # check_same_thread=False shares one connection across threads — safe only
    # if callers do not interleave cursors concurrently; confirm usage.
    _CONN = sqlite3.connect(f"file:{DB_PATH}?mode=ro", uri=True, check_same_thread=False)
    return _CONN
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# API publique de validation
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
@lru_cache(maxsize=8192)
def is_valid_cim10(code: str) -> bool:
    """Check that a CIM-10 code exists in the 2018 referential (2019 substitute).

    The input is normalised first (dot, spaces and PMSI suffixes stripped);
    anything not matching the CIM-10 shape is rejected without a DB lookup.
    """
    norm = _normalize_cim10(code)
    if not norm or not _RE_CIM10.match(norm):
        return False
    cur = _get_conn().execute("SELECT 1 FROM cim10 WHERE code = ? LIMIT 1", (norm,))
    return cur.fetchone() is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=8192)
def is_valid_ccam(code: str) -> bool:
    """Check that a CCAM code exists in the CCAM PMSI 2018 (extension stripped)."""
    norm = _normalize_ccam(code)
    if not norm or not _RE_CCAM.match(norm):
        return False
    cur = _get_conn().execute("SELECT 1 FROM ccam WHERE code = ? LIMIT 1", (norm,))
    return cur.fetchone() is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=8192)
def is_valid_ghm(code: str) -> bool:
    """Check that a GHM code exists in the V2018 classification."""
    norm = _normalize_ghm(code)
    if not norm or not _RE_GHM.match(norm):
        return False
    cur = _get_conn().execute("SELECT 1 FROM ghm WHERE code = ? LIMIT 1", (norm,))
    return cur.fetchone() is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=8192)
def is_valid_ghs(code: str) -> bool:
    """Check that a GHS number appears in the February 2018 tariff decree."""
    norm = _normalize_ghs(code)
    if not norm or not _RE_GHS.match(norm):
        return False
    # A GHS is valid if it appears in any GHM→GHS row, either sector.
    cur = _get_conn().execute(
        "SELECT 1 FROM ghm_ghs WHERE ghs = ? LIMIT 1", (norm,)
    )
    return cur.fetchone() is not None
|
||||
|
||||
|
||||
@lru_cache(maxsize=4096)
def get_cim10_libelle(code: str) -> str | None:
    """Return the official label for a CIM-10 code, or None if unknown."""
    norm = _normalize_cim10(code)
    if not norm:
        return None
    cur = _get_conn().execute(
        "SELECT libelle FROM cim10 WHERE code = ? LIMIT 1", (norm,)
    )
    row = cur.fetchone()
    return row[0] if row else None
|
||||
|
||||
|
||||
def ghm_to_ghs(ghm: str) -> list[str]:
    """Return the possible GHS numbers for a GHM (public and private merged).

    Used to check the coherence of an extracted (GHM, GHS) pair.
    """
    norm = _normalize_ghm(ghm)
    if not norm:
        return []
    cur = _get_conn().execute(
        "SELECT DISTINCT ghs FROM ghm_ghs WHERE ghm = ?", (norm,)
    )
    return [r[0] for r in cur.fetchall()]
|
||||
|
||||
|
||||
def _levenshtein(a: str, b: str) -> int:
    """Edit distance between two code strings (rapidfuzz when available)."""
    if _HAS_RAPIDFUZZ:
        return _Lev.distance(a, b)
    # Pure-Python fallback, O(len(a)*len(b)) — fine for short medical codes.
    # Classic two-row dynamic programming over the shorter string.
    if len(a) < len(b):
        a, b = b, a
    if not b:
        return len(a)
    previous = list(range(len(b) + 1))
    for row_idx, char_a in enumerate(a, 1):
        current = [row_idx]
        for col_idx, char_b in enumerate(b, 1):
            insertion = current[col_idx - 1] + 1
            deletion = previous[col_idx] + 1
            substitution = previous[col_idx - 1] + (char_a != char_b)
            current.append(min(insertion, deletion, substitution))
        previous = current
    return previous[-1]
|
||||
|
||||
|
||||
def nearest_cim10(code: str, max_distance: int = 1) -> str | None:
    """Find the closest valid CIM-10 code by Levenshtein distance.

    Useful to repair common OCR confusions (O/0, I/1, B/8…).
    Tie-breaking when several candidates share the minimal distance:
    1. prefer a candidate of the same length (substitution over deletion);
    2. then ascending lexicographic order.
    Returns None when no code lies within ``max_distance`` edits.
    """
    norm = _normalize_cim10(code)
    if not norm:
        return None
    # Already valid: the code itself is its own best match.
    if is_valid_cim10(norm):
        return norm
    conn = _get_conn()
    length = len(norm)
    # Pre-filter by length: a candidate differing by more than max_distance
    # in length cannot be within max_distance edits.
    cur = conn.execute(
        "SELECT code FROM cim10 WHERE length(code) BETWEEN ? AND ?",
        (length - max_distance, length + max_distance),
    )
    candidates: list[tuple[int, int, str]] = []  # (distance, |len_diff|, code)
    for (cand,) in cur:
        d = _levenshtein(norm, cand)
        if d <= max_distance:
            candidates.append((d, abs(len(cand) - length), cand))
    if not candidates:
        return None
    # Sort: minimal distance, then closest length, then lexicographic.
    candidates.sort(key=lambda t: (t[0], t[1], t[2]))
    return candidates[0][2]
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Tests légers (exécutables sans pytest)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _run_selftest() -> int:
    """Quick smoke tests against the built database. Returns the failure count."""
    failures = 0

    def check(label: str, cond: bool, detail: str = "") -> None:
        # Print one PASS/FAIL line and tally failures in the enclosing scope.
        nonlocal failures
        status = "OK  " if cond else "FAIL"
        print(f" [{status}] {label}{(' — ' + detail) if detail else ''}")
        if not cond:
            failures += 1

    print("=== Tests référentiels ATIH 2018 ===")

    # CIM-10
    check("CIM-10 K650 valide (péritonite)", is_valid_cim10("K650"))
    check("CIM-10 K65.0 (avec point) valide", is_valid_cim10("K65.0"))
    check("CIM-10 T814 valide", is_valid_cim10("T814"))
    check("CIM-10 ZZZ99 invalide", not is_valid_cim10("ZZZ99"))
    check("CIM-10 libellé K650", get_cim10_libelle("K650") is not None,
          detail=str(get_cim10_libelle("K650")))
    # OCR correction: K65O (letter O) → K650
    suggestion = nearest_cim10("K65O")
    check("CIM-10 nearest(K65O) = K650", suggestion == "K650",
          detail=f"got={suggestion}")

    # CCAM
    check("CCAM EBFA012 valide", is_valid_ccam("EBFA012"))
    check("CCAM EBFA012-1 (ext PMSI) valide", is_valid_ccam("EBFA012-1"))
    check("CCAM AAAA000 invalide", not is_valid_ccam("AAAA000"))

    # GHM
    check("GHM 01C031 valide", is_valid_ghm("01C031"))
    check("GHM 99Z99Z invalide", not is_valid_ghm("99Z99Z"))

    # GHS
    check("GHS 22 valide", is_valid_ghs("22"))
    check("GHS 99999 invalide", not is_valid_ghs("99999"))

    # GHM→GHS mapping
    ghs_list = ghm_to_ghs("01C031")
    check("GHM 01C031 → GHS inclut 22", "22" in ghs_list,
          detail=f"ghs_list={ghs_list}")

    # Invalid-format robustness
    check("is_valid_cim10('') = False", not is_valid_cim10(""))
    check("is_valid_ccam(None cast) = False", not is_valid_ccam(""))

    print(f"=== Résultat : {failures} échec(s) ===")
    return failures
|
||||
|
||||
|
||||
def _cli() -> int:
    """Command-line entry point: --build / --test / --stats (mutually exclusive).

    Returns a process exit code (0 on success, 1 when self-tests fail).
    """
    parser = argparse.ArgumentParser(description="Référentiels ATIH 2018")
    g = parser.add_mutually_exclusive_group(required=True)
    g.add_argument("--build", action="store_true",
                   help="(Re)construit la base SQLite depuis referentials/sources/")
    g.add_argument("--test", action="store_true",
                   help="Exécute les tests de fumée")
    g.add_argument("--stats", action="store_true",
                   help="Affiche les comptages de la base")
    args = parser.parse_args()

    if args.build:
        print(f"Construction de {DB_PATH} depuis {SOURCES_DIR}...")
        counts = build_database()
        print("OK :", counts)
        return 0
    if args.test:
        # Collapse the failure count to a conventional exit code.
        return 1 if _run_selftest() > 0 else 0
    if args.stats:
        conn = _get_conn()
        for tbl in ("cim10", "ccam", "ghm", "ghm_ghs"):
            n = conn.execute(f"SELECT COUNT(*) FROM {tbl}").fetchone()[0]
            print(f" {tbl:10s}: {n}")
        print("Metadata :")
        for k, v in conn.execute("SELECT key, value FROM metadata"):
            print(f" {k}: {v}")
        return 0
    return 0
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Module doubles as a CLI: --build / --test / --stats (see _cli).
    sys.exit(_cli())
|
||||
217
pipeline/validation.py
Normal file
217
pipeline/validation.py
Normal file
@@ -0,0 +1,217 @@
|
||||
"""Validation ATIH des codes extraits.
|
||||
|
||||
Prend un JSON d'extraction produit par `pipeline/extract.py` et l'enrichit
|
||||
d'une section `_validation` par champ de code médical (CIM-10, CCAM, GHM, GHS)
|
||||
avec :
|
||||
|
||||
- `valid` : le code existe dans le référentiel ATIH 2018
|
||||
- `suggestion` : si invalide, code le plus proche par Levenshtein ≤ 1 (CIM-10)
|
||||
- `libelle_ref` : libellé officiel ATIH (CIM-10) pour audit
|
||||
|
||||
Plus des cross-checks (GHS ∈ ghm_to_ghs(GHM)) pour détecter des incohérences
|
||||
de groupage.
|
||||
|
||||
Principes :
|
||||
- Lecture seule sur le JSON source — on produit une COPIE enrichie.
|
||||
- Ne supprime / ne corrige RIEN automatiquement ; seule une suggestion est
|
||||
annotée. La correction reste à la discrétion d'un humain (overlay) ou d'un
|
||||
prochain pass automatique.
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
from copy import deepcopy
|
||||
from typing import Any
|
||||
|
||||
from .referentials import (
|
||||
get_cim10_libelle,
|
||||
ghm_to_ghs,
|
||||
is_valid_ccam,
|
||||
is_valid_cim10,
|
||||
is_valid_ghm,
|
||||
is_valid_ghs,
|
||||
nearest_cim10,
|
||||
)
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Helpers
|
||||
# ============================================================
|
||||
|
||||
def _check_cim10(code: str) -> dict:
    """Validate one CIM-10 code; attach the official label or an OCR suggestion.

    Returns {"code", "valid"} where valid is None for an empty field; valid
    entries also carry "libelle_ref", invalid ones may carry "suggestion"
    and "suggestion_libelle" (closest code at Levenshtein ≤ 1).
    """
    code = (code or "").strip()
    if not code:
        return {"code": "", "valid": None}
    if is_valid_cim10(code):
        return {"code": code, "valid": True, "libelle_ref": get_cim10_libelle(code)}
    entry = {"code": code, "valid": False}
    sug = nearest_cim10(code, max_distance=1)
    if sug:
        entry["suggestion"] = sug
        entry["suggestion_libelle"] = get_cim10_libelle(sug)
    return entry
|
||||
|
||||
|
||||
def _check_ccam(code: str) -> dict:
    """Validate one CCAM act code against the 2018 referential."""
    stripped = (code or "").strip()
    if not stripped:
        return {"code": "", "valid": None}
    return {"code": stripped, "valid": is_valid_ccam(stripped)}
|
||||
|
||||
|
||||
def _check_ghm(code: str) -> dict:
    """Validate a GHM code; a valid code also gets its possible GHS list."""
    cleaned = (code or "").strip()
    if not cleaned:
        return {"code": "", "valid": None}

    result = {"code": cleaned, "valid": is_valid_ghm(cleaned)}
    if result["valid"]:
        # Attach the candidate GHS so downstream cross-checks are cheap.
        result["ghs_possibles"] = ghm_to_ghs(cleaned)
    return result
|
||||
|
||||
|
||||
def _check_ghs(code: str) -> dict:
    """Validate a GHS number; `valid` is None when the field is empty."""
    cleaned = (code or "").strip()
    if cleaned:
        return {"code": cleaned, "valid": is_valid_ghs(cleaned)}
    return {"code": "", "valid": None}
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Validation d'un bloc codage (etab ou reco)
|
||||
# ============================================================
|
||||
|
||||
def _validate_codage(codage: dict) -> dict:
    """Validate a `codage_etab` / `codage_reco` block (DP, DR, DAS list).

    Non-dict input yields {} so a malformed extraction never crashes the pass.
    """
    if not isinstance(codage, dict):
        return {}

    checked = {
        "dp": _check_cim10(codage.get("dp", "")),
        "dr": _check_cim10(codage.get("dr", "")),
    }
    das = codage.get("das") or []
    if isinstance(das, list):
        # DAS entries may be plain strings or {"code": ...} dicts.
        checked["das"] = [
            _check_cim10(item.get("code", "")) if isinstance(item, dict)
            else _check_cim10(str(item))
            for item in das
        ]
    return checked
|
||||
|
||||
|
||||
def _validate_actes(actes: Any) -> list[dict]:
    """Validate a list of CCAM acts (plain strings or {"code": ...} dicts)."""
    if not isinstance(actes, list):
        return []
    results = []
    for acte in actes:
        raw = acte.get("code", "") if isinstance(acte, dict) else str(acte)
        results.append(_check_ccam(raw))
    return results
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Cross-checks GHM ↔ GHS
|
||||
# ============================================================
|
||||
|
||||
def _cross_check_ghm_ghs(ghm: str, ghs: str) -> dict:
    """Check that the observed GHS belongs to the GHS set allowed by the GHM.

    Returns {"checked": False, "reason": ...} when the check cannot be run,
    otherwise a dict with `coherent` plus the extracted and expected GHS.
    """
    ghm = (ghm or "").strip()
    ghs = (ghs or "").strip()
    if not (ghm and ghs):
        return {"checked": False, "reason": "ghm ou ghs manquant"}
    if not is_valid_ghm(ghm):
        return {"checked": False, "reason": "GHM invalide"}

    possibles = ghm_to_ghs(ghm)
    # Loose suffix match too, in case one of the two codes was truncated.
    coherent = ghs in possibles
    if not coherent:
        coherent = any(p.endswith(ghs) or ghs.endswith(p) for p in possibles)
    return {
        "checked": True,
        "coherent": coherent,
        "ghs_extrait": ghs,
        "ghs_possibles": possibles,
    }
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Point d'entrée
|
||||
# ============================================================
|
||||
|
||||
def validate_recueil(recueil: dict) -> dict:
    """Build the full `_validation` block for the 'recueil' page.

    Validates both coding sides (etab / reco), all acts, the GHM/GHS pairs,
    runs the GHM<->GHS cross-checks, and appends an aggregate `summary`.
    """
    # NOTE: insertion order is deliberate — it is preserved in the JSON output.
    result: dict = {}
    result["codage_etab"] = _validate_codage(recueil.get("codage_etab", {}))
    result["codage_reco"] = _validate_codage(recueil.get("codage_reco", {}))
    result["actes_etab"] = _validate_actes(recueil.get("actes_etab", []))
    result["actes_reco"] = _validate_actes(recueil.get("actes_reco", []))
    result["ghm_etab"] = _check_ghm(recueil.get("ghm_etab", ""))
    result["ghs_etab"] = _check_ghs(recueil.get("ghs_etab", ""))
    result["ghm_reco"] = _check_ghm(recueil.get("ghm_reco", ""))
    result["ghs_reco"] = _check_ghs(recueil.get("ghs_reco", ""))
    result["cross_checks"] = {
        side: _cross_check_ghm_ghs(
            recueil.get(f"ghm_{side}", ""), recueil.get(f"ghs_{side}", ""))
        for side in ("etab", "reco")
    }
    result["summary"] = _summarize(result)
    return result
|
||||
|
||||
|
||||
def _summarize(validation: dict) -> dict:
|
||||
"""Compte les codes valides / invalides dans une section _validation."""
|
||||
valid, invalid, empty = 0, 0, 0
|
||||
|
||||
def _count_entry(e):
|
||||
nonlocal valid, invalid, empty
|
||||
if e.get("valid") is True: valid += 1
|
||||
elif e.get("valid") is False: invalid += 1
|
||||
else: empty += 1
|
||||
|
||||
for section in ("codage_etab", "codage_reco"):
|
||||
sec = validation.get(section, {}) or {}
|
||||
_count_entry(sec.get("dp", {}))
|
||||
_count_entry(sec.get("dr", {}))
|
||||
for d in sec.get("das", []) or []:
|
||||
_count_entry(d)
|
||||
for actes_key in ("actes_etab", "actes_reco"):
|
||||
for a in validation.get(actes_key, []) or []:
|
||||
_count_entry(a)
|
||||
for g in ("ghm_etab", "ghs_etab", "ghm_reco", "ghs_reco"):
|
||||
_count_entry(validation.get(g, {}))
|
||||
|
||||
cc = validation.get("cross_checks", {})
|
||||
incoherent = sum(1 for v in cc.values() if v.get("checked") and not v.get("coherent"))
|
||||
return {
|
||||
"valid": valid, "invalid": invalid, "empty": empty,
|
||||
"total_codes": valid + invalid,
|
||||
"ghm_ghs_incoherents": incoherent,
|
||||
}
|
||||
|
||||
|
||||
def annotate(extraction: dict) -> dict:
    """Return a deep COPY of *extraction* enriched with `_validation` blocks.

    The input is never mutated and no extracted value is altered or removed:
    validation results are purely additive annotations per structured page.
    """
    annotated = deepcopy(extraction)
    ext = annotated.get("extraction") or {}

    recueil = ext.get("recueil")
    if isinstance(recueil, dict):
        recueil["_validation"] = validate_recueil(recueil)

    # Concertation 2 page carries three GHS fields to validate.
    concertation = ext.get("concertation_2")
    if isinstance(concertation, dict):
        concertation["_validation"] = {
            key: _check_ghs(concertation.get(key, ""))
            for key in ("ghs_initial", "ghs_avant_concertation", "ghs_final")
        }
    return annotated
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Quick smoke test: annotate one extraction JSON and print its summary.
    import json
    import sys

    path = sys.argv[1] if len(sys.argv) > 1 else "output/v2/OGC 7.json"
    # Explicit encoding: the extractions contain accented French; the default
    # locale encoding is not UTF-8 on every platform (e.g. Windows cp1252).
    with open(path, encoding="utf-8") as f:
        data = json.load(f)
    annotated = annotate(data)
    rec_v = annotated["extraction"]["recueil"]["_validation"]
    print(json.dumps(rec_v["summary"], indent=2))
    print("\ncross_checks:", json.dumps(rec_v["cross_checks"], indent=2, ensure_ascii=False))
|
||||
BIN
referentials/atih_2018.sqlite
Normal file
BIN
referentials/atih_2018.sqlite
Normal file
Binary file not shown.
BIN
referentials/sources/ccam_2018_v5.xlsx
Normal file
BIN
referentials/sources/ccam_2018_v5.xlsx
Normal file
Binary file not shown.
BIN
referentials/sources/cim.json.gz
Normal file
BIN
referentials/sources/cim.json.gz
Normal file
Binary file not shown.
BIN
referentials/sources/cim10_claml_2019.zip
Normal file
BIN
referentials/sources/cim10_claml_2019.zip
Normal file
Binary file not shown.
283
referentials/sources/cim10_claml_2019_extracted/ClaML.dtd
Normal file
283
referentials/sources/cim10_claml_2019_extracted/ClaML.dtd
Normal file
@@ -0,0 +1,283 @@
|
||||
<?xml version="1.0" encoding="UTF-8"?>
|
||||
<!ENTITY % rubric.simple "#PCDATA | Reference | Term">
|
||||
<!ENTITY % rubric.complex "%rubric.simple; | Para | Include |
|
||||
IncludeDescendants| Fragment | List | Table">
|
||||
|
||||
<!ELEMENT ClaML (
|
||||
Meta*,
|
||||
Identifier*,
|
||||
Title,
|
||||
Authors?,
|
||||
Variants?,
|
||||
ClassKinds,
|
||||
UsageKinds?,
|
||||
RubricKinds,
|
||||
Modifier*,
|
||||
ModifierClass*,
|
||||
Class*)
|
||||
>
|
||||
<!ATTLIST ClaML
|
||||
version CDATA #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT Variants (Variant+)>
|
||||
<!ELEMENT Variant (#PCDATA)>
|
||||
<!ATTLIST Variant
|
||||
name ID #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT Meta EMPTY>
|
||||
<!ATTLIST Meta
|
||||
name CDATA #REQUIRED
|
||||
value CDATA #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Identifier EMPTY>
|
||||
<!ATTLIST Identifier
|
||||
authority NMTOKEN #IMPLIED
|
||||
uid CDATA #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT Title (#PCDATA)>
|
||||
<!ATTLIST Title
|
||||
name NMTOKEN #REQUIRED
|
||||
version CDATA #IMPLIED
|
||||
date CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Authors (Author* )>
|
||||
<!ELEMENT Author (#PCDATA)>
|
||||
<!ATTLIST Author
|
||||
name ID #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT ClassKinds (ClassKind+)>
|
||||
<!ELEMENT RubricKinds (RubricKind+)>
|
||||
<!ELEMENT UsageKinds (UsageKind+)>
|
||||
|
||||
<!ELEMENT ClassKind (Display*)>
|
||||
<!ATTLIST ClassKind
|
||||
name ID #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT RubricKind (Display*)>
|
||||
<!ATTLIST RubricKind
|
||||
name ID #REQUIRED
|
||||
inherited (true|false) "true"
|
||||
>
|
||||
|
||||
<!ELEMENT UsageKind EMPTY>
|
||||
<!ATTLIST UsageKind
|
||||
name ID #REQUIRED
|
||||
mark CDATA #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT Display (#PCDATA)>
|
||||
<!ATTLIST Display
|
||||
xml:lang NMTOKEN #REQUIRED
|
||||
variants IDREF #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Modifier (
|
||||
Meta*,
|
||||
SubClass*,
|
||||
Rubric*,
|
||||
History*)
|
||||
>
|
||||
<!ATTLIST Modifier
|
||||
code NMTOKEN #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT ModifierClass (
|
||||
Meta*,
|
||||
SuperClass,
|
||||
SubClass*,
|
||||
Rubric*,
|
||||
History*)
|
||||
>
|
||||
<!ATTLIST ModifierClass
|
||||
modifier NMTOKEN #REQUIRED
|
||||
code NMTOKEN #REQUIRED
|
||||
usage IDREF #IMPLIED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Class (
|
||||
Meta*,
|
||||
SuperClass*,
|
||||
SubClass*,
|
||||
ModifiedBy*,
|
||||
ExcludeModifier*,
|
||||
Rubric*,
|
||||
History*)
|
||||
>
|
||||
<!ATTLIST Class
|
||||
code CDATA #REQUIRED
|
||||
kind IDREF #REQUIRED
|
||||
usage IDREF #IMPLIED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT ModifiedBy (
|
||||
Meta*,
|
||||
ValidModifierClass*)
|
||||
>
|
||||
<!ATTLIST ModifiedBy
|
||||
code NMTOKEN #REQUIRED
|
||||
all (true|false) "true"
|
||||
position CDATA #IMPLIED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT ExcludeModifier EMPTY>
|
||||
<!ATTLIST ExcludeModifier
|
||||
code NMTOKEN #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT ValidModifierClass EMPTY>
|
||||
<!ATTLIST ValidModifierClass
|
||||
code NMTOKEN #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Rubric (
|
||||
Label+,
|
||||
History*)
|
||||
>
|
||||
<!ATTLIST Rubric
|
||||
id ID #IMPLIED
|
||||
kind IDREF #REQUIRED
|
||||
usage IDREF #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Label (%rubric.complex;)*>
|
||||
<!ATTLIST Label
|
||||
xml:lang NMTOKEN #REQUIRED
|
||||
xml:space (default|preserve) "default"
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT History (#PCDATA)>
|
||||
<!ATTLIST History
|
||||
author IDREF #REQUIRED
|
||||
date NMTOKEN #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT SuperClass EMPTY>
|
||||
<!ATTLIST SuperClass
|
||||
code CDATA #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT SubClass EMPTY>
|
||||
<!ATTLIST SubClass
|
||||
code CDATA #REQUIRED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Reference (#PCDATA)>
|
||||
<!ATTLIST Reference
|
||||
class CDATA #IMPLIED
|
||||
authority NMTOKEN #IMPLIED
|
||||
uid NMTOKEN #IMPLIED
|
||||
code CDATA #IMPLIED
|
||||
usage IDREF #IMPLIED
|
||||
variants IDREFS #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Para (%rubric.simple;)*>
|
||||
<!ATTLIST Para
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Fragment (%rubric.simple;)*>
|
||||
<!ATTLIST Fragment
|
||||
class CDATA #IMPLIED
|
||||
usage IDREF #IMPLIED
|
||||
type (item | list) "item"
|
||||
>
|
||||
|
||||
<!ELEMENT Include EMPTY>
|
||||
<!ATTLIST Include
|
||||
class CDATA #IMPLIED
|
||||
rubric IDREF #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT IncludeDescendants EMPTY>
|
||||
<!ATTLIST IncludeDescendants
|
||||
code NMTOKEN #REQUIRED
|
||||
kind IDREF #REQUIRED
|
||||
>
|
||||
|
||||
<!ELEMENT List (ListItem+)>
|
||||
<!ATTLIST List
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT ListItem (
|
||||
%rubric.simple;
|
||||
| Para
|
||||
| Include
|
||||
| List
|
||||
| Table)*
|
||||
>
|
||||
<!ATTLIST ListItem
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Table (
|
||||
Caption?,
|
||||
THead?,
|
||||
TBody?,
|
||||
TFoot?)
|
||||
>
|
||||
<!ATTLIST Table
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Caption (%rubric.simple;)*>
|
||||
<!ATTLIST Caption
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT THead (Row+)>
|
||||
<!ATTLIST THead
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT TBody (Row+)>
|
||||
<!ATTLIST TBody
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT TFoot (Row+)>
|
||||
<!ATTLIST TFoot
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Row (Cell*)>
|
||||
<!ATTLIST Row
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Cell (
|
||||
%rubric.simple;
|
||||
| Para
|
||||
| Include
|
||||
| List
|
||||
| Table)*
|
||||
>
|
||||
<!ATTLIST Cell
|
||||
class CDATA #IMPLIED
|
||||
rowspan CDATA #IMPLIED
|
||||
colspan CDATA #IMPLIED
|
||||
>
|
||||
|
||||
<!ELEMENT Term (#PCDATA)>
|
||||
<!ATTLIST Term
|
||||
class CDATA #IMPLIED
|
||||
>
|
||||
|
||||
154659
referentials/sources/cim10_claml_2019_extracted/cim10_claml_2019.xml
Normal file
154659
referentials/sources/cim10_claml_2019_extracted/cim10_claml_2019.xml
Normal file
File diff suppressed because it is too large
Load Diff
BIN
referentials/sources/cim_libelle.json.gz
Normal file
BIN
referentials/sources/cim_libelle.json.gz
Normal file
Binary file not shown.
BIN
referentials/sources/ghm_intermediaire.json.gz
Normal file
BIN
referentials/sources/ghm_intermediaire.json.gz
Normal file
Binary file not shown.
BIN
referentials/sources/ghs_prive.json.gz
Normal file
BIN
referentials/sources/ghs_prive.json.gz
Normal file
Binary file not shown.
BIN
referentials/sources/ghs_public.json.gz
Normal file
BIN
referentials/sources/ghs_public.json.gz
Normal file
Binary file not shown.
BIN
referentials/sources/regroupement_ghm_v2018.xlsx
Normal file
BIN
referentials/sources/regroupement_ghm_v2018.xlsx
Normal file
Binary file not shown.
BIN
referentials/sources/tarif_arrete_fev_2018.xlsx
Normal file
BIN
referentials/sources/tarif_arrete_fev_2018.xlsx
Normal file
Binary file not shown.
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
160
tests/test_referentials.py
Normal file
160
tests/test_referentials.py
Normal file
@@ -0,0 +1,160 @@
|
||||
"""Tests unitaires du module pipeline.referentials.
|
||||
|
||||
Compatible pytest ET exécution directe (`python tests/test_referentials.py`).
|
||||
Nécessite que la base SQLite ait déjà été construite :
|
||||
python -m pipeline.referentials --build
|
||||
"""
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
# Permet l'exécution directe depuis tests/ sans installer le package.
|
||||
sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
|
||||
|
||||
from pipeline.referentials import ( # noqa: E402
|
||||
DB_PATH,
|
||||
ghm_to_ghs,
|
||||
get_cim10_libelle,
|
||||
is_valid_ccam,
|
||||
is_valid_cim10,
|
||||
is_valid_ghm,
|
||||
is_valid_ghs,
|
||||
nearest_cim10,
|
||||
)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CIM-10
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_cim10_codes_valides():
    """A handful of everyday PMSI codes must all be in the referential."""
    common_codes = [
        "K650",  # acute peritonitis
        "T814",  # infection following a procedure
        "I10",   # essential hypertension
        "Z515",  # palliative care
        "C509",  # malignant neoplasm of breast
    ]
    for code in common_codes:
        assert is_valid_cim10(code)
|
||||
|
||||
|
||||
def test_cim10_normalisation():
    """Dotted form (K65.0), lowercase and surrounding spaces are accepted."""
    for variant in ("K65.0", "k650", " K650 "):
        assert is_valid_cim10(variant)
|
||||
|
||||
|
||||
def test_cim10_codes_invalides():
    """Malformed or unknown codes must be rejected."""
    rejected = [
        "",          # empty field
        "ZZZ99",     # unknown code
        "K6501234",  # too long
        "1234",      # does not start with a letter
        "K65X",      # non-numeric suffix
    ]
    for code in rejected:
        assert not is_valid_cim10(code)
|
||||
|
||||
|
||||
def test_cim10_libelle():
    """The official label of K650 mentions peritonitis (accented or not)."""
    lib = get_cim10_libelle("K650")
    # Parenthesised on purpose: the previous `a and b or c` parsed as
    # `(a and b) or c`, so the `is not None` guard did not cover the second
    # membership test and a missing label would raise AttributeError on
    # `None.lower()` instead of failing the assertion.
    assert lib is not None and ("éritonit" in lib.lower() or "peritonit" in lib.lower())
|
||||
|
||||
|
||||
def test_cim10_nearest_correction_ocr():
    """Nearest-neighbour lookup fixes classic OCR confusions."""
    expectations = {
        "K65O": "K650",  # letter O read instead of digit 0
        "K650": "K650",  # already valid: returned unchanged
    }
    for raw, expected in expectations.items():
        assert nearest_cim10(raw) == expected
    # Nothing within edit distance 1 of this garbage.
    assert nearest_cim10("ZZZZZ", max_distance=1) is None
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# CCAM
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_ccam_codes_valides():
    """Known CCAM acts pass, case-insensitively, PMSI extension tolerated."""
    accepted = (
        "EBFA012",
        "HFCC003",    # gastric bypass (listed in CCAM V5)
        "ebfa012",    # case-insensitive
        "EBFA012-1",  # PMSI extension suffix tolerated
    )
    for code in accepted:
        assert is_valid_ccam(code)
|
||||
|
||||
|
||||
def test_ccam_codes_invalides():
    """Unknown or malformed CCAM codes are rejected."""
    for code in ("AAAA000", "", "EBF012"):  # EBF012: 3 letters instead of 4
        assert not is_valid_ccam(code)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GHM
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_ghm_codes_valides():
    """A known GHM is accepted regardless of case."""
    for code in ("01C031", "01c031"):
        assert is_valid_ghm(code)
|
||||
|
||||
|
||||
def test_ghm_codes_invalides():
    """Unknown, empty or malformed GHM codes are rejected."""
    for code in ("99Z99Z", "", "ABCDEF"):
        assert not is_valid_ghm(code)
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# GHS et couplage GHM→GHS
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def test_ghs_valide():
    """GHS numbers accept leading zeros; unknown or empty values fail."""
    for accepted in ("22", "0022"):
        assert is_valid_ghs(accepted)
    for rejected in ("99999", ""):
        assert not is_valid_ghs(rejected)
|
||||
|
||||
|
||||
def test_ghm_to_ghs():
    """GHM 01C031 maps (among others) onto GHS 22; unknown GHM yields []."""
    assert "22" in ghm_to_ghs("01C031")
    assert ghm_to_ghs("99Z99Z") == []
|
||||
|
||||
|
||||
# ---------------------------------------------------------------------------
|
||||
# Exécution directe (sans pytest)
|
||||
# ---------------------------------------------------------------------------
|
||||
|
||||
def _main() -> int:
    """Run every test without pytest; return a shell-style exit code.

    0 = all green, 1 = at least one failure, 2 = SQLite base missing.
    """
    import traceback

    # Display names are derived from __name__, so the list stays in sync.
    tests = [
        test_cim10_codes_valides,
        test_cim10_normalisation,
        test_cim10_codes_invalides,
        test_cim10_libelle,
        test_cim10_nearest_correction_ocr,
        test_ccam_codes_valides,
        test_ccam_codes_invalides,
        test_ghm_codes_valides,
        test_ghm_codes_invalides,
        test_ghs_valide,
        test_ghm_to_ghs,
    ]
    if not DB_PATH.exists():
        print(f"ERREUR : base SQLite manquante ({DB_PATH}).")
        print("Exécute d'abord : python -m pipeline.referentials --build")
        return 2

    failures = 0
    for fn in tests:
        name = fn.__name__
        try:
            fn()
        except AssertionError as e:
            print(f" [FAIL] {name} — {e}")
            failures += 1
        except Exception:
            print(f" [ERR] {name}")
            traceback.print_exc()
            failures += 1
        else:
            print(f" [OK ] {name}")
    print(f"=== {len(tests) - failures}/{len(tests)} tests OK ===")
    return 0 if failures == 0 else 1
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Direct execution without pytest: exit code reflects the test outcome.
    sys.exit(_main())
|
||||
Reference in New Issue
Block a user