feat: pipeline T2A - anonymisation, extraction CIM-10 et intégration edsnlp

Pipeline complet de traitement de documents médicaux PDF :
- Extraction texte (pdfplumber) et classification (Trackare/CRH)
- Anonymisation multi-couche (regex + NER CamemBERT + sweep)
- Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les
  diagnostics, médicaments (codes ATC via Romedi) et négation,
  avec fallback regex pour les patterns spécifiques
- Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
dom
2026-02-10 15:24:12 +01:00
commit 4a12cd2676
25 changed files with 7592 additions and 0 deletions

184
src/main.py Normal file
View File

@@ -0,0 +1,184 @@
"""CLI + orchestrateur du pipeline d'anonymisation et extraction CIM-10."""
from __future__ import annotations
import argparse
import json
import logging
import sys
from pathlib import Path
from .anonymization.anonymizer import Anonymizer
from .config import ANONYMIZED_DIR, REPORTS_DIR, STRUCTURED_DIR, AnonymizationReport, DossierMedical
from .extraction.document_classifier import classify
from .extraction.crh_parser import parse_crh
from .extraction.pdf_extractor import extract_text
from .extraction.trackare_parser import parse_trackare
from .medical.cim10_extractor import extract_medical_info
# Configure logging at import time: INFO level, timestamped records that
# include the level and the logger name.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)
# Global switch for the edsnlp analysis step: process_pdf() skips that step
# when this is False, and main() sets it to False when --no-edsnlp is passed.
_use_edsnlp = True
def process_pdf(pdf_path: Path) -> tuple[str, DossierMedical, AnonymizationReport]:
    """Run the full pipeline on one PDF and return its three results.

    Steps, in order: text extraction, document classification, parsing
    (Trackare parser when the classifier answers ``"trackare"``, CRH parser
    otherwise), anonymization, optional edsnlp analysis, and medical
    (CIM-10) extraction.

    Args:
        pdf_path: Path of the PDF to process. Only its ``name`` is recorded
            in the report and in the dossier.

    Returns:
        A ``(anonymized_text, dossier, report)`` tuple.
    """
    source_name = pdf_path.name
    logger.info("Traitement de %s", source_name)

    # 1. Text extraction
    raw_text = extract_text(pdf_path)
    logger.info(" Texte extrait : %d caractères", len(raw_text))

    # 2. Classification
    doc_type = classify(raw_text)
    logger.info(" Type de document : %s", doc_type)

    # 3. Parsing: pick the parser matching the detected document type
    parse = parse_trackare if doc_type == "trackare" else parse_crh
    parsed = parse(raw_text)

    # 4. Anonymization; the anonymizer exposes its statistics as `.report`
    anonymizer = Anonymizer(parsed_data=parsed)
    anonymized_text = anonymizer.anonymize(raw_text)
    report = anonymizer.report
    report.source_file = source_name
    logger.info(
        " Anonymisation : %d remplacements (regex=%d, ner=%d, sweep=%d)",
        report.total_replacements,
        report.regex_replacements,
        report.ner_replacements,
        report.sweep_replacements,
    )

    # 5. Optional edsnlp analysis (None when disabled or unavailable)
    edsnlp_result = _run_edsnlp(anonymized_text) if _use_edsnlp else None

    # 6. Medical CIM-10 extraction
    dossier = extract_medical_info(parsed, anonymized_text, edsnlp_result)
    dossier.source_file = source_name
    dossier.document_type = doc_type
    logger.info(" DP : %s", dossier.diagnostic_principal)
    logger.info(
        " DAS : %d, Actes : %d",
        len(dossier.diagnostics_associes),
        len(dossier.actes_ccam),
    )

    return anonymized_text, dossier, report
def _run_edsnlp(text: str):
    """Run the edsnlp analysis on ``text``, degrading gracefully on failure.

    The edsnlp pipeline module is imported lazily inside the ``try`` block,
    so an import failure is handled like any other error.

    Args:
        text: Text to analyze (the caller passes the anonymized text).

    Returns:
        The object returned by ``analyze(text)``, or ``None`` when
        ``is_available()`` is false or when any exception is raised.
    """
    try:
        from .medical.edsnlp_pipeline import analyze, is_available

        if is_available():
            analysis = analyze(text)
            logger.info(
                " edsnlp : %d CIM-10, %d médicaments, %d dates",
                len(analysis.cim10_entities),
                len(analysis.drug_entities),
                len(analysis.date_entities),
            )
            return analysis

        logger.info(" edsnlp non disponible, utilisation du mode regex seul")
        return None
    except Exception:
        # Best-effort step: log the traceback and let the caller carry on
        # without an edsnlp result.
        logger.warning(" edsnlp : erreur lors de l'analyse, fallback regex", exc_info=True)
        return None
def write_outputs(
    stem: str,
    anonymized_text: str,
    dossier: DossierMedical,
    report: AnonymizationReport,
) -> None:
    """Write the three output files produced for one processed PDF.

    Files written, all encoded as UTF-8:
      * ``ANONYMIZED_DIR/<stem>_anonymized.txt``: the anonymized text.
      * ``STRUCTURED_DIR/<stem>_cim10.json``: ``dossier`` serialized with
        ``model_dump_json(indent=2, exclude_none=True)``.
      * ``REPORTS_DIR/<stem>_report.json``: ``report`` serialized with
        ``model_dump_json(indent=2)``.

    Each target directory is created beforehand if it is missing, so a run
    against a tree without the output folders does not fail at write time.
    The path of every written file is logged at INFO level.

    Args:
        stem: Base name used to build the three output file names.
        anonymized_text: Text to store in the ``.txt`` output.
        dossier: Structured medical record to serialize.
        report: Anonymization report to serialize.

    Raises:
        OSError: If a directory cannot be created or a file cannot be written.
    """
    # Anonymized text
    anon_path = ANONYMIZED_DIR / f"{stem}_anonymized.txt"
    # Nothing in this module creates the output directories; mkdir is
    # idempotent, so it is harmless if they already exist.
    anon_path.parent.mkdir(parents=True, exist_ok=True)
    anon_path.write_text(anonymized_text, encoding="utf-8")
    logger.info("%s", anon_path)

    # Structured JSON
    json_path = STRUCTURED_DIR / f"{stem}_cim10.json"
    json_path.parent.mkdir(parents=True, exist_ok=True)
    json_path.write_text(
        dossier.model_dump_json(indent=2, exclude_none=True),
        encoding="utf-8",
    )
    logger.info("%s", json_path)

    # Anonymization report
    report_path = REPORTS_DIR / f"{stem}_report.json"
    report_path.parent.mkdir(parents=True, exist_ok=True)
    report_path.write_text(
        report.model_dump_json(indent=2),
        encoding="utf-8",
    )
    logger.info("%s", report_path)
def main(input_path: str | None = None) -> None:
    """Command-line entry point: parse options, then process every PDF.

    Arguments are read from ``sys.argv`` through ``argparse``. The positional
    ``input`` may be a single file or a directory; in the directory case the
    ``*.pdf`` entries directly inside it are processed in sorted order.

    Options:
        --no-ner: replace ``ner_anonymizer.extract_person_entities`` with a
            function returning an empty list.
        --no-edsnlp: set the module-level ``_use_edsnlp`` flag to False.

    A failure on one PDF is logged with its traceback and does not stop the
    remaining files from being processed.

    Args:
        input_path: Default for the positional ``input`` argument when it is
            not given on the command line; falls back to ``"input/"``.

    Raises:
        SystemExit: With status 1 when the path is neither a file nor a
            directory, and with status 0 when no PDF is found.
    """
    global _use_edsnlp

    cli = argparse.ArgumentParser(
        description="Anonymisation de documents médicaux PDF et extraction CIM-10",
    )
    cli.add_argument(
        "input",
        nargs="?",
        default=input_path or "input/",
        help="Chemin vers un PDF ou un dossier de PDFs (défaut: input/)",
    )
    cli.add_argument(
        "--no-ner",
        action="store_true",
        help="Désactiver la phase NER (plus rapide, moins précis)",
    )
    cli.add_argument(
        "--no-edsnlp",
        action="store_true",
        help="Désactiver l'analyse edsnlp (mode regex seul)",
    )
    options = cli.parse_args()

    if options.no_ner:
        # Monkey-patch the NER module so that it reports no person entity.
        # NOTE(review): this only takes effect if the anonymizer looks the
        # function up on the module at call time - confirm.
        from .anonymization import ner_anonymizer

        ner_anonymizer.extract_person_entities = lambda text: []

    if options.no_edsnlp:
        _use_edsnlp = False

    target = Path(options.input)
    single_file = target.is_file()
    if not single_file and not target.is_dir():
        logger.error("Chemin introuvable : %s", target)
        sys.exit(1)

    pdf_files = [target] if single_file else sorted(target.glob("*.pdf"))
    if not pdf_files:
        logger.warning("Aucun PDF trouvé dans %s", target)
        sys.exit(0)

    logger.info("Traitement de %d PDF(s)...", len(pdf_files))
    for pdf_file in pdf_files:
        try:
            results = process_pdf(pdf_file)
            # Spaces in the file name are replaced to build the output stem.
            write_outputs(pdf_file.stem.replace(" ", "_"), *results)
        except Exception:
            logger.exception("Erreur lors du traitement de %s", pdf_file.name)

    logger.info("Terminé.")
# Run the command-line interface when the module is executed as a script.
if __name__ == "__main__":
    main()