feat: pipeline T2A - anonymisation, extraction CIM-10 et intégration edsnlp
Pipeline complet de traitement de documents médicaux PDF : - Extraction texte (pdfplumber) et classification (Trackare/CRH) - Anonymisation multi-couche (regex + NER CamemBERT + sweep) - Extraction médicale CIM-10 hybride : edsnlp (AP-HP) enrichit les diagnostics, médicaments (codes ATC via Romedi) et négation, avec fallback regex pour les patterns spécifiques - Fix sentencepiece pinné à <0.2.0 pour compatibilité CamemBERT Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
8
.gitignore
vendored
Normal file
8
.gitignore
vendored
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
.venv/
|
||||||
|
__pycache__/
|
||||||
|
*.pyc
|
||||||
|
.pytest_cache/
|
||||||
|
.hypothesis/
|
||||||
|
output/
|
||||||
|
input/
|
||||||
|
*.egg-info/
|
||||||
254
analyze_pdfs.py
Normal file
254
analyze_pdfs.py
Normal file
@@ -0,0 +1,254 @@
|
|||||||
|
#!/usr/bin/env python3
"""Detailed structural analysis of the PDFs under /home/dom/ai/t2a/input/.

Uses pdfplumber to extract text, tables, section headers and potential
personal data, and writes a Markdown report.
"""

import os
import re

import pdfplumber

# Input directory scanned for *.pdf files and output path of the report.
INPUT_DIR = "/home/dom/ai/t2a/input/"
REPORT_FILE = "/home/dom/ai/t2a/rapport_analyse_pdfs.md"
|
||||||
|
|
||||||
|
# Heuristic regexes used to flag potential personal data in extracted text.
# NOTE: these are deliberately broad (e.g. "telephone" also matches other
# digit runs); matches are reviewed in the generated report, not auto-redacted.
PATTERNS = {
    # Phone numbers, optionally prefixed with a country code.
    "telephone": re.compile(r'(?:\+?\d{1,3}[\s.-]?)?\(?\d{2,4}\)?[\s.-]?\d{2,4}[\s.-]?\d{2,4}[\s.-]?\d{0,4}'),
    # Plain e-mail addresses.
    "email": re.compile(r'[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}'),
    # French postal codes (exactly five digits).
    "code_postal": re.compile(r'\b\d{5}\b'),
    # Hospital record numbers (7 to 10 digits).
    "numero_dossier": re.compile(r'\b\d{7,10}\b'),
    # Numeric dates such as 12/05/2023, 1-2-24, 01.02.2024.
    "date": re.compile(r'\b\d{1,2}[/.-]\d{1,2}[/.-]\d{2,4}\b'),
    # Monetary amounts in euros ("120 €" or "120 EUR").
    "montant_euro": re.compile(r'\d+[\s.,]?\d*\s*[€]|\d+[\s.,]?\d*\s*EUR'),
}
|
||||||
|
|
||||||
|
def analyze_pdf(filepath):
    """Run a full structural analysis of a single PDF.

    Extracts per-page statistics, tables, the concatenated page text,
    heuristic section headers and potential personal data (see PATTERNS).

    Args:
        filepath: path of the PDF file to analyse.

    Returns:
        dict with keys: filename, filepath, pages, tables_all, full_text,
        headers_detected, personal_data, metadata.
    """
    result = {
        "filename": os.path.basename(filepath),
        "filepath": filepath,
        "pages": [],
        "tables_all": [],
        "full_text": "",
        "headers_detected": [],
        "personal_data": {},
        "metadata": {},
    }

    # Collect page texts and join once at the end: avoids the O(n^2) cost of
    # repeated `+=` string concatenation on large documents.
    text_parts = []

    with pdfplumber.open(filepath) as pdf:
        result["metadata"] = {
            "num_pages": len(pdf.pages),
            "pdf_metadata": pdf.metadata if pdf.metadata else {},
        }

        for i, page in enumerate(pdf.pages):
            text = page.extract_text() or ""
            page_info = {
                "page_num": i + 1,
                "width": page.width,
                "height": page.height,
                "text": text,
                "tables": [],
                "lines_count": len(text.split('\n')) if text else 0,
                # Raw object counts exposed by pdfplumber for this page.
                "chars_count": len(page.chars) if page.chars else 0,
                "rects_count": len(page.rects) if page.rects else 0,
                "images_count": len(page.images) if page.images else 0,
            }

            for t_idx, table in enumerate(page.extract_tables() or []):
                table_info = {
                    "table_index": t_idx,
                    "page": i + 1,
                    "rows": len(table),
                    "cols": max(len(row) for row in table) if table else 0,
                    "data": table,
                    "header_row": table[0] if table else [],
                }
                page_info["tables"].append(table_info)
                result["tables_all"].append(table_info)

            result["pages"].append(page_info)
            text_parts.append(f"\n--- PAGE {i+1} ---\n{text}\n")

    result["full_text"] = "".join(text_parts)

    # Heuristic header detection: fully upper-case lines, or short
    # "Label: value" lines starting with a capital letter.
    for line in result["full_text"].split('\n'):
        stripped = line.strip()
        if not stripped or stripped.startswith("--- PAGE"):
            continue
        if len(stripped) >= 3 and stripped == stripped.upper() and any(c.isalpha() for c in stripped):
            result["headers_detected"].append(stripped)
        elif len(stripped) < 80 and stripped[0].isupper() and ':' in stripped:
            result["headers_detected"].append(stripped)

    # Personal-data detection. sorted() makes the stored lists deterministic;
    # the previous list(set(...)) ordering varied between interpreter runs.
    for pattern_name, pattern in PATTERNS.items():
        matches = pattern.findall(result["full_text"])
        unique_matches = sorted({m.strip() for m in matches if len(m.strip()) > 3})
        if unique_matches:
            result["personal_data"][pattern_name] = unique_matches

    return result
||||||
|
|
||||||
|
|
||||||
|
def format_table_for_md(table_data, max_rows=30):
    """Render a pdfplumber table (list of rows) as a Markdown table.

    The first row is used as the header. Rows beyond *max_rows* are
    replaced by a short "... N lignes supplementaires" note.
    """
    if not table_data:
        return "_Tableau vide_"

    width = max(len(row) for row in table_data)

    def clean(cell):
        # Flatten newlines and escape pipes so each cell stays on one
        # Markdown row; None cells become empty strings.
        if cell is None:
            return ""
        return str(cell).replace('\n', ' ').replace('|', '/').strip()

    # Pad every row to the widest row, then normalise each cell.
    rows = [
        [clean(c) for c in (list(raw) + [None] * (width - len(raw)))]
        for raw in table_data[:max_rows]
    ]

    out = [
        "| " + " | ".join(rows[0]) + " |",
        "| " + " | ".join(["---"] * width) + " |",
    ]
    out.extend("| " + " | ".join(r) + " |" for r in rows[1:])

    hidden = len(table_data) - max_rows
    if hidden > 0:
        out.append(f"\n_... ({hidden} lignes supplementaires non affichees)_")

    return "\n".join(out)
||||||
|
|
||||||
|
|
||||||
|
def generate_report(analyses):
    """Build the full Markdown report for a list of analyze_pdf() results."""
    report = []
    add = report.append  # bound once; the function is append-heavy

    add("# Rapport d'analyse structurelle des PDFs")
    add(f"\n**Repertoire analyse :** `{INPUT_DIR}`")
    add(f"**Nombre de fichiers :** {len(analyses)}")
    add("")

    for idx, analysis in enumerate(analyses, 1):
        add(f"\n{'='*80}")
        add(f"## {idx}. {analysis['filename']}")
        add(f"{'='*80}\n")

        # --- PDF metadata ---
        meta = analysis["metadata"]
        add("### Metadonnees du PDF")
        add(f"- **Nombre de pages :** {meta['num_pages']}")
        if meta.get("pdf_metadata"):
            for k, v in meta["pdf_metadata"].items():
                if v:
                    add(f"- **{k} :** {v}")
        add("")

        # --- Per-page structure ---
        add("### Structure par page")
        for page in analysis["pages"]:
            add(f"\n#### Page {page['page_num']}")
            add(f"- **Dimensions :** {page['width']} x {page['height']} pts")
            add(f"- **Lignes de texte :** {page['lines_count']}")
            add(f"- **Caracteres (objets) :** {page['chars_count']}")
            add(f"- **Rectangles :** {page['rects_count']}")
            add(f"- **Images :** {page['images_count']}")
            add(f"- **Tableaux detectes :** {len(page['tables'])}")
        add("")

        # --- Full extracted text ---
        add("### Texte complet extrait")
        add("```")
        add(analysis["full_text"].strip())
        add("```")
        add("")

        # --- Tables ---
        if analysis["tables_all"]:
            add(f"### Tableaux detectes ({len(analysis['tables_all'])} au total)")
            for t in analysis["tables_all"]:
                add(f"\n#### Tableau {t['table_index']+1} (Page {t['page']}) - {t['rows']} lignes x {t['cols']} colonnes")
                add("")
                add(format_table_for_md(t["data"]))
                add("")
        else:
            add("### Tableaux detectes")
            add("_Aucun tableau detecte par pdfplumber._\n")

        # --- Section headers ---
        add("### Sections / Headers identifies")
        if analysis["headers_detected"]:
            # dict.fromkeys deduplicates while preserving first-seen order.
            for h in dict.fromkeys(analysis["headers_detected"]):
                add(f"- `{h}`")
        else:
            add("_Aucun header identifie._")
        add("")

        # --- Personal data ---
        add("### Donnees personnelles detectees")
        if analysis["personal_data"]:
            for category, values in analysis["personal_data"].items():
                add(f"\n**{category.replace('_', ' ').title()} :**")
                for v in sorted(values):
                    add(f"- `{v}`")
        else:
            add("_Aucune donnee personnelle detectee._")
        add("")

    # --- Cross-file comparison table ---
    add(f"\n{'='*80}")
    add("## Resume comparatif")
    add(f"{'='*80}\n")

    add("| Caracteristique | " + " | ".join(a["filename"] for a in analyses) + " |")
    add("| --- | " + " | ".join(["---"] * len(analyses)) + " |")
    add("| Pages | " + " | ".join(str(a["metadata"]["num_pages"]) for a in analyses) + " |")
    add("| Tableaux | " + " | ".join(str(len(a["tables_all"])) for a in analyses) + " |")
    add("| Headers | " + " | ".join(str(len(set(a["headers_detected"]))) for a in analyses) + " |")
    add("| Longueur texte | " + " | ".join(str(len(a["full_text"])) + " chars" for a in analyses) + " |")

    return "\n".join(report)
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Analyse every PDF in INPUT_DIR and write the Markdown report."""
    pdf_files = sorted(
        os.path.join(INPUT_DIR, name)
        for name in os.listdir(INPUT_DIR)
        if name.lower().endswith('.pdf')
    )

    print(f"Fichiers PDF trouves : {len(pdf_files)}")
    for f in pdf_files:
        print(f" - {f}")

    analyses = []
    for filepath in pdf_files:
        print(f"\nAnalyse de : {os.path.basename(filepath)} ...")
        analysis = analyze_pdf(filepath)
        analyses.append(analysis)
        # Short per-file summary on stdout; the details go to the report.
        print(f" Pages: {analysis['metadata']['num_pages']}")
        print(f" Tableaux: {len(analysis['tables_all'])}")
        print(f" Headers: {len(set(analysis['headers_detected']))}")
        print(f" Texte: {len(analysis['full_text'])} chars")

    report = generate_report(analyses)

    with open(REPORT_FILE, "w", encoding="utf-8") as out:
        out.write(report)

    print(f"\n{'='*60}")
    print(f"Rapport ecrit dans : {REPORT_FILE}")
    print(f"{'='*60}")

    print("\n")
    print(report)


if __name__ == "__main__":
    main()
|
||||||
4075
rapport_analyse_pdfs.md
Normal file
4075
rapport_analyse_pdfs.md
Normal file
File diff suppressed because it is too large
Load Diff
8
requirements.txt
Normal file
8
requirements.txt
Normal file
@@ -0,0 +1,8 @@
|
|||||||
|
pdfplumber>=0.10.0
|
||||||
|
transformers>=4.35.0
|
||||||
|
torch>=2.1.0
|
||||||
|
regex>=2023.0
|
||||||
|
pydantic>=2.5.0
|
||||||
|
pytest>=7.4.0
|
||||||
|
sentencepiece>=0.1.99,<0.2.0
|
||||||
|
edsnlp[ml]>=0.17.0
|
||||||
0
src/__init__.py
Normal file
0
src/__init__.py
Normal file
0
src/anonymization/__init__.py
Normal file
0
src/anonymization/__init__.py
Normal file
529
src/anonymization/anonymizer.py
Normal file
529
src/anonymization/anonymizer.py
Normal file
@@ -0,0 +1,529 @@
|
|||||||
|
"""Pipeline d'anonymisation en 3 phases : regex → NER → balayage final."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
import re
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
|
import regex as regex_mod
|
||||||
|
|
||||||
|
from ..config import KEEP_ESTABLISHMENT_NAME, AnonymizationReport
|
||||||
|
from . import regex_patterns as patterns
|
||||||
|
from .entity_registry import EntityRegistry
|
||||||
|
from .ner_anonymizer import extract_person_entities
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
# Termes médicaux à ne pas anonymiser même s'ils ressemblent à des noms
|
||||||
|
MEDICAL_TERMS_WHITELIST = {
|
||||||
|
"balthazar", "sris", "ras", "atg", "pca", "bcy", "bcr",
|
||||||
|
"nac", "nacl", "asat", "alat", "ggt", "pal", "crp", "imc",
|
||||||
|
"en", "pa", "fc", "vvp", "ide", "iao", "mco", "urg", "bh",
|
||||||
|
"kt", "vbp", "iv", "ap", "am", "ok", "apres", "sous",
|
||||||
|
"normal", "normaux", "stable", "absent", "absente",
|
||||||
|
"date", "heure", "type", "note", "etat", "code",
|
||||||
|
"orale", "intraveineuse", "signé", "arrêté", "réalisé",
|
||||||
|
# Termes médicaux fréquents à ne jamais anonymiser
|
||||||
|
"cholécystectomie", "cholecystectomie", "cholangiographie",
|
||||||
|
"pancréatite", "pancreatite", "lithiase", "lithiases",
|
||||||
|
"cœlioscopie", "coelioscopie", "cholédoque", "choledoque",
|
||||||
|
"angiocholite", "cholécystite", "cholecystite",
|
||||||
|
"morphine", "paracétamol", "paracetamol", "cétirizine", "cetirizine",
|
||||||
|
"tramadol", "contramal", "acupan", "nefopam",
|
||||||
|
"service", "médecin", "medecin", "docteur", "chirurgie",
|
||||||
|
"gastro", "entérologie", "enterologie", "oncologie",
|
||||||
|
"hépato", "hepato", "digestif", "digestive",
|
||||||
|
"proctologue", "nutritive", "pôle", "pole",
|
||||||
|
"fonct", "fonctionnelle", "fonctionnelles",
|
||||||
|
"praticiens", "hospitaliers", "interne", "clinique",
|
||||||
|
"desc", "chef",
|
||||||
|
"secrétariat", "infirmier", "infirmière",
|
||||||
|
"unité", "hospitalisation", "urgences",
|
||||||
|
"coordonnateur", "fédération", "federation",
|
||||||
|
"navarre", "institut", "cancérologie",
|
||||||
|
"bordeaux", "strasbourg", "reims", "limoges", "clermont", "ferrand",
|
||||||
|
"palais",
|
||||||
|
}
|
||||||
|
|
||||||
|
# Noms d'établissement à préserver si configuré
|
||||||
|
ESTABLISHMENT_NAMES = {
|
||||||
|
"centre hospitalier cote basque",
|
||||||
|
"centre hospitalier côte basque",
|
||||||
|
"ch-cotebasque",
|
||||||
|
"icance",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
class Anonymizer:
|
||||||
|
"""Anonymiseur 3 phases pour documents médicaux."""
|
||||||
|
|
||||||
|
def __init__(self, parsed_data: dict | None = None):
|
||||||
|
self.registry = EntityRegistry(whitelist=MEDICAL_TERMS_WHITELIST)
|
||||||
|
self.report = AnonymizationReport(source_file="")
|
||||||
|
self._parsed = parsed_data or {}
|
||||||
|
|
||||||
|
# Pré-enregistrer les entités connues du parsing
|
||||||
|
self._register_parsed_entities()
|
||||||
|
|
||||||
|
def anonymize(self, text: str) -> str:
|
||||||
|
"""Exécute les 3 phases d'anonymisation."""
|
||||||
|
text = self._phase1_regex(text)
|
||||||
|
text = self._phase2_ner(text)
|
||||||
|
text = self._phase3_sweep(text)
|
||||||
|
|
||||||
|
self.report.total_replacements = (
|
||||||
|
self.report.regex_replacements
|
||||||
|
+ self.report.ner_replacements
|
||||||
|
+ self.report.sweep_replacements
|
||||||
|
)
|
||||||
|
return text
|
||||||
|
|
||||||
|
# --- Phase 1 : Regex ---
|
||||||
|
|
||||||
|
def _phase1_regex(self, text: str) -> str:
|
||||||
|
"""Anonymisation par patterns regex."""
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
# CRH footer combiné (IPP + Episode sur la même ligne)
|
||||||
|
text, n = self._replace_crh_footer_ipp_episode(text)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
# Identifiants
|
||||||
|
text, n = self._replace_pattern(
|
||||||
|
text, patterns.IPP_PATTERN, "ipp",
|
||||||
|
group_handler=self._handle_multi_group,
|
||||||
|
)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
text, n = self._replace_pattern(
|
||||||
|
text, patterns.EPISODE_PATTERN, "episode",
|
||||||
|
group_handler=self._handle_multi_group,
|
||||||
|
)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
text, n = self._replace_pattern(text, patterns.NIR_PATTERN, "nir")
|
||||||
|
count += n
|
||||||
|
|
||||||
|
text, n = self._replace_pattern(text, patterns.FINESS_PATTERN, "finess")
|
||||||
|
count += n
|
||||||
|
|
||||||
|
text, n = self._replace_pattern(text, patterns.RPPS_PATTERN, "rpps")
|
||||||
|
count += n
|
||||||
|
|
||||||
|
text, n = self._replace_pattern(text, patterns.BARCODE_PATTERN, "code_barre")
|
||||||
|
count += n
|
||||||
|
|
||||||
|
text, n = self._replace_pattern(text, patterns.BARCODE_REPEAT_PATTERN, "code_barre")
|
||||||
|
count += n
|
||||||
|
|
||||||
|
# Contact
|
||||||
|
text, n = self._replace_phone(text)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
text, n = self._replace_pattern(
|
||||||
|
text, patterns.EMAIL_PATTERN, "email",
|
||||||
|
skip_establishment_check=True,
|
||||||
|
)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
text, n = self._replace_fax(text)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
# Adresses
|
||||||
|
text, n = self._replace_addresses(text)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
# Scanner les patterns d'adresse inline (MAISON xxx, QUARTIER xxx...)
|
||||||
|
text, n = self._replace_inline_addresses(text)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
# Dates de naissance
|
||||||
|
text, n = self._replace_date_naissance(text)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
# Lieu de naissance
|
||||||
|
text, n = self._replace_pattern(
|
||||||
|
text, patterns.LIEU_NAISSANCE_PATTERN, "lieu_naissance",
|
||||||
|
)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
# Noms structurés
|
||||||
|
text, n = self._replace_structured_names(text)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
# Footers (Trackare et CRH)
|
||||||
|
text, n = self._replace_footer(text)
|
||||||
|
count += n
|
||||||
|
|
||||||
|
self.report.regex_replacements = count
|
||||||
|
return text
|
||||||
|
|
||||||
|
# --- Phase 2 : NER ---
|
||||||
|
|
||||||
|
def _phase2_ner(self, text: str) -> str:
|
||||||
|
"""Anonymisation par NER CamemBERT."""
|
||||||
|
try:
|
||||||
|
ner_entities = extract_person_entities(text)
|
||||||
|
except Exception as e:
|
||||||
|
logger.warning("NER indisponible (%s), phase 2 ignorée.", e)
|
||||||
|
return text
|
||||||
|
|
||||||
|
count = 0
|
||||||
|
# Trier par position décroissante pour remplacer de la fin au début
|
||||||
|
ner_entities.sort(key=lambda e: e["start"], reverse=True)
|
||||||
|
|
||||||
|
for ent in ner_entities:
|
||||||
|
word = ent["word"]
|
||||||
|
if self._is_whitelisted(word):
|
||||||
|
continue
|
||||||
|
if self._is_establishment(word):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Vérifier si déjà anonymisé (contient des crochets)
|
||||||
|
if "[" in word and "]" in word:
|
||||||
|
continue
|
||||||
|
|
||||||
|
pseudo = self.registry.get_replacement(word)
|
||||||
|
if pseudo is None:
|
||||||
|
pseudo = self.registry.register(word, "personne")
|
||||||
|
|
||||||
|
text = text[:ent["start"]] + pseudo + text[ent["end"]:]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
self.report.entities_found.append({
|
||||||
|
"original": word,
|
||||||
|
"replacement": pseudo,
|
||||||
|
"source": "ner",
|
||||||
|
"score": ent["score"],
|
||||||
|
})
|
||||||
|
|
||||||
|
self.report.ner_replacements = count
|
||||||
|
return text
|
||||||
|
|
||||||
|
# --- Phase 3 : Balayage final ---
|
||||||
|
|
||||||
|
def _phase3_sweep(self, text: str) -> str:
|
||||||
|
"""Balayage brute-force des entités connues restantes."""
|
||||||
|
count = 0
|
||||||
|
all_entities = self.registry.get_all_entities()
|
||||||
|
|
||||||
|
for original, replacement in sorted(
|
||||||
|
all_entities.items(), key=lambda x: len(x[0]), reverse=True
|
||||||
|
):
|
||||||
|
if len(original) < 3:
|
||||||
|
continue
|
||||||
|
if self._is_whitelisted(original):
|
||||||
|
continue
|
||||||
|
|
||||||
|
# Recherche insensible à la casse, avec frontières de mots
|
||||||
|
escaped = re.escape(original)
|
||||||
|
pattern = re.compile(r"\b" + escaped + r"\b", re.IGNORECASE)
|
||||||
|
matches = pattern.findall(text)
|
||||||
|
if matches:
|
||||||
|
text = pattern.sub(replacement, text)
|
||||||
|
count += len(matches)
|
||||||
|
|
||||||
|
self.report.sweep_replacements = count
|
||||||
|
return text
|
||||||
|
|
||||||
|
# --- Helpers ---
|
||||||
|
|
||||||
|
def _register_parsed_entities(self) -> None:
|
||||||
|
"""Pré-enregistre les entités extraites par les parsers."""
|
||||||
|
patient = self._parsed.get("patient", {})
|
||||||
|
|
||||||
|
# Noms patient
|
||||||
|
for key in ("nom_prenom", "nom_naissance", "nom_complet"):
|
||||||
|
if patient.get(key):
|
||||||
|
self.registry.register(patient[key], "patient")
|
||||||
|
|
||||||
|
# Adresse patient — enregistrer l'adresse complète et chaque mot significatif
|
||||||
|
if patient.get("adresse"):
|
||||||
|
self._register_address(patient["adresse"])
|
||||||
|
if patient.get("ville"):
|
||||||
|
self.registry.register(patient["ville"], "adresse")
|
||||||
|
if patient.get("code_postal"):
|
||||||
|
cp = patient["code_postal"]
|
||||||
|
if patient.get("ville"):
|
||||||
|
self.registry.register(f"{cp} {patient['ville']}", "adresse")
|
||||||
|
if patient.get("lieu_naissance"):
|
||||||
|
self.registry.register(patient["lieu_naissance"], "lieu_naissance")
|
||||||
|
|
||||||
|
# Médecins
|
||||||
|
for med in self._parsed.get("medecins", []):
|
||||||
|
self.registry.register(med, "medecin")
|
||||||
|
|
||||||
|
# Scanner le texte brut pour les lignes d'adresse non captées par le parser
|
||||||
|
raw_text = self._parsed.get("contenu_medical", "")
|
||||||
|
# Pas disponible ici, on le fera via les patterns dans phase 1
|
||||||
|
|
||||||
|
# Contacts
|
||||||
|
for contact in self._parsed.get("contacts", []):
|
||||||
|
# Extraire les noms des contacts
|
||||||
|
names = re.findall(
|
||||||
|
r"([A-ZÉÈÊËÀÂa-zéèêëàâ]{2,}(?:\s+[A-ZÉÈÊËÀÂa-zéèêëàâ]{2,})+)",
|
||||||
|
contact,
|
||||||
|
)
|
||||||
|
for name in names:
|
||||||
|
if not self._is_whitelisted(name):
|
||||||
|
self.registry.register(name, "contact")
|
||||||
|
|
||||||
|
def _replace_pattern(
|
||||||
|
self,
|
||||||
|
text: str,
|
||||||
|
pattern: regex_mod.Pattern,
|
||||||
|
category: str,
|
||||||
|
group_handler: Any = None,
|
||||||
|
skip_establishment_check: bool = False,
|
||||||
|
) -> tuple[str, int]:
|
||||||
|
"""Remplace les matches d'un pattern."""
|
||||||
|
count = 0
|
||||||
|
for m in reversed(list(pattern.finditer(text))):
|
||||||
|
if group_handler:
|
||||||
|
matched_text = group_handler(m)
|
||||||
|
else:
|
||||||
|
matched_text = m.group(1) if m.lastindex else m.group(0)
|
||||||
|
|
||||||
|
if not matched_text:
|
||||||
|
continue
|
||||||
|
|
||||||
|
if not skip_establishment_check and self._is_establishment(matched_text):
|
||||||
|
continue
|
||||||
|
|
||||||
|
pseudo = self.registry.register(matched_text, category)
|
||||||
|
|
||||||
|
# Trouver le bon span à remplacer
|
||||||
|
if group_handler:
|
||||||
|
# Pour les multi-group, trouver quel groupe a matché
|
||||||
|
for i in range(1, (m.lastindex or 0) + 1):
|
||||||
|
if m.group(i) == matched_text:
|
||||||
|
start, end = m.span(i)
|
||||||
|
break
|
||||||
|
else:
|
||||||
|
start, end = m.span()
|
||||||
|
elif m.lastindex:
|
||||||
|
start, end = m.span(1)
|
||||||
|
else:
|
||||||
|
start, end = m.span()
|
||||||
|
|
||||||
|
text = text[:start] + pseudo + text[end:]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
self.report.entities_found.append({
|
||||||
|
"original": matched_text,
|
||||||
|
"replacement": pseudo,
|
||||||
|
"source": "regex",
|
||||||
|
"category": category,
|
||||||
|
})
|
||||||
|
|
||||||
|
return text, count
|
||||||
|
|
||||||
|
def _handle_multi_group(self, m: regex_mod.Match) -> str | None:
|
||||||
|
"""Gère les patterns avec plusieurs groupes alternatifs."""
|
||||||
|
for i in range(1, (m.lastindex or 0) + 1):
|
||||||
|
if m.group(i):
|
||||||
|
return m.group(i)
|
||||||
|
return None
|
||||||
|
|
||||||
|
def _replace_crh_footer_ipp_episode(self, text: str) -> tuple[str, int]:
|
||||||
|
"""Remplace les IPP/épisode dans les footers CRH (format combiné)."""
|
||||||
|
count = 0
|
||||||
|
for m in reversed(list(patterns.CRH_FOOTER_IPP_EPISODE.finditer(text))):
|
||||||
|
ipp = m.group(1)
|
||||||
|
episode = m.group(2)
|
||||||
|
pseudo_ipp = self.registry.register(ipp, "ipp")
|
||||||
|
pseudo_ep = self.registry.register(episode, "episode")
|
||||||
|
replacement = f"IPP {pseudo_ipp} / N° Episode {pseudo_ep}"
|
||||||
|
text = text[:m.start()] + replacement + text[m.end():]
|
||||||
|
count += 2
|
||||||
|
return text, count
|
||||||
|
|
||||||
|
def _replace_phone(self, text: str) -> tuple[str, int]:
|
||||||
|
"""Remplace les numéros de téléphone."""
|
||||||
|
count = 0
|
||||||
|
for m in reversed(list(patterns.PHONE_PATTERN.finditer(text))):
|
||||||
|
phone = m.group(0)
|
||||||
|
# Ne pas anonymiser le standard de l'hôpital si configuré
|
||||||
|
normalized = phone.replace(".", " ").replace("-", " ")
|
||||||
|
if KEEP_ESTABLISHMENT_NAME and "05 59 44 35 35" in normalized:
|
||||||
|
continue
|
||||||
|
pseudo = self.registry.register(phone, "telephone")
|
||||||
|
text = text[:m.start()] + pseudo + text[m.end():]
|
||||||
|
count += 1
|
||||||
|
return text, count
|
||||||
|
|
||||||
|
def _replace_fax(self, text: str) -> tuple[str, int]:
|
||||||
|
"""Remplace les numéros de fax."""
|
||||||
|
count = 0
|
||||||
|
for m in reversed(list(patterns.FAX_PATTERN.finditer(text))):
|
||||||
|
fax_num = m.group(1)
|
||||||
|
pseudo = self.registry.register(fax_num, "telephone")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
return text, count
|
||||||
|
|
||||||
|
def _replace_addresses(self, text: str) -> tuple[str, int]:
|
||||||
|
"""Remplace les adresses."""
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
# Lignes d'adresse
|
||||||
|
for m in reversed(list(patterns.ADDRESS_LINE_PATTERN.finditer(text))):
|
||||||
|
addr = m.group(1).strip()
|
||||||
|
if len(addr) > 5 and not self._is_establishment(addr):
|
||||||
|
pseudo = self.registry.register(addr, "adresse")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
# Code postal + ville (sauf l'hôpital / Bayonne)
|
||||||
|
for m in reversed(list(patterns.CP_VILLE_PATTERN.finditer(text))):
|
||||||
|
ville = m.group(2).strip()
|
||||||
|
cp = m.group(1)
|
||||||
|
full = f"{cp} {ville}"
|
||||||
|
if self._is_establishment(full) or "BAYONNE" in ville.upper():
|
||||||
|
if not KEEP_ESTABLISHMENT_NAME:
|
||||||
|
pseudo = self.registry.register(full, "adresse")
|
||||||
|
text = text[:m.start()] + pseudo + text[m.end():]
|
||||||
|
count += 1
|
||||||
|
else:
|
||||||
|
pseudo = self.registry.register(full, "adresse")
|
||||||
|
text = text[:m.start()] + pseudo + text[m.end():]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
return text, count
|
||||||
|
|
||||||
|
def _replace_inline_addresses(self, text: str) -> tuple[str, int]:
|
||||||
|
"""Capture les adresses inline (MAISON xxx, QUARTIER xxx, LOTISSEMENT xxx)."""
|
||||||
|
count = 0
|
||||||
|
# Pattern : MAISON/QUARTIER/LOTISSEMENT suivi de mots (noms propres de lieux)
|
||||||
|
inline_addr = re.compile(
|
||||||
|
r"((?:MAISON|QUARTIER|LOTISSEMENT|RESIDENCE|HAMEAU)\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s]+?)(?=\n|$|Dr|\d{5}|Chef|médical|coordonnateur)",
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
for m in reversed(list(inline_addr.finditer(text))):
|
||||||
|
addr = m.group(1).strip()
|
||||||
|
if len(addr) > 5:
|
||||||
|
self._register_address(addr)
|
||||||
|
pseudo = self.registry.register(addr, "adresse")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
return text, count
|
||||||
|
|
||||||
|
def _replace_date_naissance(self, text: str) -> tuple[str, int]:
|
||||||
|
"""Remplace les dates de naissance."""
|
||||||
|
count = 0
|
||||||
|
for m in reversed(list(patterns.DATE_NAISSANCE_PATTERN.finditer(text))):
|
||||||
|
date_str = m.group(1)
|
||||||
|
pseudo = self.registry.register(date_str, "date_naissance")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
return text, count
|
||||||
|
|
||||||
|
def _replace_structured_names(self, text: str) -> tuple[str, int]:
|
||||||
|
"""Remplace les noms détectés par patterns structurels."""
|
||||||
|
count = 0
|
||||||
|
|
||||||
|
# CRH footer patient : "Patient(e) : NOM PRENOM Né(e)"
|
||||||
|
for m in reversed(list(patterns.CRH_FOOTER_PATIENT_PATTERN.finditer(text))):
|
||||||
|
name = m.group(1).strip()
|
||||||
|
if len(name) >= 3 and not self._is_whitelisted(name):
|
||||||
|
pseudo = self.registry.register(name, "patient")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
# Patient names
|
||||||
|
for pat in [patterns.PATIENT_NAME_PATTERN, patterns.CIVILITE_NAME_PATTERN]:
|
||||||
|
for m in reversed(list(pat.finditer(text))):
|
||||||
|
name = m.group(1).strip()
|
||||||
|
if len(name) >= 3 and not self._is_whitelisted(name):
|
||||||
|
pseudo = self.registry.register(name, "patient")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
# Doctor names (tous les patterns)
|
||||||
|
for pat in [patterns.DR_NAME_PATTERN, patterns.MEDECIN_COURANT_PATTERN,
|
||||||
|
patterns.MEDECIN_TRAITANT_PATTERN, patterns.MEDECIN_PEC_PATTERN]:
|
||||||
|
for m in reversed(list(pat.finditer(text))):
|
||||||
|
name = m.group(1).strip()
|
||||||
|
if len(name) >= 3 and not self._is_whitelisted(name):
|
||||||
|
pseudo = self.registry.register(name, "medecin")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
# Note authors (with date suffix)
|
||||||
|
for m in reversed(list(patterns.NOTE_AUTHOR_DATE_PATTERN.finditer(text))):
|
||||||
|
name = m.group(1).strip()
|
||||||
|
if len(name) >= 3 and not self._is_whitelisted(name):
|
||||||
|
pseudo = self.registry.register(name, "soignant")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
# Note authors (Prénom NOM pattern, sans date)
|
||||||
|
for m in reversed(list(patterns.NOTE_AUTHOR_PATTERN.finditer(text))):
|
||||||
|
name = m.group(1).strip()
|
||||||
|
if len(name) >= 3 and not self._is_whitelisted(name):
|
||||||
|
pseudo = self.registry.register(name, "soignant")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
# IAO
|
||||||
|
for m in reversed(list(patterns.IAO_PATTERN.finditer(text))):
|
||||||
|
name = m.group(1).strip()
|
||||||
|
if len(name) >= 3 and not self._is_whitelisted(name):
|
||||||
|
pseudo = self.registry.register(name, "soignant")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
# Rédigé par
|
||||||
|
for m in reversed(list(patterns.REDIGE_PAR_PATTERN.finditer(text))):
|
||||||
|
name = m.group(1).strip()
|
||||||
|
if len(name) >= 3 and not self._is_whitelisted(name):
|
||||||
|
pseudo = self.registry.register(name, "soignant")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
# Staff names from header
|
||||||
|
for m in reversed(list(patterns.STAFF_NAME_PATTERN.finditer(text))):
|
||||||
|
name = m.group(1).strip() if m.group(1) else ""
|
||||||
|
if len(name) >= 3 and not self._is_whitelisted(name):
|
||||||
|
pseudo = self.registry.register(name, "soignant")
|
||||||
|
text = text[:m.start(1)] + pseudo + text[m.end(1):]
|
||||||
|
count += 1
|
||||||
|
|
||||||
|
self.report.regex_replacements += count
|
||||||
|
return text, count
|
||||||
|
|
||||||
|
def _replace_footer(self, text: str) -> tuple[str, int]:
    """Replace patient information found in footers (Trackare and CRH)."""
    replaced = 0
    matches = list(patterns.FOOTER_PATIENT_PATTERN.finditer(text))
    # Walk matches right-to-left so earlier match offsets stay valid
    # after each splice.
    for match in reversed(matches):
        patient_name = match.group(1).strip()
        alias = self.registry.register(patient_name, "patient")
        text = text[: match.start(1)] + alias + text[match.end(1):]
        replaced += 1
    return text, replaced
|
||||||
|
|
||||||
|
def _register_address(self, addr: str) -> None:
    """Register an address plus each of its significant words."""
    self.registry.register(addr, "adresse")
    # Generic street/locality vocabulary that must not be pseudonymised
    # when it appears on its own.
    generic_words = {
        "maison", "quartier", "lotissement", "rue", "avenue",
        "boulevard", "chemin", "place", "route", "résidence",
        "hameau", "lieu", "dit", "impasse", "allée", "batiment",
        "bp", "cedex",
    }
    for token in addr.split():
        stripped = token.strip(",.")
        if len(stripped) < 4 or stripped.lower() in generic_words:
            continue
        self.registry.register(stripped, "adresse")
|
||||||
|
|
||||||
|
def _is_whitelisted(self, text: str) -> bool:
    """Return True when the term belongs to the medical whitelist."""
    candidate = text.lower().strip()
    return candidate in MEDICAL_TERMS_WHITELIST
|
||||||
|
|
||||||
|
def _is_establishment(self, text: str) -> bool:
    """Return True when the text refers to the establishment itself."""
    if not KEEP_ESTABLISHMENT_NAME:
        return False
    needle = text.lower().strip()
    for establishment in ESTABLISHMENT_NAMES:
        if establishment in needle:
            return True
    return False
|
||||||
86
src/anonymization/entity_registry.py
Normal file
86
src/anonymization/entity_registry.py
Normal file
@@ -0,0 +1,86 @@
|
|||||||
|
"""Registre d'entités pour assurer la cohérence des remplacements."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
|
|
||||||
|
class EntityRegistry:
|
||||||
|
"""Maintient un mapping cohérent entre entités réelles et pseudonymes."""
|
||||||
|
|
||||||
|
def __init__(self, whitelist: set[str] | None = None):
|
||||||
|
self._counters: dict[str, int] = defaultdict(int)
|
||||||
|
self._mappings: dict[str, str] = {}
|
||||||
|
self._category_map: dict[str, str] = {}
|
||||||
|
self._whitelist: set[str] = whitelist or set()
|
||||||
|
|
||||||
|
def register(self, entity: str, category: str) -> str:
|
||||||
|
"""Enregistre une entité et retourne son pseudonyme.
|
||||||
|
|
||||||
|
Si l'entité est déjà connue, retourne le même pseudonyme.
|
||||||
|
"""
|
||||||
|
key = self._normalize(entity)
|
||||||
|
if not key:
|
||||||
|
return entity
|
||||||
|
|
||||||
|
if key in self._mappings:
|
||||||
|
return self._mappings[key]
|
||||||
|
|
||||||
|
self._counters[category] += 1
|
||||||
|
count = self._counters[category]
|
||||||
|
|
||||||
|
pseudo = self._generate_pseudo(category, count)
|
||||||
|
self._mappings[key] = pseudo
|
||||||
|
self._category_map[key] = category
|
||||||
|
|
||||||
|
# Enregistrer aussi les sous-parties du nom (sauf termes médicaux)
|
||||||
|
parts = key.split()
|
||||||
|
if len(parts) > 1:
|
||||||
|
for part in parts:
|
||||||
|
if len(part) >= 3 and part not in self._whitelist:
|
||||||
|
part_key = part
|
||||||
|
if part_key not in self._mappings:
|
||||||
|
self._mappings[part_key] = f"[{category.upper()}]"
|
||||||
|
|
||||||
|
return pseudo
|
||||||
|
|
||||||
|
def get_replacement(self, entity: str) -> str | None:
|
||||||
|
"""Retourne le pseudonyme d'une entité connue, ou None."""
|
||||||
|
key = self._normalize(entity)
|
||||||
|
return self._mappings.get(key)
|
||||||
|
|
||||||
|
def get_all_entities(self) -> dict[str, str]:
|
||||||
|
"""Retourne tous les mappings entity → pseudo."""
|
||||||
|
return dict(self._mappings)
|
||||||
|
|
||||||
|
def get_all_original_names(self) -> list[str]:
|
||||||
|
"""Retourne toutes les entités originales (noms avant normalisation)."""
|
||||||
|
return list(self._mappings.keys())
|
||||||
|
|
||||||
|
def _normalize(self, text: str) -> str:
|
||||||
|
"""Normalise un nom pour lookup : minuscules, espaces simplifiés."""
|
||||||
|
text = text.strip()
|
||||||
|
text = re.sub(r"\s+", " ", text)
|
||||||
|
return text.lower()
|
||||||
|
|
||||||
|
def _generate_pseudo(self, category: str, count: int) -> str:
|
||||||
|
"""Génère un pseudonyme selon la catégorie."""
|
||||||
|
labels = {
|
||||||
|
"patient": f"[PATIENT_{count}]",
|
||||||
|
"medecin": f"[MEDECIN_{count}]",
|
||||||
|
"soignant": f"[SOIGNANT_{count}]",
|
||||||
|
"contact": f"[CONTACT_{count}]",
|
||||||
|
"personne": f"[PERSONNE_{count}]",
|
||||||
|
"ipp": f"[IPP_{count}]",
|
||||||
|
"episode": f"[EPISODE_{count}]",
|
||||||
|
"nir": f"[NIR_{count}]",
|
||||||
|
"telephone": f"[TEL_{count}]",
|
||||||
|
"email": f"[EMAIL_{count}]",
|
||||||
|
"adresse": f"[ADRESSE_{count}]",
|
||||||
|
"date_naissance": f"[DATE_NAISS_{count}]",
|
||||||
|
"lieu_naissance": f"[LIEU_NAISS_{count}]",
|
||||||
|
"finess": f"[FINESS]",
|
||||||
|
"code_barre": f"[CODE_BARRE_{count}]",
|
||||||
|
}
|
||||||
|
return labels.get(category, f"[{category.upper()}_{count}]")
|
||||||
95
src/anonymization/ner_anonymizer.py
Normal file
95
src/anonymization/ner_anonymizer.py
Normal file
@@ -0,0 +1,95 @@
|
|||||||
|
"""NER via CamemBERT pour détecter les noms en texte libre."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from typing import TYPE_CHECKING
|
||||||
|
|
||||||
|
from ..config import NER_CONFIDENCE_THRESHOLD, NER_MODEL
|
||||||
|
|
||||||
|
if TYPE_CHECKING:
|
||||||
|
from transformers import Pipeline
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_pipeline: Pipeline | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _get_pipeline() -> Pipeline:
    """Load the NER model on first use (lazy loading)."""
    global _pipeline
    if _pipeline is not None:
        return _pipeline

    logger.info("Chargement du modèle NER %s...", NER_MODEL)
    # Imported lazily so importing this module stays cheap.
    from transformers import AutoModelForTokenClassification, AutoTokenizer, pipeline

    _pipeline = pipeline(
        "ner",
        model=AutoModelForTokenClassification.from_pretrained(NER_MODEL),
        tokenizer=AutoTokenizer.from_pretrained(NER_MODEL),
        aggregation_strategy="simple",
    )
    logger.info("Modèle NER chargé.")
    return _pipeline
|
||||||
|
|
||||||
|
|
||||||
|
def extract_person_entities(text: str) -> list[dict]:
    """Extract PER (person) entities from the text.

    Returns a list of dicts with 'word', 'start', 'end', 'score'.
    """
    pipe = _get_pipeline()

    # CamemBERT has a token limit — run the model chunk by chunk and shift
    # the entity offsets back into full-text coordinates.
    found: list[dict] = []
    base = 0
    for chunk in _split_text(text, max_chars=500):
        for ent in pipe(chunk):
            if ent["entity_group"] != "PER" or ent["score"] < NER_CONFIDENCE_THRESHOLD:
                continue
            word = ent["word"].strip()
            if len(word) < 2:
                continue
            found.append(
                {
                    "word": word,
                    "start": ent["start"] + base,
                    "end": ent["end"] + base,
                    "score": float(ent["score"]),
                }
            )
        base += len(chunk)

    return _deduplicate(found)
|
||||||
|
|
||||||
|
|
||||||
|
def _split_text(text: str, max_chars: int = 500) -> list[str]:
|
||||||
|
"""Découpe le texte en chunks de taille raisonnable aux limites de phrases."""
|
||||||
|
if len(text) <= max_chars:
|
||||||
|
return [text]
|
||||||
|
|
||||||
|
chunks: list[str] = []
|
||||||
|
start = 0
|
||||||
|
while start < len(text):
|
||||||
|
end = start + max_chars
|
||||||
|
if end < len(text):
|
||||||
|
# Chercher la fin de phrase la plus proche
|
||||||
|
for sep in ["\n", ". ", ", ", " "]:
|
||||||
|
pos = text.rfind(sep, start, end)
|
||||||
|
if pos > start:
|
||||||
|
end = pos + len(sep)
|
||||||
|
break
|
||||||
|
chunks.append(text[start:end])
|
||||||
|
start = end
|
||||||
|
|
||||||
|
return chunks
|
||||||
|
|
||||||
|
|
||||||
|
def _deduplicate(entities: list[dict]) -> list[dict]:
|
||||||
|
"""Déduplique les entités par mot (garde le score le plus élevé)."""
|
||||||
|
seen: dict[str, dict] = {}
|
||||||
|
for ent in entities:
|
||||||
|
key = ent["word"].lower()
|
||||||
|
if key not in seen or ent["score"] > seen[key]["score"]:
|
||||||
|
seen[key] = ent
|
||||||
|
return list(seen.values())
|
||||||
194
src/anonymization/regex_patterns.py
Normal file
194
src/anonymization/regex_patterns.py
Normal file
@@ -0,0 +1,194 @@
|
|||||||
|
"""Patterns regex pour la détection de données personnelles dans les documents médicaux FR."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import regex
|
||||||
|
|
||||||
|
# --- Identifiers ---

# IPP: 6-10 digit sequence after "IPP" (with or without a colon).
# NOTE: the two alternatives capture into different groups (1 vs 2) —
# callers must check both.
IPP_PATTERN = regex.compile(
    r"(?:IPP\s*[:=]?\s*)(\d{6,10})"
    r"|"
    r"\((\d{8})\s*\)",  # Footer form "(01306172 )"
)

# Episode number (all observed variants)
EPISODE_PATTERN = regex.compile(
    r"(?:Episode\s*(?:No|N°|N\.?)\s*[:=]?\s*)(\d{6,10})"
    r"|"
    r"(?:N°\s*Episode\s+)(\d{6,10})",
)

# NIR / French social-security number (15 digits, optional space before key)
NIR_PATTERN = regex.compile(r"\b([12]\d{2}(?:0[1-9]|1[0-2])\d{2,3}\d{6}\s?\d{2})\b")

# FINESS (9 digits, usually preceded by "Finess", possibly starred)
FINESS_PATTERN = regex.compile(r"(?:Finess|FINESS)\s*[:\s]*\*?(\d{9})\*?")

# RPPS (11 digits)
RPPS_PATTERN = regex.compile(r"RPPS\s*[:=]?\s*(\d{11})")

# Barcode (number between asterisks)
BARCODE_PATTERN = regex.compile(r"\*(\d{9,15})\*")

# Bare number right after a barcode (same number repeated without asterisks)
BARCODE_REPEAT_PATTERN = regex.compile(r"\*\d{9,15}\*\s*\n(\d{9,15})")

# --- Contact ---

# French phone numbers: 10 digits with assorted separators
PHONE_PATTERN = regex.compile(
    r"\b(0[1-9])[\s.\-]?(\d{2})[\s.\-]?(\d{2})[\s.\-]?(\d{2})[\s.\-]?(\d{2})\b"
)

# Emails (including @ch-cotebasque.fr addresses that embed staff initials)
EMAIL_PATTERN = regex.compile(
    r"\b[a-zA-Z0-9._%+\-]+@[a-zA-Z0-9.\-]+\.[a-zA-Z]{2,}\b"
)

# Fax numbers
FAX_PATTERN = regex.compile(
    r"Fax\s*:\s*(0[1-9][\s.\-]?\d{2}[\s.\-]?\d{2}[\s.\-]?\d{2}[\s.\-]?\d{2})"
)

# --- Addresses ---

# Postal code + city (only ALL-CAPS words after the 5 digits)
CP_VILLE_PATTERN = regex.compile(
    r"\b(\d{5})\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ\s\-]{2,})\b"
)

# Address lines starting with a street keyword (tolerates Basque/local
# proper names in the remainder of the line)
ADDRESS_LINE_PATTERN = regex.compile(
    r"^((?:(?:\d+\s*,?\s*)?(?:MAISON|LOTISSEMENT|QUARTIER|RUE|AVENUE|BOULEVARD|IMPASSE|CHEMIN|PLACE|ALLEE|ALLÉE|ROUTE|LIEU[\s-]DIT|RESIDENCE|RÉSIDENCE|BATIMENT|BÂTIMENT|HAMEAU)[\s\w\-''ÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ]+))$",
    regex.MULTILINE | regex.IGNORECASE,
)

# Complete multi-line address (after the patient name in CRH/Trackare)
ADDRESS_BLOCK_PATTERN = regex.compile(
    r"(?:Adresse\s*:\s*)(.+?)(?:\s+Ville|\n)",
)

# --- Birth dates ---

# All variants: "né(e) le", "née le", "né le", "Né(e) le", "Date de naissance:"
DATE_NAISSANCE_PATTERN = regex.compile(
    r"(?:[Nn][ée]+(?:\(e\))?\s+le\s+|Date de naissance\s*[:=]?\s*)(\d{2}/\d{2}/\d{4})"
)

# --- Structured names ---

# CRH footer: "Patient(e) : NOM PRENOM Né(e) le"
CRH_FOOTER_PATIENT_PATTERN = regex.compile(
    r"Patient(?:\(e\))?\s*:\s*([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\-]+?)\s+(?:Né|né)"
)

# CRH footer: "IPP NNNNNNNN / N° Episode NNNNNNNN"
CRH_FOOTER_IPP_EPISODE = regex.compile(
    r"IPP\s+(\d{6,10})\s*/\s*N°\s*Episode\s+(\d{6,10})"
)

# Name after "Nom de naissance:", "Nom et Prénom:" or "Patient(e):"
PATIENT_NAME_PATTERN = regex.compile(
    r"(?:Patient(?:\(e\))?\s*:\s*|Nom de naissance\s*:\s*|Nom et Prénom\s*:\s*)"
    r"([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\-]+)",
)

# "MME/Mme/M./MR/Madame/Monsieur" followed by the name
CIVILITE_NAME_PATTERN = regex.compile(
    r"(?:MME|Mme|Madame|M\.|Mr|MR|Monsieur)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\.\-]+?)(?:\s+[Nn]é|\s+Date|\n|,)"
)

# "DR." / "Dr" / "Docteur" followed by the physician's name (up to 3 words)
DR_NAME_PATTERN = regex.compile(
    r"(?:DR\.?|Dr\.?|Docteur)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+){0,2})"
)

# "Rédigé par ..." in the CRH footer
REDIGE_PAR_PATTERN = regex.compile(
    r"Rédigé par\s*:?\s*(.+?)(?:\n|$)"
)

# "Liste des destinataires:" — recipient names with a civility or title
DESTINATAIRE_PATTERN = regex.compile(
    r"(?:Madame|Monsieur|DR\.?|Dr\.?)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\.\-]+?)(?:\n|$)"
)

# Trackare note authors: "Note d'évolution Prénom NOM DD/MM/YYYY"
NOTE_AUTHOR_DATE_PATTERN = regex.compile(
    r"(?:Note d'évolution|Note IDE|Histoire de la maladie|Conclusion Obs\.?\s*médicales?)\s+"
    r"(?:DR\.?\s+)?"
    r"([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)+)"
    r"\s+\d{2}/\d{2}/\d{4}",
)

# Trackare note authors without an immediate date: "Note IDE Prénom NOM text..."
# The name is always a Capitalized first name followed by an ALL-CAPS surname.
NOTE_AUTHOR_PATTERN = regex.compile(
    r"(?:Note d'évolution|Note IDE|Histoire de la maladie|Conclusion Obs\.?\s*médicales?)\s+"
    r"(?:DR\.?\s+)?"
    r"([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][a-zéèêëàâäùûüôöîïç]+\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ\-]{2,})"
)

# Trackare footer: "Patient: NOM PRENOM - Date de naissance: ..."
FOOTER_PATIENT_PATTERN = regex.compile(
    r"Patient\s*:\s*([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\s\-]+?)\s*-\s*Date de naissance"
)

# "Médecin traitant" block (name before the address line starts)
MEDECIN_TRAITANT_PATTERN = regex.compile(
    r"Médecin traitant\s*\n\s*(?:Nom\s+Adresse\s+.*\n)?\s*(?:DR\.?\s+)?(.+?)(?:\s+(?:Lotissement|Rue|Avenue|\d{5}))",
    regex.IGNORECASE,
)

# "Médecin courant:"
MEDECIN_COURANT_PATTERN = regex.compile(
    r"Médecin courant\s*:\s*(?:DR\.?\s+)?([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)*)"
)

# "Médecin de la prise en charge médicale NOM"
MEDECIN_PEC_PATTERN = regex.compile(
    r"(?:Médecin de (?:la )?(?:prise en charge|décision)\s+médicale)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)*)"
)

# "IAO" followed by the staff member's name
IAO_PATTERN = regex.compile(
    r"IAO\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\.\-]+)*)"
)

# Manager / named staff in the CRH header ("Mme X Y" / "M. X Y")
STAFF_NAME_PATTERN = regex.compile(
    r"(?:Mme|M\.)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-Za-zéèêëàâäùûüôöîïç\.\-]+\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-Za-zéèêëàâäùûüôöîïç\.\-]+)"
)

# Place of birth
LIEU_NAISSANCE_PATTERN = regex.compile(
    r"Lieu de naissance\s*:\s*(.+?)(?:\n|$)"
)

# Prescription authors in Trackare
PRESCRIPTION_AUTHOR_PATTERN = regex.compile(
    r"(?:Presc\.\s*de\s*Sortie|Normal|Signé|Arrêté|Réalisé)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][a-zéèêëàâäùûüôöîïç]+(?:\s+[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-Za-zéèêëàâäùûüôöîïç\-]+)+)"
)
|
||||||
|
|
||||||
|
|
||||||
|
def get_all_name_patterns():
    """Return the list of regex patterns that capture person names."""
    name_patterns = [
        PATIENT_NAME_PATTERN,
        CIVILITE_NAME_PATTERN,
        DR_NAME_PATTERN,
        REDIGE_PAR_PATTERN,
        NOTE_AUTHOR_DATE_PATTERN,
        NOTE_AUTHOR_PATTERN,
        FOOTER_PATIENT_PATTERN,
        CRH_FOOTER_PATIENT_PATTERN,
        MEDECIN_TRAITANT_PATTERN,
        MEDECIN_COURANT_PATTERN,
        MEDECIN_PEC_PATTERN,
        IAO_PATTERN,
        STAFF_NAME_PATTERN,
        DESTINATAIRE_PATTERN,
        PRESCRIPTION_AUTHOR_PATTERN,
    ]
    return name_patterns
|
||||||
99
src/config.py
Normal file
99
src/config.py
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
"""Configuration globale et modèles de données pour le pipeline T2A."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from pydantic import BaseModel, Field
|
||||||
|
|
||||||
|
|
||||||
|
# --- Paths ---

BASE_DIR = Path(__file__).resolve().parent.parent  # project root (one level above src/)
INPUT_DIR = BASE_DIR / "input"
OUTPUT_DIR = BASE_DIR / "output"
ANONYMIZED_DIR = OUTPUT_DIR / "anonymized"
STRUCTURED_DIR = OUTPUT_DIR / "structured"
REPORTS_DIR = OUTPUT_DIR / "reports"

# Create the working directories at import time so every consumer of this
# module can rely on them existing.
for d in (INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR, REPORTS_DIR):
    d.mkdir(parents=True, exist_ok=True)


# --- Anonymisation configuration ---

KEEP_ESTABLISHMENT_NAME = True  # keep the establishment's own name in clear text
NER_MODEL = "Jean-Baptiste/camembert-ner"  # Hugging Face model id used for French NER
NER_CONFIDENCE_THRESHOLD = 0.80  # minimum score to accept a PER entity
|
||||||
|
|
||||||
|
|
||||||
|
# --- Modèles de données CIM-10 ---
|
||||||
|
|
||||||
|
|
||||||
|
class Sejour(BaseModel):
    """Administrative data about the hospital stay."""

    sexe: Optional[str] = None          # patient sex ("F"/"M" as set by the parsers)
    age: Optional[int] = None
    date_entree: Optional[str] = None   # admission date, DD/MM/YYYY string
    date_sortie: Optional[str] = None   # discharge date, DD/MM/YYYY string
    duree_sejour: Optional[int] = None  # length of stay — presumably days; confirm against caller
    mode_entree: Optional[str] = None
    mode_sortie: Optional[str] = None
    imc: Optional[float] = None         # body-mass index
    poids: Optional[float] = None       # weight in kg (Trackare parser reads "Poids: ... kg")
    taille: Optional[float] = None      # height — presumably cm; confirm against caller
|
||||||
|
|
||||||
|
|
||||||
|
class Diagnostic(BaseModel):
    """A diagnosis as free text, with an optional ICD-10 (CIM-10) suggestion."""

    texte: str
    cim10_suggestion: Optional[str] = None  # suggested ICD-10 code, when resolved
|
||||||
|
|
||||||
|
|
||||||
|
class ActeCCAM(BaseModel):
    """A medical procedure with an optional CCAM code suggestion."""

    texte: str
    code_ccam_suggestion: Optional[str] = None
    date: Optional[str] = None  # procedure date as a string (format not enforced here)
|
||||||
|
|
||||||
|
|
||||||
|
class Traitement(BaseModel):
    """A discharge medication."""

    medicament: str                  # drug name as written in the document
    posologie: Optional[str] = None  # dosage, free text
    code_atc: Optional[str] = None   # ATC classification code, when resolved
|
||||||
|
|
||||||
|
|
||||||
|
class BiologieCle(BaseModel):
    """A key laboratory result."""

    test: str
    valeur: Optional[str] = None    # measured value, kept as free text
    anomalie: Optional[bool] = None # True when flagged as abnormal
|
||||||
|
|
||||||
|
|
||||||
|
class Imagerie(BaseModel):
    """An imaging exam and its conclusion."""

    # NOTE: field name shadows the builtin `type`; fine as a pydantic field,
    # but avoid referencing the builtin inside this class body.
    type: str
    conclusion: Optional[str] = None
    score: Optional[str] = None  # score mentioned in the report, free text
|
||||||
|
|
||||||
|
|
||||||
|
class DossierMedical(BaseModel):
    """Structured medical record extracted from one source document."""

    source_file: str = ""    # originating file name
    document_type: str = ""  # "crh" or "trackare" (see document_classifier)
    sejour: Sejour = Field(default_factory=Sejour)
    diagnostic_principal: Optional[Diagnostic] = None
    diagnostics_associes: list[Diagnostic] = Field(default_factory=list)
    actes_ccam: list[ActeCCAM] = Field(default_factory=list)
    antecedents: list[str] = Field(default_factory=list)
    traitements_sortie: list[Traitement] = Field(default_factory=list)
    biologie_cle: list[BiologieCle] = Field(default_factory=list)
    imagerie: list[Imagerie] = Field(default_factory=list)
    complications: list[str] = Field(default_factory=list)
|
||||||
|
|
||||||
|
|
||||||
|
# --- Rapport d'anonymisation ---
|
||||||
|
|
||||||
|
|
||||||
|
class AnonymizationReport(BaseModel):
    """Counters and entities produced while anonymising one document."""

    source_file: str
    total_replacements: int = 0
    regex_replacements: int = 0  # replacements made by the regex layer
    ner_replacements: int = 0    # replacements made by the NER layer
    sweep_replacements: int = 0  # replacements made by the final sweep
    entities_found: list[dict] = Field(default_factory=list)
|
||||||
0
src/extraction/__init__.py
Normal file
0
src/extraction/__init__.py
Normal file
129
src/extraction/crh_parser.py
Normal file
129
src/extraction/crh_parser.py
Normal file
@@ -0,0 +1,129 @@
|
|||||||
|
"""Parsing des Comptes Rendus d'Hospitalisation (CRH)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def parse_crh(text: str) -> dict:
    """Parse a CRH (hospitalisation report) into structured sections."""
    parsed: dict = {
        "type": "crh",
        "patient": {},
        "sejour": {},
        "medecins": [],
        "contenu_medical": "",
        "sections": {},
    }

    # Each extractor fills its own part of the result dict in place.
    for extractor in (
        _extract_patient_info,
        _extract_sejour_info,
        _extract_medecins,
        _extract_medical_content,
    ):
        extractor(text, parsed)

    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_patient_info(text: str, result: dict) -> None:
|
||||||
|
"""Extrait les informations patient du CRH."""
|
||||||
|
# "MME NARBAIS AUDREY" ou "M. NOM PRENOM"
|
||||||
|
m = re.search(
|
||||||
|
r"(?:MME|M\.|MR)\s+([A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ][A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇa-zéèêëàâäùûüôöîïç\- ]+)",
|
||||||
|
text[:2000],
|
||||||
|
)
|
||||||
|
if m:
|
||||||
|
result["patient"]["nom_complet"] = m.group(1).strip()
|
||||||
|
|
||||||
|
# Adresse sous le nom patient — capturer les lignes entre le nom et le CP+Ville
|
||||||
|
addr_match = re.search(
|
||||||
|
r"(?:MME|M\.|MR|Madame|Monsieur)\s+[A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\s\-]+\n((?:.*\n){1,4}?\d{5}\s+[A-Z][A-Z\s\-]+)",
|
||||||
|
text[:3000],
|
||||||
|
)
|
||||||
|
if addr_match:
|
||||||
|
result["patient"]["adresse"] = addr_match.group(1).strip()
|
||||||
|
|
||||||
|
# "née le DD/MM/YYYY" ou "né le DD/MM/YYYY"
|
||||||
|
m = re.search(r"n[ée]+\s+le\s+(\d{2}/\d{2}/\d{4})", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["date_naissance"] = m.group(1)
|
||||||
|
|
||||||
|
# Sexe depuis le titre
|
||||||
|
if re.search(r"\bMME\b", text[:2000]):
|
||||||
|
result["patient"]["sexe"] = "F"
|
||||||
|
elif re.search(r"\b(?:M\.|MR)\b", text[:2000]):
|
||||||
|
result["patient"]["sexe"] = "M"
|
||||||
|
|
||||||
|
# "Votre patiente" / "Votre patient"
|
||||||
|
if "patiente" in text[:3000].lower():
|
||||||
|
result["patient"]["sexe"] = "F"
|
||||||
|
elif "patient" in text[:3000].lower():
|
||||||
|
result["patient"].setdefault("sexe", "M")
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_sejour_info(text: str, result: dict) -> None:
|
||||||
|
"""Extrait les dates et motif de séjour."""
|
||||||
|
# "du DD/MM/YYYY au DD/MM/YYYY"
|
||||||
|
m = re.search(
|
||||||
|
r"du\s+(\d{2}/\d{2}/\d{4})\s+au\s+(\d{2}/\d{2}/\d{4})", text
|
||||||
|
)
|
||||||
|
if m:
|
||||||
|
result["sejour"]["date_entree"] = m.group(1)
|
||||||
|
result["sejour"]["date_sortie"] = m.group(2)
|
||||||
|
|
||||||
|
# "pour le motif suivant:" ou "pour le motif suivant :\n..."
|
||||||
|
m = re.search(
|
||||||
|
r"pour\s+le\s+motif\s+suivant\s*[:\s]*\n?(.*?)(?:\n\n|\.\s+[A-Z])",
|
||||||
|
text,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
if m:
|
||||||
|
result["sejour"]["motif"] = m.group(1).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_medecins(text: str, result: dict) -> None:
|
||||||
|
"""Extrait les noms de médecins mentionnés."""
|
||||||
|
# "Dr NOM" ou "DR NOM" ou "Dr. NOM" ou "Docteur NOM" ou "Dr F. NOM"
|
||||||
|
for m in re.finditer(
|
||||||
|
r"(?:Dr\.?|DR\.?|Docteur)\s+(?:[A-Z]\.\s+)?([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\-]+(?:\s+[A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\-]+)?)",
|
||||||
|
text,
|
||||||
|
):
|
||||||
|
name = m.group(1).strip()
|
||||||
|
if name not in result["medecins"] and len(name) > 2:
|
||||||
|
result["medecins"].append(name)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_medical_content(text: str, result: dict) -> None:
    """Extract the main medical narrative and named sections from a CRH.

    Fills result["contenu_medical"] with the free-text body and
    result["sections"] with any recognised section (motive, antecedents,
    history, clinical exam, conclusion, discharge treatment, outcome).
    """
    # Main body: everything between the admission motive and the sign-off
    # ("Rédigé par" / closing formula / trailing "Dr X").
    m = re.search(
        r"(?:motif\s+suivant\s*[:\s]*\n?)(.*?)(?:Rédigé par|Cordialement|Confraternellement|Dr\s+\w+\s*$)",
        text,
        re.DOTALL,
    )
    if m:
        result["contenu_medical"] = m.group(1).strip()
    else:
        # Fallback: take everything after the "Mon cher confrère" salutation.
        m = re.search(
            r"Mon cher confrère,?\s*\n(.*?)(?:Rédigé par|$)",
            text,
            re.DOTALL,
        )
        if m:
            result["contenu_medical"] = m.group(1).strip()

    # Named sections: each pattern captures lazily up to the next known
    # heading (lookahead) or the end of the document.
    section_patterns = [
        ("motif_hospitalisation", r"(?:motif\s+(?:d'hospitalisation|suivant))\s*[:\s]*\n?(.*?)(?=\n\s*(?:Antécédents|Histoire|Examen|Au total|Devenir|TTT)|$)"),
        ("antecedents", r"(?:Antécédents?)\s*[:\s]*\n?(.*?)(?=\n\s*(?:Histoire|Examen|Traitement|Au total|Devenir)|$)"),
        ("histoire_maladie", r"(?:Histoire de la maladie)\s*[:\s]*\n?(.*?)(?=\n\s*(?:Examen|Biologie|Au total|Devenir)|$)"),
        ("examen_clinique", r"(?:Examen clinique)\s*[:\s]*\n?(.*?)(?=\n\s*(?:Biologie|Imagerie|Au total|Devenir)|$)"),
        ("conclusion", r"(?:Au total|Conclusion)\s*[:\s]*\n?(.*?)(?=\n\s*(?:Devenir|TTT|Traitement)|$)"),
        ("traitement_sortie", r"(?:TTT de sortie|Traitement de sortie)\s*[:\s]*\n?(.*?)(?=\n\s*(?:Devenir|Rédigé|Cordialement)|$)"),
        ("devenir", r"(?:Devenir)\s*[:\s]*\n?(.*?)(?=\n\s*(?:TTT|Traitement|Rédigé|Cordialement)|$)"),
    ]

    for key, pattern in section_patterns:
        m = re.search(pattern, text, re.DOTALL | re.IGNORECASE)
        if m:
            result["sections"][key] = m.group(1).strip()
|
||||||
45
src/extraction/document_classifier.py
Normal file
45
src/extraction/document_classifier.py
Normal file
@@ -0,0 +1,45 @@
|
|||||||
|
"""Détection du type de document : CRH vs Trackare."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
|
||||||
|
def classify(text: str) -> str:
    """Classify an extracted document as CRH or Trackare.

    Returns "crh" or "trackare".
    """
    head = text[:3000].lower()

    trackare_markers = (
        "dossier patient",
        "détails des patients",
        "détails épisode",
        "liste des contacts",
        "notes paramédicales",
        "signes vitaux",
        "traitements médicamenteux",
        "observations médicales",
    )
    crh_markers = (
        "mon cher confrère",
        "cher confrère",
        "compte rendu d'hospitalisation",
        "compte-rendu",
        "service de gastro",
        "pôle spécialités",
        "votre patient",
    )

    trackare_score = sum(marker in head for marker in trackare_markers)
    crh_score = sum(marker in head for marker in crh_markers)

    if trackare_score >= 2:
        return "trackare"
    if crh_score >= 2:
        return "crh"

    # Heuristic: Trackare exports contain table fields with IPP / episode.
    if "ipp:" in head or "episode no:" in head:
        return "trackare"

    return "crh"
|
||||||
36
src/extraction/pdf_extractor.py
Normal file
36
src/extraction/pdf_extractor.py
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
"""Extraction de texte et tableaux depuis les PDF via pdfplumber."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
import pdfplumber
|
||||||
|
|
||||||
|
|
||||||
|
def extract_text(pdf_path: str | Path) -> str:
    """Extract the text of every page of a PDF, pages joined by blank lines."""
    with pdfplumber.open(pdf_path) as pdf:
        # Pages without extractable text yield None -> normalise to "".
        return "\n\n".join(page.extract_text() or "" for page in pdf.pages)
|
||||||
|
|
||||||
|
|
||||||
|
def extract_pages(pdf_path: str | Path) -> list[str]:
    """Extract the text page by page (empty string for text-less pages)."""
    with pdfplumber.open(pdf_path) as pdf:
        return [page.extract_text() or "" for page in pdf.pages]
|
||||||
|
|
||||||
|
|
||||||
|
def extract_tables(pdf_path: str | Path) -> list[list[list[str | None]]]:
    """Extract every table detected in the PDF, across all pages."""
    collected: list[list[list[str | None]]] = []
    with pdfplumber.open(pdf_path) as pdf:
        for page in pdf.pages:
            # extract_tables may return None on pages without tables.
            collected.extend(page.extract_tables() or [])
    return collected
|
||||||
419
src/extraction/trackare_parser.py
Normal file
419
src/extraction/trackare_parser.py
Normal file
@@ -0,0 +1,419 @@
|
|||||||
|
"""Parsing des exports Trackare (dossier patient complet)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def parse_trackare(text: str) -> dict:
    """Parse a Trackare export (full patient record) into structured sections."""
    parsed: dict = {
        "type": "trackare",
        "patient": {},
        "sejour": {},
        "contacts": [],
        "medecins": [],
        "urgences": {},
        "observations_medicales": [],
        "notes_paramedicales": [],
        "signes_vitaux": {},
        "diagnostics": [],
        "traitements": [],
        "contenu_medical": "",
    }

    # Each extractor fills its own part of the result dict in place.
    for extractor in (
        _extract_patient_info,
        _extract_sejour_info,
        _extract_contacts,
        _extract_medecins,
        _extract_urgences,
        _extract_observations,
        _extract_notes_param,
        _extract_diagnostics,
        _extract_traitements,
        _extract_vitals,
    ):
        extractor(text, parsed)
    # Runs last: aggregates the pieces collected above.
    _build_medical_content(parsed)

    return parsed
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_patient_info(text: str, result: dict) -> None:
|
||||||
|
"""Extrait les informations du bloc 'Détails des patients'."""
|
||||||
|
# Nom de naissance
|
||||||
|
m = re.search(r"Nom de naissance:\s*(\S+)", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["nom_naissance"] = m.group(1).strip()
|
||||||
|
|
||||||
|
# Nom et Prénom
|
||||||
|
m = re.search(r"Nom et Prénom:\s*(.+?)(?:\s+Date de naissance|\n)", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["nom_prenom"] = m.group(1).strip()
|
||||||
|
|
||||||
|
# IPP
|
||||||
|
m = re.search(r"IPP:\s*(\d+)", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["ipp"] = m.group(1)
|
||||||
|
|
||||||
|
# Date de naissance
|
||||||
|
m = re.search(r"Date de naissance:\s*(\d{2}/\d{2}/\d{4})", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["date_naissance"] = m.group(1)
|
||||||
|
|
||||||
|
# Sexe
|
||||||
|
m = re.search(r"Sexe:\s*(\S+)", text)
|
||||||
|
if m:
|
||||||
|
sexe_raw = m.group(1).strip().lower()
|
||||||
|
result["patient"]["sexe"] = "F" if "fém" in sexe_raw else "M"
|
||||||
|
|
||||||
|
# Lieu de naissance
|
||||||
|
m = re.search(r"Lieu de naissance:\s*(.+?)(?:\n|$)", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["lieu_naissance"] = m.group(1).strip()
|
||||||
|
|
||||||
|
# Adresse
|
||||||
|
m = re.search(r"Adresse:\s*(.+?)(?:\s+Ville de résidence|\n)", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["adresse"] = m.group(1).strip()
|
||||||
|
|
||||||
|
# Code postal et ville
|
||||||
|
m = re.search(r"Code Postal:\s*(\d{5})", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["code_postal"] = m.group(1)
|
||||||
|
m = re.search(r"Ville de résidence:\s*(.+?)(?:\n|$)", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["ville"] = m.group(1).strip()
|
||||||
|
|
||||||
|
# Taille, Poids, IMC (footer)
|
||||||
|
m = re.search(r"Taille:\s*(\d+)\s*cm", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["taille_cm"] = int(m.group(1))
|
||||||
|
m = re.search(r"Poids:\s*([\d.]+)\s*kg", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["poids_kg"] = float(m.group(1))
|
||||||
|
m = re.search(r"IMC:\s*([\d.]+)", text)
|
||||||
|
if m:
|
||||||
|
result["patient"]["imc"] = float(m.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_sejour_info(text: str, result: dict) -> None:
|
||||||
|
"""Extrait les détails de l'épisode."""
|
||||||
|
m = re.search(r"Episode No:\s*(\d+)", text)
|
||||||
|
if m:
|
||||||
|
result["sejour"]["episode"] = m.group(1)
|
||||||
|
|
||||||
|
m = re.search(r"Date d'admission:\s*(\d{2}/\d{2}/\d{4})", text)
|
||||||
|
if m:
|
||||||
|
result["sejour"]["date_entree"] = m.group(1)
|
||||||
|
|
||||||
|
m = re.search(r"Heure d'admission:\s*(\d{2}:\d{2})", text)
|
||||||
|
if m:
|
||||||
|
result["sejour"]["heure_entree"] = m.group(1)
|
||||||
|
|
||||||
|
m = re.search(r"Date de sortie:\s*(\d{2}/\d{2}/\d{4})", text)
|
||||||
|
if m:
|
||||||
|
result["sejour"]["date_sortie"] = m.group(1)
|
||||||
|
|
||||||
|
m = re.search(r"Heure de sortie:\s*(\d{2}:\d{2})", text)
|
||||||
|
if m:
|
||||||
|
result["sejour"]["heure_sortie"] = m.group(1)
|
||||||
|
|
||||||
|
m = re.search(r"Localisation:\s*(.+?)(?:\s+Médecin courant|\n)", text)
|
||||||
|
if m:
|
||||||
|
result["sejour"]["service"] = m.group(1).strip()
|
||||||
|
|
||||||
|
m = re.search(r"Médecin courant:\s*(.+?)(?:\n|$)", text)
|
||||||
|
if m:
|
||||||
|
result["sejour"]["medecin_courant"] = m.group(1).strip()
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_contacts(text: str, result: dict) -> None:
|
||||||
|
"""Extrait la liste des contacts."""
|
||||||
|
# Bloc "Liste des contacts"
|
||||||
|
contact_block = re.search(
|
||||||
|
r"Liste des contacts\n(.*?)(?=Passage aux Urgences|Signes Vitaux|Observations médicales)",
|
||||||
|
text,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
if not contact_block:
|
||||||
|
return
|
||||||
|
|
||||||
|
block = contact_block.group(1)
|
||||||
|
# Chaque ligne de contact contient relation, nom, prénom, tél
|
||||||
|
for line in block.split("\n"):
|
||||||
|
line = line.strip()
|
||||||
|
if not line or line.startswith("Type de contact") or line.startswith("Tél"):
|
||||||
|
continue
|
||||||
|
# Chercher les noms et téléphones
|
||||||
|
tel_match = re.search(r"(\d{2}[.\-\s]\d{2}[.\-\s]\d{2}[.\-\s]\d{2}[.\-\s]\d{2})", line)
|
||||||
|
if tel_match or re.search(r"(?:Epoux|Époux|Épouse|Conjoint|Père|Mère|Fils|Fille|Frère|Soeur)", line, re.IGNORECASE):
|
||||||
|
result["contacts"].append(line)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_medecins(text: str, result: dict) -> None:
    """Collect unique physician/caregiver names found in the export.

    Candidates come from "Dr/Docteur" mentions, note authors, triage (IAO)
    lines and split first-name/surname IDE-note layouts; each candidate is
    cleaned by _clean_person_name and de-duplicated case-insensitively.
    """
    already_seen: set[str] = set()

    def _register(raw_name: str) -> None:
        cleaned = _clean_person_name(raw_name)
        if cleaned and len(cleaned) > 2 and cleaned.lower() not in already_seen:
            already_seen.add(cleaned.lower())
            result["medecins"].append(cleaned)

    # Patterns whose first capture group is a candidate name, in the same
    # priority order as the sections appear in the export.
    single_group_patterns = [
        # "DR. Prénom NOM", "Dr NOM" or "Docteur NOM Prénom"
        r"(?:DR\.?|Dr\.?|Docteur)\s+([A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+(?:\s+[A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\-]+){0,2})",
        # Observation authors: "Note d'évolution NOM Prénom DD/MM/YYYY",
        # possibly wrapped over two lines ("Note IDE Prénom\nNOM DD/MM/YYYY")
        r"(?:Note d'évolution|Note IDE|Histoire de la maladie|Conclusion Obs\.?\s*médicales?)\s+"
        r"(?:DR\.?\s+)?"
        r"([A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+(?:[\s\n]+[A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+)*?)"
        r"\s+\d{2}/\d{2}/\d{4}",
        # Physician in charge / medical-decision physician
        r"(?:Médecin de (?:la )?(?:prise en charge|décision)\s+médicale)\s+"
        r"([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+(?:\s+[A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+){0,2})",
        # Triage nurse: "IAO NOM Prénom"
        r"IAO\s+([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+(?:\s+[A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+){0,2})",
    ]
    for pattern in single_group_patterns:
        for m in re.finditer(pattern, text):
            _register(m.group(1))

    # First name alone on the line before "DD/MM/YYYY ... Note IDE ...\nNOM HH:MM"
    # e.g. "Argitxu 02/03/2023\nNote IDE ...\nHIRIGOYEN 14:05"
    for m in re.finditer(
        r"([A-ZÉÈÊËÀÂ][a-zéèêëàâäùûüôöîïç]+)\s+\d{2}/\d{2}/\d{4}[^\n]*\n\s*Note IDE[^\n]*\n\s*([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂa-zéèêëàâ\-]+)\s+\d{2}:\d{2}",
        text,
    ):
        _register(f"{m.group(1)} {m.group(2)}")
|
||||||
|
|
||||||
|
|
||||||
|
# Mots qui ne sont pas des noms de personnes
|
||||||
|
_NOT_NAMES = {
|
||||||
|
"non", "pas", "une", "des", "les", "par", "sur", "pour", "dans",
|
||||||
|
"avec", "sans", "qui", "que", "est", "sont", "date", "heure",
|
||||||
|
"cholecystectomie", "cholécystectomie", "cholangiographie",
|
||||||
|
"complication", "vasculaire", "nécessaire", "donc", "note",
|
||||||
|
"douleurs", "absence", "douleur", "lotissement", "priorité",
|
||||||
|
"prescriptions", "technique", "alimentaire", "signé", "réalisé",
|
||||||
|
"selles", "covid", "devenir", "algique", "normal", "regime",
|
||||||
|
"reprise", "biprofenid", "orale", "gelule", "comprime",
|
||||||
|
"glyc", "inj", "lipase", "protéines", "ionogramme",
|
||||||
|
"créatinine", "glucose", "num", "crp", "ta", "bilirubine",
|
||||||
|
"tp", "tca", "bh", "bs", "sortie", "transfert",
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
|
def _clean_person_name(raw: str) -> str:
|
||||||
|
"""Nettoie un nom extrait en supprimant le texte parasite."""
|
||||||
|
name = re.sub(r"\n+", " ", raw).strip()
|
||||||
|
parts = name.split()
|
||||||
|
clean: list[str] = []
|
||||||
|
for part in parts:
|
||||||
|
p = part.strip(".-")
|
||||||
|
if not p:
|
||||||
|
continue
|
||||||
|
if p.lower() in _NOT_NAMES:
|
||||||
|
break
|
||||||
|
# Un mot-nom : commence par une majuscule
|
||||||
|
if re.match(r"^[A-ZÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ]", p):
|
||||||
|
clean.append(p)
|
||||||
|
else:
|
||||||
|
break
|
||||||
|
result = " ".join(clean).strip()
|
||||||
|
# Rejeter si un seul mot de 1-2 lettres (initiale)
|
||||||
|
if len(result) <= 2:
|
||||||
|
return ""
|
||||||
|
return result
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_urgences(text: str, result: dict) -> None:
|
||||||
|
"""Extrait les données du passage aux urgences."""
|
||||||
|
urg_block = re.search(
|
||||||
|
r"Passage aux Urgences\n(.*?)(?=Signes Vitaux|Observations médicales|Antécédents)",
|
||||||
|
text,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
if not urg_block:
|
||||||
|
return
|
||||||
|
|
||||||
|
block = urg_block.group(1)
|
||||||
|
|
||||||
|
m = re.search(r"Mode de transport.*?:\s*(.+)", block)
|
||||||
|
if m:
|
||||||
|
result["urgences"]["mode_transport"] = m.group(1).strip()
|
||||||
|
|
||||||
|
m = re.search(r"Mode d'entrée\s+(.+)", block)
|
||||||
|
if m:
|
||||||
|
result["urgences"]["mode_entree"] = m.group(1).strip()
|
||||||
|
|
||||||
|
m = re.search(r"Priorité\s+(Priorité \d)", block)
|
||||||
|
if m:
|
||||||
|
result["urgences"]["priorite"] = m.group(1)
|
||||||
|
|
||||||
|
# Motifs de prise en charge
|
||||||
|
motifs = re.findall(
|
||||||
|
r"Motif de prise en charge\s+(.+?)(?=\n(?:Observ\.|Médecin|Date|IAO))",
|
||||||
|
block,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
if motifs:
|
||||||
|
result["urgences"]["motifs"] = [
|
||||||
|
line.strip()
|
||||||
|
for motif in motifs
|
||||||
|
for line in motif.split("\n")
|
||||||
|
if line.strip()
|
||||||
|
]
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_observations(text: str, result: dict) -> None:
|
||||||
|
"""Extrait les observations médicales."""
|
||||||
|
obs_block = re.search(
|
||||||
|
r"Observations médicales\n(.*?)(?=Notes paramédicales|Surveillance Psychiatrie|Traitements médicamenteux|$)",
|
||||||
|
text,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
if not obs_block:
|
||||||
|
return
|
||||||
|
|
||||||
|
block = obs_block.group(1)
|
||||||
|
|
||||||
|
# Découper par type d'observation
|
||||||
|
entries = re.split(
|
||||||
|
r"(Note d'évolution|Conclusion Obs\.\s*médicales|Histoire de la maladie)",
|
||||||
|
block,
|
||||||
|
)
|
||||||
|
|
||||||
|
i = 1
|
||||||
|
while i < len(entries) - 1:
|
||||||
|
obs_type = entries[i].strip()
|
||||||
|
content = entries[i + 1].strip()
|
||||||
|
|
||||||
|
# Extraire auteur et date
|
||||||
|
m = re.match(
|
||||||
|
r"(?:DR\.?\s+)?([A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+(?:\s+[A-ZÉÈÊËÀÂa-zéèêëàâ\.\-]+)*)\s+(\d{2}/\d{2}/\d{4})\s+(\d{2}:\d{2})\s*(.*)",
|
||||||
|
content,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
if m:
|
||||||
|
result["observations_medicales"].append({
|
||||||
|
"type": obs_type,
|
||||||
|
"auteur": m.group(1).strip(),
|
||||||
|
"date": m.group(2),
|
||||||
|
"heure": m.group(3),
|
||||||
|
"contenu": m.group(4).strip(),
|
||||||
|
})
|
||||||
|
else:
|
||||||
|
result["observations_medicales"].append({
|
||||||
|
"type": obs_type,
|
||||||
|
"contenu": content,
|
||||||
|
})
|
||||||
|
i += 2
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_notes_param(text: str, result: dict) -> None:
|
||||||
|
"""Extrait les notes paramédicales."""
|
||||||
|
notes_block = re.search(
|
||||||
|
r"Notes paramédicales\n(.*?)(?=Traitements médicamenteux|Surveillance|$)",
|
||||||
|
text,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
if not notes_block:
|
||||||
|
return
|
||||||
|
|
||||||
|
block = notes_block.group(1)
|
||||||
|
for m in re.finditer(
|
||||||
|
r"Note IDE\s+([A-Za-zéèêëàâäùûüôöîïçÉÈÊËÀÂÄÙÛÜÔÖÎÏÇ\.\-\s]+?)\s+(\d{2}/\d{2}/\d{4})\s+(\d{2}:\d{2})\s+(.*?)(?=Note IDE|$)",
|
||||||
|
block,
|
||||||
|
re.DOTALL,
|
||||||
|
):
|
||||||
|
result["notes_paramedicales"].append({
|
||||||
|
"auteur": m.group(1).strip(),
|
||||||
|
"date": m.group(2),
|
||||||
|
"heure": m.group(3),
|
||||||
|
"contenu": m.group(4).strip(),
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_diagnostics(text: str, result: dict) -> None:
|
||||||
|
"""Extrait les diagnostics codés."""
|
||||||
|
# "Principal actif CODE DESCRIPTION"
|
||||||
|
for m in re.finditer(
|
||||||
|
r"(Principal|Associé|Significatif)\s+(actif|inactif)\s+([A-Z]\d{2}(?:\.\d{1,2})?)\s+(.+?)(?:\s+\[.*?\])?\s+\d{2}/\d{2}/\d{4}",
|
||||||
|
text,
|
||||||
|
):
|
||||||
|
result["diagnostics"].append({
|
||||||
|
"type": m.group(1),
|
||||||
|
"statut": m.group(2),
|
||||||
|
"code_cim10": m.group(3),
|
||||||
|
"libelle": m.group(4).strip(),
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_traitements(text: str, result: dict) -> None:
|
||||||
|
"""Extrait les traitements médicamenteux."""
|
||||||
|
ttt_block = re.search(
|
||||||
|
r"Traitements médicamenteux\n(.*?)$",
|
||||||
|
text,
|
||||||
|
re.DOTALL,
|
||||||
|
)
|
||||||
|
if not ttt_block:
|
||||||
|
return
|
||||||
|
|
||||||
|
block = ttt_block.group(1)
|
||||||
|
# Chercher les noms de médicaments (en majuscules)
|
||||||
|
for m in re.finditer(
|
||||||
|
r"([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂ0-9\s\-/%.,'`]+(?:MG|ML|SOL|INJ|CPR|GEL|AMP|POCHE)[A-ZÉÈÊËÀÂ0-9\s\-/%.,'`\(\)\[\]]*)\s+([\d\s]+\s*(?:mg|G|GEL|CPR|AMP|ML)?)\s*[-–]\s*(.+?)(?=\n[A-Z]|\Z)",
|
||||||
|
block,
|
||||||
|
re.DOTALL,
|
||||||
|
):
|
||||||
|
result["traitements"].append({
|
||||||
|
"medicament": m.group(1).strip(),
|
||||||
|
"dose": m.group(2).strip(),
|
||||||
|
"frequence": m.group(3).strip().split("\n")[0],
|
||||||
|
})
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_vitals(text: str, result: dict) -> None:
|
||||||
|
"""Extrait les données anthropométriques clés."""
|
||||||
|
m = re.search(r"Taille \[cm\]\s+([\d.]+)", text)
|
||||||
|
if m:
|
||||||
|
result["signes_vitaux"]["taille_cm"] = float(m.group(1))
|
||||||
|
m = re.search(r"Poids \[kg\]\s+([\d.]+)", text)
|
||||||
|
if m:
|
||||||
|
result["signes_vitaux"]["poids_kg"] = float(m.group(1))
|
||||||
|
m = re.search(r"Indice\s*\n?\s*de masse\s+([\d.]+)", text)
|
||||||
|
if m:
|
||||||
|
result["signes_vitaux"]["imc"] = float(m.group(1))
|
||||||
|
|
||||||
|
|
||||||
|
def _build_medical_content(result: dict) -> None:
|
||||||
|
"""Construit le texte médical complet à partir des observations."""
|
||||||
|
parts: list[str] = []
|
||||||
|
|
||||||
|
if result["urgences"].get("motifs"):
|
||||||
|
parts.append("Motifs: " + ", ".join(result["urgences"]["motifs"]))
|
||||||
|
|
||||||
|
for obs in result["observations_medicales"]:
|
||||||
|
parts.append(obs.get("contenu", ""))
|
||||||
|
|
||||||
|
for note in result["notes_paramedicales"]:
|
||||||
|
parts.append(note.get("contenu", ""))
|
||||||
|
|
||||||
|
result["contenu_medical"] = "\n\n".join(parts)
|
||||||
184
src/main.py
Normal file
184
src/main.py
Normal file
@@ -0,0 +1,184 @@
|
|||||||
|
"""CLI + orchestrateur du pipeline d'anonymisation et extraction CIM-10."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import sys
|
||||||
|
from pathlib import Path
|
||||||
|
|
||||||
|
from .anonymization.anonymizer import Anonymizer
|
||||||
|
from .config import ANONYMIZED_DIR, REPORTS_DIR, STRUCTURED_DIR, AnonymizationReport, DossierMedical
|
||||||
|
from .extraction.document_classifier import classify
|
||||||
|
from .extraction.crh_parser import parse_crh
|
||||||
|
from .extraction.pdf_extractor import extract_text
|
||||||
|
from .extraction.trackare_parser import parse_trackare
|
||||||
|
from .medical.cim10_extractor import extract_medical_info
|
||||||
|
|
||||||
|
# Root logging configuration for the CLI: timestamped INFO-level lines.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(name)s: %(message)s",
)
logger = logging.getLogger(__name__)

# Global flag used to disable edsnlp (set to False by the --no-edsnlp option)
_use_edsnlp = True
|
||||||
|
|
||||||
|
|
||||||
|
def process_pdf(pdf_path: Path) -> tuple[str, DossierMedical, AnonymizationReport]:
    """Process one PDF end to end: text extraction → parsing → anonymisation → CIM-10.

    Args:
        pdf_path: path of the PDF to process.

    Returns:
        (anonymised text, structured medical record, anonymisation report);
        ``source_file`` is filled in on both the record and the report.
    """
    logger.info("Traitement de %s", pdf_path.name)

    # 1. Raw text extraction
    raw_text = extract_text(pdf_path)
    logger.info(" Texte extrait : %d caractères", len(raw_text))

    # 2. Document classification (Trackare export vs CRH letter)
    doc_type = classify(raw_text)
    logger.info(" Type de document : %s", doc_type)

    # 3. Structure-aware parsing, dispatched on the detected type
    if doc_type == "trackare":
        parsed = parse_trackare(raw_text)
    else:
        parsed = parse_crh(raw_text)

    # 4. Anonymisation (regex + NER + sweep), seeded with the parsed identities
    anonymizer = Anonymizer(parsed_data=parsed)
    anonymized_text = anonymizer.anonymize(raw_text)
    report = anonymizer.report
    report.source_file = pdf_path.name
    logger.info(
        " Anonymisation : %d remplacements (regex=%d, ner=%d, sweep=%d)",
        report.total_replacements,
        report.regex_replacements,
        report.ner_replacements,
        report.sweep_replacements,
    )

    # 5. Optional edsnlp analysis — runs on the anonymised text only
    edsnlp_result = None
    if _use_edsnlp:
        edsnlp_result = _run_edsnlp(anonymized_text)

    # 6. CIM-10 medical extraction (regex patterns enriched by edsnlp)
    dossier = extract_medical_info(parsed, anonymized_text, edsnlp_result)
    dossier.source_file = pdf_path.name
    dossier.document_type = doc_type
    logger.info(" DP : %s", dossier.diagnostic_principal)
    logger.info(" DAS : %d, Actes : %d", len(dossier.diagnostics_associes), len(dossier.actes_ccam))

    return anonymized_text, dossier, report
|
||||||
|
|
||||||
|
|
||||||
|
def _run_edsnlp(text: str):
    """Run the edsnlp analysis on `text`, with a graceful fallback.

    Returns the analysis result, or None when edsnlp is not installed,
    not available, or raises during analysis (the pipeline then relies
    on the regex-only extraction).
    """
    try:
        # Imported lazily: edsnlp is an optional, heavy dependency
        from .medical.edsnlp_pipeline import analyze, is_available

        if not is_available():
            logger.info(" edsnlp non disponible, utilisation du mode regex seul")
            return None
        analysis = analyze(text)
        logger.info(
            " edsnlp : %d CIM-10, %d médicaments, %d dates",
            len(analysis.cim10_entities),
            len(analysis.drug_entities),
            len(analysis.date_entities),
        )
        return analysis
    except Exception:
        # Any failure (import, model load, analysis) falls back to regex mode
        logger.warning(" edsnlp : erreur lors de l'analyse, fallback regex", exc_info=True)
        return None
|
||||||
|
|
||||||
|
|
||||||
|
def write_outputs(
    stem: str,
    anonymized_text: str,
    dossier: DossierMedical,
    report: AnonymizationReport,
) -> None:
    """Write the three output artefacts for one processed document.

    Produces, under the configured output directories:
    the anonymised text, the structured CIM-10 JSON record, and the
    anonymisation audit report.
    """
    # (destination path, serialised payload), written in this order
    artefacts = [
        # Anonymised plain text
        (ANONYMIZED_DIR / f"{stem}_anonymized.txt", anonymized_text),
        # Structured CIM-10 record (None fields omitted)
        (STRUCTURED_DIR / f"{stem}_cim10.json",
         dossier.model_dump_json(indent=2, exclude_none=True)),
        # Anonymisation audit report
        (REPORTS_DIR / f"{stem}_report.json", report.model_dump_json(indent=2)),
    ]
    for path, payload in artefacts:
        path.write_text(payload, encoding="utf-8")
        logger.info(" → %s", path)
|
||||||
|
|
||||||
|
|
||||||
|
def main(input_path: str | None = None) -> None:
    """CLI entry point: parse arguments, then process every PDF found.

    Args:
        input_path: optional default used when the positional CLI argument
            is omitted (falls back to "input/").
    """
    global _use_edsnlp

    parser = argparse.ArgumentParser(
        description="Anonymisation de documents médicaux PDF et extraction CIM-10",
    )
    parser.add_argument(
        "input",
        nargs="?",
        default=input_path or "input/",
        help="Chemin vers un PDF ou un dossier de PDFs (défaut: input/)",
    )
    parser.add_argument(
        "--no-ner",
        action="store_true",
        help="Désactiver la phase NER (plus rapide, moins précis)",
    )
    parser.add_argument(
        "--no-edsnlp",
        action="store_true",
        help="Désactiver l'analyse edsnlp (mode regex seul)",
    )
    args = parser.parse_args()

    if args.no_ner:
        # Monkey-patch to disable NER: the anonymizer then sees no person entities
        from .anonymization import ner_anonymizer
        ner_anonymizer.extract_person_entities = lambda text: []

    if args.no_edsnlp:
        _use_edsnlp = False

    # Accept either a single PDF file or a directory of PDFs
    input_p = Path(args.input)
    if input_p.is_file():
        pdfs = [input_p]
    elif input_p.is_dir():
        pdfs = sorted(input_p.glob("*.pdf"))
    else:
        logger.error("Chemin introuvable : %s", input_p)
        sys.exit(1)

    if not pdfs:
        logger.warning("Aucun PDF trouvé dans %s", input_p)
        sys.exit(0)

    logger.info("Traitement de %d PDF(s)...", len(pdfs))

    for pdf_path in pdfs:
        try:
            anonymized_text, dossier, report = process_pdf(pdf_path)
            stem = pdf_path.stem.replace(" ", "_")
            write_outputs(stem, anonymized_text, dossier, report)
        except Exception:
            # One failing document must not abort the whole batch
            logger.exception("Erreur lors du traitement de %s", pdf_path.name)

    logger.info("Terminé.")
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
0
src/medical/__init__.py
Normal file
0
src/medical/__init__.py
Normal file
606
src/medical/cim10_extractor.py
Normal file
606
src/medical/cim10_extractor.py
Normal file
@@ -0,0 +1,606 @@
|
|||||||
|
"""Extraction d'informations médicales structurées pour le codage CIM-10."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import re
|
||||||
|
from datetime import datetime
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
from ..config import (
|
||||||
|
ActeCCAM,
|
||||||
|
BiologieCle,
|
||||||
|
Diagnostic,
|
||||||
|
DossierMedical,
|
||||||
|
Imagerie,
|
||||||
|
Sejour,
|
||||||
|
Traitement,
|
||||||
|
)
|
||||||
|
|
||||||
|
try:
|
||||||
|
from .edsnlp_pipeline import EdsnlpResult
|
||||||
|
except ImportError:
|
||||||
|
EdsnlpResult = None # type: ignore[assignment,misc]
|
||||||
|
|
||||||
|
# Frequent diagnosis phrases → suggested CIM-10 codes.
# Keys are lowercase (they are matched against lowercased text); accented
# and unaccented spellings are both listed because source text varies.
CIM10_MAP: dict[str, str] = {
    # Pancreatitis
    "pancréatite aiguë biliaire": "K85.1",
    "pancréatite aigue biliaire": "K85.1",
    "pancréatite aiguë lithiasique": "K85.1",
    "pancréatite aigue lithiasique": "K85.1",
    "pancréatite aiguë": "K85.9",
    "pancréatite aigue": "K85.9",
    "pancréatite": "K85.9",
    # Biliary stones
    "lithiase cholédoque": "K80.5",
    "lithiase du cholédoque": "K80.5",
    "calcul des canaux biliaires": "K80.5",
    "lithiase vésiculaire": "K80.2",
    "lithiases vésiculaires": "K80.2",
    "vésicule lithiasique": "K80.2",
    "colique hépatique": "K80.2",
    # Cholecystitis / cholangitis
    "cholécystite aiguë": "K81.0",
    "cholecystite aigue": "K81.0",
    "angiocholite": "K83.0",
    # Obesity
    "obésité": "E66.0",
    "obesite": "E66.0",
    "surpoids": "E66.0",
    # Drug reactions
    "éruption médicamenteuse": "L27.0",
    "eruption medicamenteuse": "L27.0",
    "éruption cutanée médicamenteuse": "L27.0",
    "toxidermie": "L27.0",
    "réaction au tramadol": "L27.0",
    "allergie médicamenteuse": "T88.7",
    # Pain
    "douleur abdominale": "R10.4",
    "douleur hypochondre droit": "R10.1",
    # Jaundice
    "ictère": "R17",
    "jaunisse": "R17",
    # Hypertension
    "hypertension artérielle": "I10",
    "hta": "I10",
    # Diabetes
    "diabète type 2": "E11.9",
    "diabète de type 2": "E11.9",
    "diabète type 1": "E10.9",
}

# Procedure phrases → CCAM procedure codes.
CCAM_MAP: dict[str, str] = {
    "cholécystectomie": "HMFC004",
    "cholecystectomie": "HMFC004",
    "cholécystectomie par cœlioscopie": "HMFC004",
    "cholecystectomie par coelioscopie": "HMFC004",
    "cholangiographie": "HHHE002",
    "cholangiographie peropératoire": "HHHE002",
    "cpre": "HHHE002",
    "sphinctérotomie endoscopique": "HHHE003",
    "scanner abdominal": "ZCQK002",
    "tdm abdominal": "ZCQK002",
    "échographie abdominale": "ZCQJ001",
    "echo abdominale": "ZCQJ001",
    "irm abdominale": "ZCQN001",
}
|
||||||
|
|
||||||
|
|
||||||
|
def extract_medical_info(
    parsed_data: dict,
    anonymized_text: str,
    edsnlp_result: Optional[EdsnlpResult] = None,
) -> DossierMedical:
    """Build the structured medical record used for CIM-10 coding.

    Args:
        parsed_data: output of parse_trackare / parse_crh.
        anonymized_text: full anonymised document text.
        edsnlp_result: optional edsnlp analysis used to enrich diagnoses,
            treatments and complications.

    Returns:
        A populated DossierMedical.
    """
    dossier = DossierMedical()
    dossier.document_type = parsed_data.get("type", "")

    # Each helper mutates `dossier` in place; stay data first, then clinical sections.
    _extract_sejour(parsed_data, dossier)
    _extract_diagnostics(parsed_data, anonymized_text, dossier, edsnlp_result)
    _extract_actes(anonymized_text, dossier)
    _extract_antecedents(anonymized_text, dossier)
    _extract_traitements(parsed_data, anonymized_text, dossier, edsnlp_result)
    _extract_biologie(anonymized_text, dossier)
    _extract_imagerie(anonymized_text, dossier)
    _extract_complications(anonymized_text, dossier, edsnlp_result)

    return dossier
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_sejour(parsed: dict, dossier: DossierMedical) -> None:
    """Fill dossier.sejour: demographics, dates, derived age/stay length, vitals."""
    patient = parsed.get("patient", {})
    sejour_data = parsed.get("sejour", {})

    dossier.sejour = Sejour(
        sexe=patient.get("sexe"),
        date_entree=sejour_data.get("date_entree"),
        date_sortie=sejour_data.get("date_sortie"),
        mode_entree=parsed.get("urgences", {}).get("mode_entree"),
    )

    def _parse_date(value: str):
        # Dates in the exports are always DD/MM/YYYY
        return datetime.strptime(value, "%d/%m/%Y")

    # Age at admission, from birth date and entry date
    dob = patient.get("date_naissance")
    date_entree = sejour_data.get("date_entree")
    if dob and date_entree:
        try:
            dob_dt = _parse_date(dob)
            entree_dt = _parse_date(date_entree)
        except ValueError:
            pass
        else:
            age = entree_dt.year - dob_dt.year
            # Birthday not yet reached this year → one year less
            if (entree_dt.month, entree_dt.day) < (dob_dt.month, dob_dt.day):
                age -= 1
            dossier.sejour.age = age

    # Length of stay in whole days
    date_sortie = sejour_data.get("date_sortie")
    if date_entree and date_sortie:
        try:
            dossier.sejour.duree_sejour = (
                _parse_date(date_sortie) - _parse_date(date_entree)
            ).days
        except ValueError:
            pass

    # BMI / weight / height: prefer the vitals table, fall back to patient block
    vitals = parsed.get("signes_vitaux", {})
    for attr, key in (("imc", "imc"), ("poids", "poids_kg"), ("taille", "taille_cm")):
        value = vitals.get(key) or patient.get(key)
        if value:
            setattr(dossier.sejour, attr, value)
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_diagnostics(
    parsed: dict,
    text: str,
    dossier: DossierMedical,
    edsnlp_result: Optional[EdsnlpResult] = None,
) -> None:
    """Extract the principal diagnosis (DP) and associated diagnoses (DAS).

    Priority: diagnoses already coded in the Trackare export, then regex
    patterns on the anonymised text, then edsnlp CIM-10 entities as
    enrichment (negated/hypothetical mentions are excluded).
    """
    text_lower = text.lower()

    # Coded diagnoses from the Trackare export take precedence
    for diag in parsed.get("diagnostics", []):
        d = Diagnostic(
            texte=diag.get("libelle", ""),
            cim10_suggestion=diag.get("code_cim10"),
        )
        if diag.get("type", "").lower() == "principal":
            dossier.diagnostic_principal = d
        else:
            dossier.diagnostics_associes.append(d)

    # Grab the "Au total:" conclusion paragraph, if present
    conclusion = ""
    m = re.search(
        r"Au total\s*[::]?\s*(.*?)(?=\n\s*(?:Devenir|TTT|Sortie|$))",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if m:
        conclusion = m.group(1).strip()

    # edsnlp CIM-10 entities kept as code → mention-text mapping
    edsnlp_codes: dict[str, str] = {}
    if edsnlp_result:
        for ent in edsnlp_result.cim10_entities:
            if not ent.negation and not ent.hypothese:
                edsnlp_codes[ent.code] = ent.texte

    # No DP from the coded export → look for one in the text
    if not dossier.diagnostic_principal:
        # Regex fallback first (more precise for the specific patterns)
        dp = _find_diagnostic_principal(text_lower, conclusion)
        if dp:
            dossier.diagnostic_principal = dp
        elif edsnlp_codes:
            # Otherwise use the first edsnlp CIM-10 entity as DP
            code, texte = next(iter(edsnlp_codes.items()))
            dossier.diagnostic_principal = Diagnostic(
                texte=texte.capitalize(), cim10_suggestion=code,
            )

    # Associated diagnoses found by regex in the text
    das = _find_diagnostics_associes(text_lower, conclusion, dossier)
    dossier.diagnostics_associes.extend(das)

    # Enrich the DAS list with remaining edsnlp entities (deduplicated by code)
    if edsnlp_result:
        existing_codes = set()
        if dossier.diagnostic_principal:
            existing_codes.add(dossier.diagnostic_principal.cim10_suggestion)
        for d in dossier.diagnostics_associes:
            existing_codes.add(d.cim10_suggestion)

        for ent in edsnlp_result.cim10_entities:
            if ent.negation or ent.hypothese:
                continue
            if ent.code not in existing_codes:
                dossier.diagnostics_associes.append(Diagnostic(
                    texte=ent.texte.capitalize(),
                    cim10_suggestion=ent.code,
                ))
                existing_codes.add(ent.code)
|
||||||
|
|
||||||
|
|
||||||
|
def _find_diagnostic_principal(text_lower: str, conclusion: str) -> Diagnostic | None:
    """Locate the principal diagnosis (DP) in free text.

    Search order:
      1. known CIM10_MAP terms inside the "Au total" conclusion;
      2. specific pancreatitis patterns anywhere in the lowercased text,
         most specific first.

    Args:
        text_lower: full document text, already lowercased.
        conclusion: the extracted "Au total" conclusion paragraph (may be "").

    Returns:
        A Diagnostic with a suggested CIM-10 code, or None if nothing matched.
    """
    conclusion_lower = conclusion.lower()

    # The conclusion is the most reliable place for the DP
    for terme, code in CIM10_MAP.items():
        if terme in conclusion_lower:
            return Diagnostic(texte=terme.capitalize(), cim10_suggestion=code)

    # Common DP patterns, most specific first
    dp_patterns = [
        r"pancréatite\s+aigu[eë]\s+(?:d'origine\s+)?lithiasique",
        r"pancréatite\s+aigu[eë]\s+biliaire",
        r"pancréatite\s+aigu[eë]",
    ]
    for pat in dp_patterns:
        # Single search per pattern (the original ran each regex twice:
        # once in the condition and once again to get the match object)
        if (m := re.search(pat, text_lower)) is not None:
            matched = m.group(0)
            return Diagnostic(texte=matched.capitalize(), cim10_suggestion=_lookup_cim10(matched))

    return None
|
||||||
|
|
||||||
|
|
||||||
|
def _find_diagnostics_associes(
    text_lower: str, conclusion: str, dossier: DossierMedical
) -> list[Diagnostic]:
    """Find associated diagnoses (DAS) via targeted regex patterns.

    Args:
        text_lower: Full document text, already lower-cased.
        conclusion: Conclusion section; not inspected by this function
            (kept for signature parity with _find_diagnostic_principal).
        dossier: Current medical record; its existing diagnosis codes are
            read for de-duplication only — the record is not mutated here.

    Returns:
        New Diagnostic entries whose ICD-10 codes are not already present
        on the record.
    """
    das: list[Diagnostic] = []

    # Codes already on the record — never suggest a duplicate.
    existing_codes = set()
    if dossier.diagnostic_principal:
        existing_codes.add(dossier.diagnostic_principal.cim10_suggestion)
    for d in dossier.diagnostics_associes:
        existing_codes.add(d.cim10_suggestion)

    def _add(texte: str, code: str) -> None:
        # Append at most once per code; track it so later patterns
        # cannot re-add the same code.
        if code not in existing_codes:
            das.append(Diagnostic(texte=texte, cim10_suggestion=code))
            existing_codes.add(code)

    # Common bile duct stone
    if re.search(r"lithiase\s+(?:du\s+)?(?:bas\s+)?cholédoque", text_lower):
        _add("Lithiase du cholédoque", "K80.5")

    # Drug-induced skin eruption
    if re.search(r"éruption\s+cutanée|eruption\s+cutanée|toxidermie|réaction\s+au\s+tramadol", text_lower):
        _add("Éruption cutanée médicamenteuse", "L27.0")

    # Obesity (BMI >= 30) — single scan (the original searched twice).
    if (m := re.search(r"imc\s*[:=]?\s*(\d{2,3}[.,]\d+)", text_lower)) is not None:
        imc_val = float(m.group(1).replace(",", "."))
        if imc_val >= 30:
            _add(f"Obésité (IMC {imc_val})", "E66.0")

    # Gallbladder stones
    if re.search(r"vésicule\s+lithiasique|lithiases?\s+vésiculaire", text_lower):
        _add("Lithiase vésiculaire", "K80.2")

    return das
||||||
|
def _extract_actes(text: str, dossier: DossierMedical) -> None:
    """Detect CCAM procedures mentioned in the report and record them.

    Appends one ActeCCAM per detected procedure (cholecystectomy,
    intraoperative cholangiography, abdominal CT) with a suggested CCAM
    code and, when found, the procedure date.
    """
    lowered = text.lower()

    # Cholecystectomy: prefer the laparoscopic wording when present.
    if re.search(r"chol[ée]cystectomie\s+par\s+c[oœ][ea]lioscopie", lowered):
        chole = ("Cholécystectomie par cœlioscopie", r"chol[ée]cystectomie")
    elif re.search(r"chol[ée]cystectomie|cholecystectomie", lowered):
        chole = ("Cholécystectomie", r"chol[ée]cystectomie|cholecystectomie")
    else:
        chole = None
    if chole is not None:
        label, date_pat = chole
        dossier.actes_ccam.append(ActeCCAM(
            texte=label,
            code_ccam_suggestion="HMFC004",
            date=_find_act_date(text, date_pat),
        ))

    # Other procedures: (detection pattern, date pattern, label, CCAM code).
    other_acts = [
        (r"cholangiographie", r"cholangiographie",
         "Cholangiographie peropératoire", "HHHE002"),
        (r"(?:tdm|scanner|tomodensitométrie)", r"(?:TDM|scanner)",
         "TDM abdominal", "ZCQK002"),
    ]
    for detect_pat, date_pat, label, code in other_acts:
        if re.search(detect_pat, lowered):
            dossier.actes_ccam.append(ActeCCAM(
                texte=label,
                code_ccam_suggestion=code,
                date=_find_act_date(text, date_pat),
            ))
|
def _extract_antecedents(text: str, dossier: DossierMedical) -> None:
    """Extract the medical-history ("Antécédents") section into the record.

    Scans *text* for an "Antécédents" block ending at the next known
    section header, then appends each relevant line to
    ``dossier.antecedents``. Lines matching clinical-monitoring noise
    (vitals rows, form labels, empty placeholders) are skipped.
    """
    m = re.search(
        r"Antécédents?\s*[::]?\s*\n?(.*?)(?=\n\s*(?:Traitements?\s*[::]|Allergie|Histoire de la maladie|Examen clinique|\n\n))",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if not m:
        return

    # Substrings identifying non-history lines (case-sensitive, matching
    # the exact labels emitted by the source documents).
    noise_markers = (
        "Item de",
        "Température",
        "Signes Vitaux",
        "Pouls",
        "Type de note",
        "Aucune donnée",
        "renseignée",
        "habitudes de vie",
        "Systolique",
        "Diastolique",
        "Saturation",
    )

    for raw_line in m.group(1).strip().split("\n"):
        line = raw_line.strip().lstrip("- •")
        if not line or len(line) <= 5:
            continue
        if re.match(r"^\d", line):  # measurement rows start with a digit
            continue
        if "surveillance" in line.lower():
            continue
        if any(marker in line for marker in noise_markers):
            continue
        dossier.antecedents.append(line)
|
def _extract_traitements(
    parsed: dict,
    text: str,
    dossier: DossierMedical,
    edsnlp_result: Optional[EdsnlpResult] = None,
) -> None:
    """Extract discharge treatments into ``dossier.traitements_sortie``.

    Args:
        parsed: Parsed document structure (not used by this function).
        text: Full document text.
        dossier: Medical record being populated (mutated in place).
        edsnlp_result: Optional edsnlp analysis used to attach ATC codes.
    """
    # Build an index of edsnlp drug mentions carrying an ATC code,
    # skipping negated mentions.
    drug_atc: dict[str, str] = {}
    if edsnlp_result:
        for drug in edsnlp_result.drug_entities:
            if not drug.negation and drug.code_atc:
                drug_atc[drug.texte.lower()] = drug.code_atc

    # From the text — "TTT de sortie" section (capped to a few lines).
    m = re.search(
        r"(?:TTT|Traitement)\s+de\s+sortie\s*[::]?\s*\n?(.*?)(?=\n\s*(?:Devenir|Rédigé|Cordialement|Patient:|Episode|Le \d{2}/\d{2}|\n\n)|$)",
        text,
        re.DOTALL | re.IGNORECASE,
    )
    if m:
        block = m.group(1).strip()
        lines = block.split("\n")
        for line in lines[:10]:  # at most 10 lines to avoid runaway captures
            line = line.strip().lstrip("- •")
            if not line or len(line) <= 2:
                continue
            # Stop at footer / non-drug lines (page footers repeat patient data).
            if re.match(r"^(Patient|Episode|Le \d|Page|V\d)", line):
                break
            med = line
            poso = None
            # Split the drug name from its posology (dose/schedule suffix).
            poso_match = re.search(r"\s+(si besoin|matin|soir|midi|\d+\s*(?:mg|cp|gel).*)", line, re.IGNORECASE)
            if poso_match:
                med = line[:poso_match.start()].strip()
                poso = poso_match.group(1).strip()
            # Attach an ATC code via the edsnlp index when possible.
            code_atc = _match_drug_atc(med, drug_atc)
            dossier.traitements_sortie.append(Traitement(
                medicament=med,
                posologie=poso,
                code_atc=code_atc,
            ))

    # Fallback: if nothing was found, look for "Presc. de Sortie" rows.
    if not dossier.traitements_sortie:
        for m_presc in re.finditer(
            r"([A-ZÉÈÊËÀÂ][A-ZÉÈÊËÀÂ0-9\s\-/%.]+?)(?:\s+\d+\s*(?:mg|G|CPR|GEL))?.*?Presc\.\s*de\s*Sortie",
            text,
        ):
            med = m_presc.group(1).strip()
            if len(med) > 3:  # discard spurious short captures
                code_atc = _match_drug_atc(med, drug_atc)
                dossier.traitements_sortie.append(Traitement(
                    medicament=med, code_atc=code_atc,
                ))
||||||
|
def _match_drug_atc(med_name: str, drug_atc: dict[str, str]) -> Optional[str]:
|
||||||
|
"""Cherche un code ATC correspondant au médicament dans les résultats edsnlp."""
|
||||||
|
if not drug_atc:
|
||||||
|
return None
|
||||||
|
med_lower = med_name.lower().strip()
|
||||||
|
# Correspondance exacte
|
||||||
|
if med_lower in drug_atc:
|
||||||
|
return drug_atc[med_lower]
|
||||||
|
# Correspondance partielle : le nom edsnlp est contenu dans le nom du médicament
|
||||||
|
for drug_text, atc in drug_atc.items():
|
||||||
|
if drug_text in med_lower or med_lower in drug_text:
|
||||||
|
return atc
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _extract_biologie(text: str, dossier: DossierMedical) -> None:
    """Extract key laboratory results from the report.

    For every known test whose pattern matches, appends a BiologieCle
    entry with the captured value and an abnormality flag computed by
    _is_abnormal.
    """
    patterns_by_test = [
        ("Lipasémie", r"[Ll]ipas[ée]mie\s*(?:[àa=:])?\s*(\d+)"),
        ("CRP", r"CRP\s*[=:à]?\s*(\d+(?:[.,]\d+)?)"),
        ("ASAT", r"ASAT\s*[=:à]?\s*([\d.,]+)\s*(?:N|U/L)?"),
        ("ALAT", r"ALAT\s*[=:à]?\s*([\d.,]+)\s*(?:N|U/L)?"),
        ("GGT", r"GGT\s*[=:à]?\s*(\d+)\s*(?:U/L)?"),
        ("PAL", r"PAL\s*[=:à]?\s*(\d+)\s*(?:U/L)?"),
        ("Bilirubine totale", r"[Bb]ilirubine\s+(?:totale\s+)?[àa=:]\s*(\d+)\s*(?:µmol/L)?"),
        ("Troponine", r"troponine\s+(négative|positive|normale)"),
    ]

    for test_name, pattern in patterns_by_test:
        match = re.search(pattern, text)
        if match is None:
            continue
        captured = match.group(1)
        dossier.biologie_cle.append(BiologieCle(
            test=test_name,
            valeur=captured,
            anomalie=_is_abnormal(test_name, captured),
        ))
||||||
|
def _extract_imagerie(text: str, dossier: DossierMedical) -> None:
    """Extract imaging findings (CT and ultrasound) from the report.

    Appends an Imagerie entry per modality found, with the report
    conclusion truncated to 500 characters and, for CT, an optional
    Balthazar score.
    """
    # Abdominal CT: capture the conclusion after "retrouve/montre/objective".
    if tdm := re.search(
        r"(?:TDM|[Ss]canner|tomodensitométrie).*?(?:retrouve|montre|objective)\s*[::]?\s*(.*?)(?=\n\s*(?:Cholécystectomie|Au total|Devenir|\n\n))",
        text,
        re.DOTALL | re.IGNORECASE,
    ):
        # Balthazar severity score, looked up anywhere in the document.
        score_match = re.search(r"[Bb]althazar\s*(?:[àa=:])?\s*(\d+|[A-E])", text)
        dossier.imagerie.append(Imagerie(
            type="TDM abdominal",
            conclusion=tdm.group(1).strip()[:500],
            score=f"Balthazar {score_match.group(1)}" if score_match else None,
        ))

    # Ultrasound: same capture strategy, stopping at the next blank line.
    if echo := re.search(
        r"(?:[ée]cho(?:graphie)?)\s*.*?(?:retrouve|montre|objective)\s*[::]?\s*(.*?)(?=\n\n)",
        text,
        re.DOTALL | re.IGNORECASE,
    ):
        dossier.imagerie.append(Imagerie(
            type="Échographie",
            conclusion=echo.group(1).strip()[:500],
        ))
||||||
|
def _extract_complications(
    text: str,
    dossier: DossierMedical,
    edsnlp_result: Optional[EdsnlpResult] = None,
) -> None:
    """Record complications mentioned (and not negated) in the report.

    A term is kept only when it appears in the text and is negated
    neither by an edsnlp negated entity nor by a regex negation cue
    ("pas de", "sans", ...) immediately preceding it.
    """
    lowered = text.lower()

    # Lower-cased texts of entities that edsnlp flagged as negated.
    negated: set[str] = set()
    if edsnlp_result:
        negated = {
            ent.texte.lower()
            for ent in edsnlp_result.cim10_entities
            if ent.negation
        }

    watched_terms = (
        "éruption cutanée",
        "eruption cutanée",
        "fièvre",
        "infection",
        "hémorragie",
        "hématome",
        "abcès",
        "fistule",
        "iléus",
        "occlusion",
    )

    for term in watched_terms:
        if term not in lowered:
            continue
        # edsnlp negation takes precedence over the regex fallback.
        if edsnlp_result and _is_negated_by_edsnlp(term, negated):
            continue
        negation_cue = rf"(?:pas de|sans|absence de|aucun[e]?)\s+{re.escape(term)}"
        if re.search(negation_cue, lowered):
            continue
        dossier.complications.append(term.capitalize())
||||||
|
def _is_negated_by_edsnlp(term: str, negated_terms: set[str]) -> bool:
|
||||||
|
"""Vérifie si un terme est nié selon edsnlp."""
|
||||||
|
term_lower = term.lower()
|
||||||
|
for neg_term in negated_terms:
|
||||||
|
if term_lower in neg_term or neg_term in term_lower:
|
||||||
|
return True
|
||||||
|
return False
|
||||||
|
|
||||||
|
|
||||||
|
def _find_act_date(text: str, act_pattern: str) -> str | None:
|
||||||
|
"""Trouve la date associée à un acte."""
|
||||||
|
# Chercher "acte le DD/MM" ou "acte le DD/MM/YYYY"
|
||||||
|
m = re.search(
|
||||||
|
rf"{act_pattern}.*?(?:le\s+)?(\d{{2}}/\d{{2}}(?:/\d{{4}})?)",
|
||||||
|
text,
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
if m:
|
||||||
|
return m.group(1)
|
||||||
|
|
||||||
|
# Chercher dans la ligne d'observation juste avant
|
||||||
|
m = re.search(
|
||||||
|
rf"(\d{{2}}/\d{{2}}/\d{{4}}).*?{act_pattern}",
|
||||||
|
text,
|
||||||
|
re.IGNORECASE,
|
||||||
|
)
|
||||||
|
if m:
|
||||||
|
return m.group(1)
|
||||||
|
return None
|
||||||
|
|
||||||
|
|
||||||
|
def _lookup_cim10(text: str) -> str | None:
    """Return the ICD-10 code mapped to the first known term found in *text*.

    Terms come from the module-level CIM10_MAP; matching is done on the
    lower-cased, stripped input. Returns None when no term matches.
    """
    haystack = text.lower().strip()
    return next(
        (code for terme, code in CIM10_MAP.items() if terme in haystack),
        None,
    )
|
|
||||||
|
def _is_abnormal(test: str, value: str) -> bool | None:
|
||||||
|
"""Détermine si un résultat biologique est anormal."""
|
||||||
|
try:
|
||||||
|
val = float(value.replace(",", "."))
|
||||||
|
except (ValueError, AttributeError):
|
||||||
|
if value.lower() in ("négative", "negative", "normale", "normal"):
|
||||||
|
return False
|
||||||
|
if value.lower() in ("positive", "positif", "élevée", "elevee"):
|
||||||
|
return True
|
||||||
|
return None
|
||||||
|
|
||||||
|
normals: dict[str, tuple[float, float]] = {
|
||||||
|
"Lipasémie": (0, 60),
|
||||||
|
"CRP": (0, 5),
|
||||||
|
"ASAT": (0, 40),
|
||||||
|
"ALAT": (0, 40),
|
||||||
|
"GGT": (0, 60),
|
||||||
|
"PAL": (0, 150),
|
||||||
|
"Bilirubine totale": (0, 17),
|
||||||
|
}
|
||||||
|
|
||||||
|
if test in normals:
|
||||||
|
lo, hi = normals[test]
|
||||||
|
return val > hi or val < lo
|
||||||
|
return None
|
||||||
140
src/medical/edsnlp_pipeline.py
Normal file
140
src/medical/edsnlp_pipeline.py
Normal file
@@ -0,0 +1,140 @@
|
|||||||
|
"""Pipeline edsnlp pour l'extraction médicale (CIM-10, médicaments, négation)."""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
import logging
|
||||||
|
from dataclasses import dataclass, field
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
|
logger = logging.getLogger(__name__)
|
||||||
|
|
||||||
|
_nlp = None
|
||||||
|
_available = None
|
||||||
|
|
||||||
|
|
||||||
|
@dataclass
class CIM10Entity:
    """An ICD-10 entity detected by the eds.cim10 matcher."""

    texte: str  # surface text of the mention in the document
    code: str  # resolved ICD-10 code
    negation: bool = False  # True when the mention is flagged as negated
    hypothese: bool = False  # True when the mention is flagged as hypothetical
|
||||||
|
|
||||||
|
@dataclass
class DrugEntity:
    """A drug mention detected by the eds.drugs matcher."""

    texte: str  # surface text of the mention
    code_atc: Optional[str] = None  # ATC code when resolved, else None
    negation: bool = False  # True when the mention is flagged as negated
||||||
|
|
||||||
|
@dataclass
class DateEntity:
    """A date expression detected by eds.dates."""

    texte: str  # surface text of the date expression
    value: Optional[str] = None  # normalized value as a string, when available
||||||
|
|
||||||
|
@dataclass
class EdsnlpResult:
    """Aggregated output of one edsnlp analysis pass over a document."""

    cim10_entities: list[CIM10Entity] = field(default_factory=list)
    drug_entities: list[DrugEntity] = field(default_factory=list)
    date_entities: list[DateEntity] = field(default_factory=list)
||||||
|
|
||||||
|
def is_available() -> bool:
    """Report whether the optional edsnlp dependency can be imported.

    The outcome is cached in the module-level ``_available`` flag so the
    import is attempted at most once per process.
    """
    global _available
    if _available is None:
        try:
            import edsnlp  # noqa: F401
        except ImportError:
            _available = False
        else:
            _available = True
    return _available
||||||
|
|
||||||
|
def get_pipeline():
    """Return the edsnlp pipeline (lazily-built module-level singleton).

    Raises:
        RuntimeError: when edsnlp is not importable in this environment.
    """
    global _nlp
    if _nlp is not None:
        return _nlp

    if not is_available():
        raise RuntimeError("edsnlp n'est pas installé")

    # Local import: the module stays importable without edsnlp installed.
    import edsnlp

    logger.info("Initialisation du pipeline edsnlp...")
    nlp = edsnlp.blank("eds")

    # eds.cim10 and eds.drugs match on the NORM attribute (attr="NORM")
    # produced by eds.normalizer; qualifiers are added after the matchers.
    nlp.add_pipe("eds.normalizer")
    nlp.add_pipe("eds.sentences")
    nlp.add_pipe("eds.cim10", config=dict(attr="NORM", term_matcher="simstring"))
    nlp.add_pipe("eds.drugs", config=dict(attr="NORM", term_matcher="exact"))
    nlp.add_pipe("eds.negation")
    nlp.add_pipe("eds.hypothesis")
    nlp.add_pipe("eds.dates")

    _nlp = nlp
    logger.info("Pipeline edsnlp initialisé avec succès")
    return _nlp
||||||
|
|
||||||
|
def analyze(text: str) -> EdsnlpResult:
    """Analyze a medical text with edsnlp.

    Returns an EdsnlpResult holding the detected ICD-10 entities, drug
    entities and dates. Returns an empty result when edsnlp is not
    installed or when the pipeline raises.
    """
    result = EdsnlpResult()

    if not is_available():
        return result

    try:
        nlp = get_pipeline()
        doc = nlp(text)
    except Exception:
        # Best-effort: an edsnlp failure must not crash the caller.
        logger.exception("Erreur lors de l'analyse edsnlp")
        return result

    for ent in doc.ents:
        # Qualifier flags set by eds.negation / eds.hypothesis; getattr
        # guards against missing extensions and "or False" normalizes
        # None to False.
        negation = getattr(ent._, "negation", False) or False
        hypothese = getattr(ent._, "hypothesis", False) or False

        if ent.label_ == "cim10":
            code = ent.kb_id_ or ""
            if code:  # keep only entities that resolved to an actual code
                result.cim10_entities.append(CIM10Entity(
                    texte=ent.text,
                    code=code,
                    negation=negation,
                    hypothese=hypothese,
                ))
        elif ent.label_ == "drug":
            code_atc = ent.kb_id_ or None
            result.drug_entities.append(DrugEntity(
                texte=ent.text,
                code_atc=code_atc,
                negation=negation,
            ))

    # Dates are exposed via doc.spans["dates"], not doc.ents.
    for span in doc.spans.get("dates", []):
        date_value = None
        if hasattr(span._, "date"):
            date_obj = span._.date
            if date_obj is not None:
                date_value = str(date_obj)
        result.date_entities.append(DateEntity(
            texte=span.text,
            value=date_value,
        ))

    return result
||||||
|
|
||||||
|
def reset():
    """Clear the cached pipeline and availability flag (useful for tests)."""
    global _nlp, _available
    _nlp = None
    _available = None
||||||
0
tests/__init__.py
Normal file
0
tests/__init__.py
Normal file
197
tests/test_anonymization.py
Normal file
197
tests/test_anonymization.py
Normal file
@@ -0,0 +1,197 @@
|
|||||||
|
"""Tests pour le module d'anonymisation."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.anonymization.entity_registry import EntityRegistry
|
||||||
|
from src.anonymization.regex_patterns import (
|
||||||
|
CRH_FOOTER_IPP_EPISODE,
|
||||||
|
CRH_FOOTER_PATIENT_PATTERN,
|
||||||
|
DATE_NAISSANCE_PATTERN,
|
||||||
|
DR_NAME_PATTERN,
|
||||||
|
EMAIL_PATTERN,
|
||||||
|
EPISODE_PATTERN,
|
||||||
|
FOOTER_PATIENT_PATTERN,
|
||||||
|
IPP_PATTERN,
|
||||||
|
NOTE_AUTHOR_PATTERN,
|
||||||
|
PHONE_PATTERN,
|
||||||
|
RPPS_PATTERN,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestRegexPatterns:
    """Unit tests for the anonymization regex patterns (one per PII kind)."""

    def test_ipp_with_colon(self):
        m = IPP_PATTERN.search("IPP: 01306172")
        assert m is not None
        assert m.group(1) == "01306172"

    def test_ipp_without_colon(self):
        m = IPP_PATTERN.search("IPP 01306172")
        assert m is not None
        assert m.group(1) == "01306172"

    def test_ipp_in_parentheses(self):
        # Bare 8-digit identifier inside parentheses uses capture group 2.
        m = IPP_PATTERN.search("(01306172 )")
        assert m is not None
        assert m.group(2) == "01306172"

    def test_episode_no(self):
        m = EPISODE_PATTERN.search("Episode No: 23042753")
        assert m is not None
        assert m.group(1) == "23042753"

    def test_episode_n_degree(self):
        # "N° Episode" variant uses capture group 2.
        m = EPISODE_PATTERN.search("N° Episode 23042753")
        assert m is not None
        assert m.group(2) == "23042753"

    def test_phone_dots(self):
        m = PHONE_PATTERN.search("06.25.39.26.82")
        assert m is not None
        assert m.group(0) == "06.25.39.26.82"

    def test_phone_spaces(self):
        m = PHONE_PATTERN.search("05 59 44 35 35")
        assert m is not None

    def test_email(self):
        m = EMAIL_PATTERN.search("faudemar@ch-cotebasque.fr")
        assert m is not None
        assert m.group(0) == "faudemar@ch-cotebasque.fr"

    def test_rpps(self):
        m = RPPS_PATTERN.search("RPPS : 10100532760")
        assert m is not None
        assert m.group(1) == "10100532760"

    def test_date_naissance_nee_le(self):
        m = DATE_NAISSANCE_PATTERN.search("née le 23/02/1980")
        assert m is not None
        assert m.group(1) == "23/02/1980"

    def test_date_naissance_ne_e_le(self):
        m = DATE_NAISSANCE_PATTERN.search("Né(e) le 23/02/1980")
        assert m is not None
        assert m.group(1) == "23/02/1980"

    def test_date_naissance_field(self):
        m = DATE_NAISSANCE_PATTERN.search("Date de naissance: 23/02/1980")
        assert m is not None
        assert m.group(1) == "23/02/1980"

    def test_dr_name(self):
        m = DR_NAME_PATTERN.search("Dr F. AUDEMAR")
        assert m is not None
        assert "AUDEMAR" in m.group(1)

    def test_dr_name_docteur(self):
        m = DR_NAME_PATTERN.search("Docteur AUDEMAR Franck")
        assert m is not None
        assert "AUDEMAR" in m.group(1)

    def test_note_author(self):
        m = NOTE_AUTHOR_PATTERN.search("Note IDE Annie GUIRESSE Non algique")
        assert m is not None
        assert m.group(1) == "Annie GUIRESSE"

    def test_footer_patient_trackare(self):
        m = FOOTER_PATIENT_PATTERN.search(
            "Patient: CLIER NARBAIS AUDREY - Date de naissance: 23/02/1980"
        )
        assert m is not None
        assert "CLIER" in m.group(1)

    def test_crh_footer_patient(self):
        m = CRH_FOOTER_PATIENT_PATTERN.search(
            "Patient(e) : CLIER AUDREY NARBAIS Né(e) le 23/02/1980"
        )
        assert m is not None
        assert "CLIER" in m.group(1)

    def test_crh_footer_ipp_episode(self):
        m = CRH_FOOTER_IPP_EPISODE.search(
            "IPP 01306172 / N° Episode 23042753 (MEDECINE GASTRO B2 HC)"
        )
        assert m is not None
        assert m.group(1) == "01306172"
        assert m.group(2) == "23042753"
||||||
|
|
||||||
|
class TestEntityRegistry:
    """Tests for the pseudonym registry: stable, case-insensitive mapping."""

    def test_register_returns_pseudo(self):
        reg = EntityRegistry()
        pseudo = reg.register("Jean Dupont", "patient")
        assert pseudo == "[PATIENT_1]"

    def test_register_same_entity_returns_same(self):
        reg = EntityRegistry()
        p1 = reg.register("Jean Dupont", "patient")
        p2 = reg.register("Jean Dupont", "patient")
        assert p1 == p2

    def test_register_case_insensitive(self):
        reg = EntityRegistry()
        p1 = reg.register("Jean DUPONT", "patient")
        p2 = reg.register("jean dupont", "patient")
        assert p1 == p2

    def test_register_different_categories(self):
        # Counters are per-category, so each category restarts at 1.
        reg = EntityRegistry()
        p1 = reg.register("Dupont", "patient")
        p2 = reg.register("Martin", "medecin")
        assert p1 == "[PATIENT_1]"
        assert p2 == "[MEDECIN_1]"

    def test_get_replacement(self):
        reg = EntityRegistry()
        reg.register("Jean Dupont", "patient")
        assert reg.get_replacement("jean dupont") == "[PATIENT_1]"
        assert reg.get_replacement("inconnu") is None
||||||
|
|
||||||
|
class TestAnonymizer:
    """End-to-end anonymizer behavior: PII removed, medical content kept."""

    def test_anonymize_basic(self):
        from src.anonymization.anonymizer import Anonymizer

        parsed = {
            "patient": {"nom_prenom": "DUPONT Jean", "nom_naissance": "DUPONT"},
            "medecins": ["MARTIN Pierre"],
            "contacts": [],
        }
        anonymizer = Anonymizer(parsed_data=parsed)
        text = "Le patient DUPONT Jean a été vu par Dr MARTIN Pierre."
        result = anonymizer.anonymize(text)

        assert "DUPONT" not in result
        assert "MARTIN" not in result
        assert "[PATIENT" in result or "[MEDECIN" in result

    def test_preserves_medical_content(self):
        from src.anonymization.anonymizer import Anonymizer

        anonymizer = Anonymizer(parsed_data={"patient": {}, "medecins": [], "contacts": []})
        text = "Pancréatite aiguë biliaire. Cholécystectomie par cœlioscopie. IMC 34.37."
        result = anonymizer.anonymize(text)

        assert "Pancréatite" in result
        assert "Cholécystectomie" in result
        assert "IMC" in result

    def test_anonymize_phone(self):
        from src.anonymization.anonymizer import Anonymizer

        anonymizer = Anonymizer(parsed_data={"patient": {}, "medecins": [], "contacts": []})
        text = "Appeler le 06.25.39.26.82 pour le rendez-vous."
        result = anonymizer.anonymize(text)

        assert "06.25.39.26.82" not in result
        assert "[TEL" in result

    def test_anonymize_email(self):
        from src.anonymization.anonymizer import Anonymizer

        anonymizer = Anonymizer(parsed_data={"patient": {}, "medecins": [], "contacts": []})
        text = "Contact: faudemar@ch-cotebasque.fr"
        result = anonymizer.anonymize(text)

        assert "faudemar@ch-cotebasque.fr" not in result
        assert "[EMAIL" in result
||||||
126
tests/test_extraction.py
Normal file
126
tests/test_extraction.py
Normal file
@@ -0,0 +1,126 @@
|
|||||||
|
"""Tests pour le module d'extraction."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.extraction.document_classifier import classify
|
||||||
|
from src.extraction.crh_parser import parse_crh
|
||||||
|
from src.extraction.trackare_parser import parse_trackare, _clean_person_name
|
||||||
|
|
||||||
|
|
||||||
|
class TestDocumentClassifier:
    """Classification of raw PDF text into 'trackare' vs 'crh' documents."""

    def test_classify_trackare(self):
        text = """CENTRE HOSPITALIER COTE BASQUE
Dossier Patient
Détails des patients
Nom de naissance: CLIER IPP: 01306172
Détails épisode
Episode No: 23042753
Signes Vitaux"""
        assert classify(text) == "trackare"

    def test_classify_crh(self):
        text = """N° Finess CENTRE HOSPITALIER COTE BASQUE
Pôle Spécialités Médicales
Service de Gastro-Entérologie
Mon cher confrère,
Votre patiente a été hospitalisée"""
        assert classify(text) == "crh"

    def test_classify_trackare_by_ipp(self):
        # IPP + Episode markers alone are enough to classify as Trackare.
        text = "IPP: 12345678 Episode No: 87654321"
        assert classify(text) == "trackare"
||||||
|
|
||||||
|
class TestCRHParser:
    """Parsing of discharge-letter (CRH) text: patient, stay, physicians."""

    def test_parse_patient_info(self):
        text = """MME NARBAIS AUDREY
MAISON IRREXELAIA
64430 ST ETIENNE DE BAIGORRY

Mon cher confrère,
Votre patiente NARBAIS Audrey née le 23/02/1980 a été hospitalisée
du 25/02/2023 au 03/03/2023 pour le motif suivant:
Pancréatite aiguë lithiasique"""
        result = parse_crh(text)

        assert result["patient"]["nom_complet"] == "NARBAIS AUDREY"
        assert result["patient"]["sexe"] == "F"
        assert result["patient"]["date_naissance"] == "23/02/1980"

    def test_parse_sejour(self):
        text = """Votre patiente née le 23/02/1980 a été hospitalisée
du 25/02/2023 au 03/03/2023 pour le motif suivant:
Pancréatite aiguë"""
        result = parse_crh(text)

        assert result["sejour"]["date_entree"] == "25/02/2023"
        assert result["sejour"]["date_sortie"] == "03/03/2023"

    def test_parse_medecins(self):
        text = "Dr PUJOS. Dr F. AUDEMAR. Docteur DUTREY Sarah."
        result = parse_crh(text)

        assert any("PUJOS" in m for m in result["medecins"])
        assert any("AUDEMAR" in m for m in result["medecins"])
|
|
||||||
|
class TestTrackareParser:
    """Parsing of Trackare exports: patient fields, diagnoses, vitals."""

    def test_parse_patient_info(self):
        text = """Nom de naissance: CLIER IPP: 01306172
Nom et Prénom: NARBAIS AUDREY Date de naissance: 23/02/1980
Sexe: Féminin Lieu de naissance: CHAMPIGNY SUR MARNE
Adresse: MAISON IRREXELAIA QUARTIER AUZO TTIPI Ville de résidence: ST ETIENNE DE BAIGORRY
Code Postal: 64430
Episode No: 23042753
Date d'admission: 25/02/2023 Heure d'admission: 03:07
Date de sortie: 03/03/2023
Taille: 162 cm - Poids: 90.2 kg - IMC: 34.370"""
        result = parse_trackare(text)

        assert result["patient"]["nom_naissance"] == "CLIER"
        assert result["patient"]["nom_prenom"] == "NARBAIS AUDREY"
        assert result["patient"]["ipp"] == "01306172"
        assert result["patient"]["sexe"] == "F"
        assert result["patient"]["date_naissance"] == "23/02/1980"
        assert result["patient"]["imc"] == 34.370
        assert result["sejour"]["episode"] == "23042753"
        assert result["sejour"]["date_entree"] == "25/02/2023"

    def test_parse_diagnostics(self):
        text = """Diagnostic aux urgences
Type Etat Code Date
Principal actif K80.5 Calcul des canaux biliaires (sans angiocholite ni cholécystite) [CMA2] 25/02/2023 05:27"""
        result = parse_trackare(text)

        assert len(result["diagnostics"]) >= 1
        assert result["diagnostics"][0]["code_cim10"] == "K80.5"
        assert result["diagnostics"][0]["type"] == "Principal"

    def test_parse_vitals(self):
        # Values use the comma decimal separator as emitted by Trackare.
        text = """Poids/Taille
Taille [cm] 162,00
Poids [kg] 90,20
Indice
de masse 34.370"""
        result = parse_trackare(text)

        assert result["signes_vitaux"]["taille_cm"] == 162.0
        assert result["signes_vitaux"]["poids_kg"] >= 90.0
        assert result["signes_vitaux"]["imc"] == 34.370
|
|
||||||
|
class TestCleanPersonName:
    """Name sanitizer: strips trailing noise and rejects medical terms."""

    def test_clean_simple(self):
        assert _clean_person_name("Sarah DUTREY") == "Sarah DUTREY"

    def test_clean_with_noise(self):
        assert _clean_person_name("Sarah DUTREY une complication") == "Sarah DUTREY"

    def test_clean_multiline(self):
        assert _clean_person_name("Sarah\nDUTREY") == "Sarah DUTREY"

    def test_clean_medical_term(self):
        # A medical term alone is not a person name.
        assert _clean_person_name("Bilirubine") == ""

    def test_clean_empty(self):
        assert _clean_person_name("") == ""
||||||
124
tests/test_integration.py
Normal file
124
tests/test_integration.py
Normal file
@@ -0,0 +1,124 @@
|
|||||||
|
"""End-to-end integration tests on the real PDFs."""

import json
from pathlib import Path

import pytest

from src.config import INPUT_DIR, ANONYMIZED_DIR, STRUCTURED_DIR
from src.main import process_pdf


# Both fixture PDFs must be present, otherwise the integration tests are skipped.
CRH_PDF = INPUT_DIR / "CRH 23042753.pdf"
TRACKARE_PDF = INPUT_DIR / "trackare-01306172-23042753_01306172_23042753.pdf"

needs_pdfs = pytest.mark.skipif(
    not (CRH_PDF.exists() and TRACKARE_PDF.exists()),
    reason="PDFs de test non disponibles dans input/",
)


# Known personal data that must never survive anonymisation.
PATIENT_PII = [
    "NARBAIS", "CLIER", "AUDREY", "01306172", "23042753",
    "23/02/1980", "IRREXELAIA", "BAIGORRY", "06.25.39.26.82",
]

# Caregiver surnames that must be scrubbed as well.
SOIGNANT_NAMES = [
    "DUTREY", "MENDIBOURE", "PUJOS", "AUDEMAR", "MELLIN",
    "GUIRESSE", "GOYTINO", "SERRE", "NOVION",
]
@needs_pdfs
class TestCRHIntegration:
    """End-to-end assertions against the real CRH discharge-summary PDF."""

    @pytest.fixture(autouse=True)
    def setup(self):
        # Run the whole pipeline once for each test method.
        self.anonymized, self.dossier, self.report = process_pdf(CRH_PDF)

    def test_no_patient_pii(self):
        """No known patient identifier survives anonymisation."""
        haystack = self.anonymized.upper()
        for pii in PATIENT_PII:
            assert pii.upper() not in haystack, f"PII trouvé : {pii}"

    def test_medical_content_preserved(self):
        """Key clinical terms remain in the anonymised text."""
        corpus = self.anonymized.lower()
        expected_terms = ("pancréatite", "cholécystectomie", "cholangiographie", "lithiase")
        for term in expected_terms:
            assert term in corpus, f"Terme médical manquant : {term}"

    def test_diagnostic_principal(self):
        """The main diagnosis maps to acute biliary pancreatitis (K85.1)."""
        principal = self.dossier.diagnostic_principal
        assert principal is not None
        assert principal.cim10_suggestion == "K85.1"

    def test_diagnostics_associes(self):
        """At least one gallstone code appears among associated diagnoses."""
        codes = {d.cim10_suggestion for d in self.dossier.diagnostics_associes}
        assert codes & {"K80.5", "K80.2"}

    def test_sejour(self):
        """Demographics and stay dates are extracted correctly."""
        sejour = self.dossier.sejour
        assert (sejour.sexe, sejour.age) == ("F", 43)
        assert sejour.date_entree == "25/02/2023"
        assert sejour.date_sortie == "03/03/2023"
        assert sejour.duree_sejour == 6

    def test_traitements_have_optional_atc(self):
        """Every discharge treatment exposes a code_atc field (may be None)."""
        for traitement in self.dossier.traitements_sortie:
            assert hasattr(traitement, "code_atc")
@needs_pdfs
class TestTrackareIntegration:
    """End-to-end assertions against the real Trackare PDF."""

    @pytest.fixture(autouse=True)
    def setup(self):
        # Run the whole pipeline once for each test method.
        self.anonymized, self.dossier, self.report = process_pdf(TRACKARE_PDF)

    def test_no_patient_pii(self):
        """No known patient identifier survives anonymisation."""
        text_upper = self.anonymized.upper()
        for pii in PATIENT_PII:
            assert pii.upper() not in text_upper, f"PII trouvé : {pii}"

    def test_no_soignant_names(self):
        """Caregiver surnames must be scrubbed.

        Matching is done on whole words only (\\b anchors), so e.g. the word
        "selles" cannot produce a false positive on the surname "SELLE".
        """
        # Hoisted out of the loop: one import, not one per surname.
        import re

        text = self.anonymized
        for name in SOIGNANT_NAMES:
            # Search for the name as a complete word, case-insensitively.
            pattern = re.compile(r"\b" + re.escape(name) + r"\b", re.IGNORECASE)
            matches = pattern.findall(text)
            assert len(matches) == 0, f"Nom soignant trouvé : {name} ({len(matches)} occurrences)"

    def test_medical_content_preserved(self):
        """Key clinical terms remain in the anonymised text."""
        text_lower = self.anonymized.lower()
        for term in ["pancréatite", "cholécystectomie", "morphine", "paracétamol"]:
            assert term in text_lower, f"Terme médical manquant : {term}"

    def test_diagnostic_principal(self):
        """The main diagnosis is one of the two expected biliary codes."""
        dp = self.dossier.diagnostic_principal
        assert dp is not None
        assert dp.cim10_suggestion in ("K80.5", "K85.1")

    def test_sejour_with_vitals(self):
        """Demographics plus vitals (BMI, weight, height) are populated."""
        s = self.dossier.sejour
        assert s.sexe == "F"
        assert s.age == 43
        assert s.imc is not None
        assert s.imc > 30  # patient is obese per the source document
        assert s.poids is not None
        assert s.taille is not None

    def test_biologie(self):
        """Lipasémie and CRP are extracted; Lipasémie is flagged abnormal."""
        tests = {b.test for b in self.dossier.biologie_cle}
        assert "Lipasémie" in tests
        assert "CRP" in tests
        # Lipasémie must be marked abnormal.
        for b in self.dossier.biologie_cle:
            if b.test == "Lipasémie":
                assert b.anomalie is True

    def test_report_counts(self):
        """Sanity floor on the anonymisation report counters."""
        assert self.report.total_replacements > 100
        assert self.report.regex_replacements > 50
238
tests/test_medical.py
Normal file
238
tests/test_medical.py
Normal file
@@ -0,0 +1,238 @@
|
|||||||
|
"""Tests pour le module d'extraction médicale CIM-10."""
|
||||||
|
|
||||||
|
import pytest
|
||||||
|
|
||||||
|
from src.config import DossierMedical, Diagnostic
|
||||||
|
from src.medical.cim10_extractor import (
|
||||||
|
extract_medical_info,
|
||||||
|
_lookup_cim10,
|
||||||
|
_is_abnormal,
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class TestCIM10Lookup:
    """Lookup of CIM-10 codes from French diagnosis wordings."""

    def test_pancréatite_lithiasique(self):
        code = _lookup_cim10("pancréatite aiguë lithiasique")
        assert code == "K85.1"

    def test_lithiase_choledoque(self):
        code = _lookup_cim10("lithiase du cholédoque")
        assert code == "K80.5"

    def test_eruption_medicamenteuse(self):
        code = _lookup_cim10("éruption médicamenteuse")
        assert code == "L27.0"

    def test_obesite(self):
        code = _lookup_cim10("obésité")
        assert code == "E66.0"

    def test_unknown(self):
        """A wording outside the mapping yields no code."""
        assert _lookup_cim10("grippe") is None
class TestIsAbnormal:
    """Classification of lab values as abnormal, normal or unknown."""

    def test_lipasemie_high(self):
        flag = _is_abnormal("Lipasémie", "6000")
        assert flag is True

    def test_crp_normal(self):
        flag = _is_abnormal("CRP", "3")
        assert flag is False

    def test_crp_high(self):
        flag = _is_abnormal("CRP", "12")
        assert flag is True

    def test_troponine_negative(self):
        """A textual 'négative' result is not abnormal."""
        flag = _is_abnormal("Troponine", "négative")
        assert flag is False

    def test_unknown_test(self):
        """An unrecognised test name yields None (no reference range)."""
        assert _is_abnormal("TestInconnu", "42") is None
class TestExtractMedicalInfo:
    """End-to-end tests of extract_medical_info on synthetic parsed data."""

    @staticmethod
    def _minimal_crh_parsed():
        """Smallest valid 'crh' parsed structure, shared by several tests."""
        return {
            "type": "crh",
            "patient": {"sexe": "M"},
            "sejour": {},
            "diagnostics": [],
        }

    def test_extract_from_trackare(self):
        """Full extraction from a realistic Trackare parse plus free text."""
        parsed = {
            "type": "trackare",
            "patient": {
                "sexe": "F",
                "date_naissance": "23/02/1980",
                "imc": 34.37,
                "poids_kg": 90.2,
                "taille_cm": 162,
            },
            "sejour": {
                "date_entree": "25/02/2023",
                "date_sortie": "03/03/2023",
            },
            "urgences": {"mode_entree": "Urgences"},
            "diagnostics": [
                {
                    "type": "Principal",
                    "statut": "actif",
                    "code_cim10": "K80.5",
                    "libelle": "Calcul des canaux biliaires",
                }
            ],
            "signes_vitaux": {"imc": 34.37, "poids_kg": 90.2, "taille_cm": 162},
        }
        text = """Pancréatite aiguë lithiasique.
Cholécystectomie par cœlioscopie le 01/03.
Cholangiographie retrouvant une lithiase du bas cholédoque.
TDM à J3 retrouve : Absence de signe de gravité. Score de Balthazar à 0.
Éruption cutanée érythémateuse. Réaction au tramadol.
IMC: 34.370
TTT de sortie :
Paracétamol et Acupan si besoin
Cétirizine

Devenir : sortie le 03/03."""

        dossier = extract_medical_info(parsed, text)

        # Stay (séjour)
        assert dossier.sejour.sexe == "F"
        assert dossier.sejour.age == 43
        assert dossier.sejour.duree_sejour == 6
        assert dossier.sejour.imc == 34.37

        # Principal diagnosis (DP)
        assert dossier.diagnostic_principal is not None
        assert dossier.diagnostic_principal.cim10_suggestion == "K80.5"

        # Associated diagnoses (DAS)
        codes = {d.cim10_suggestion for d in dossier.diagnostics_associes}
        assert "L27.0" in codes  # drug-induced eruption
        assert "E66.0" in codes  # obesity

        # CCAM procedures
        acte_codes = {a.code_ccam_suggestion for a in dossier.actes_ccam}
        assert "HMFC004" in acte_codes  # cholecystectomy
        assert "ZCQK002" in acte_codes  # CT scan (TDM)

        # Discharge treatments
        meds = [t.medicament for t in dossier.traitements_sortie]
        assert any("Paracétamol" in m for m in meds)
        assert any("Cétirizine" in m for m in meds)

        # Lab results
        tests = {b.test for b in dossier.biologie_cle}
        assert "Troponine" not in tests  # not present in this minimal text

        # Imaging
        assert len(dossier.imagerie) >= 1
        assert any("Balthazar" in (i.score or "") for i in dossier.imagerie)

        # Complications
        assert any("cutanée" in c.lower() for c in dossier.complications)

    def test_extract_without_edsnlp(self):
        """Extraction still works with no edsnlp result (regex fallback)."""
        parsed = self._minimal_crh_parsed()
        text = "Pancréatite aiguë biliaire.\nTTT de sortie :\nParacétamol 1g matin et soir\n\nDevenir : retour."

        dossier = extract_medical_info(parsed, text, edsnlp_result=None)
        assert dossier.diagnostic_principal is not None
        assert dossier.diagnostic_principal.cim10_suggestion == "K85.1"
        assert len(dossier.traitements_sortie) >= 1

    def test_extract_with_edsnlp_result(self):
        """edsnlp entities enrich the diagnoses."""
        from src.medical.edsnlp_pipeline import EdsnlpResult, CIM10Entity

        parsed = self._minimal_crh_parsed()
        text = "Patient admis pour douleur abdominale."

        edsnlp_result = EdsnlpResult(
            cim10_entities=[
                CIM10Entity(texte="douleur abdominale", code="R10.4", negation=False),
            ],
            drug_entities=[],
        )

        dossier = extract_medical_info(parsed, text, edsnlp_result=edsnlp_result)
        # The principal diagnosis should come from edsnlp.
        assert dossier.diagnostic_principal is not None
        assert dossier.diagnostic_principal.cim10_suggestion == "R10.4"

    def test_negated_edsnlp_entities_ignored(self):
        """Entities negated by edsnlp must not be retained as diagnoses."""
        from src.medical.edsnlp_pipeline import EdsnlpResult, CIM10Entity

        parsed = self._minimal_crh_parsed()
        text = "Pas de fièvre. Patient en bon état."

        edsnlp_result = EdsnlpResult(
            cim10_entities=[
                CIM10Entity(texte="fièvre", code="R50.9", negation=True),
            ],
        )

        dossier = extract_medical_info(parsed, text, edsnlp_result=edsnlp_result)
        # The negated entity must not appear as a diagnosis.
        all_codes = set()
        if dossier.diagnostic_principal:
            all_codes.add(dossier.diagnostic_principal.cim10_suggestion)
        for d in dossier.diagnostics_associes:
            all_codes.add(d.cim10_suggestion)
        assert "R50.9" not in all_codes

    def test_drug_atc_enrichment(self):
        """ATC codes from edsnlp are attached to matching treatments."""
        from src.medical.edsnlp_pipeline import EdsnlpResult, DrugEntity

        parsed = self._minimal_crh_parsed()
        text = "TTT de sortie :\nParacétamol 1g matin\n\nDevenir : retour."

        edsnlp_result = EdsnlpResult(
            drug_entities=[
                DrugEntity(texte="Paracétamol", code_atc="N02BE01", negation=False),
            ],
        )

        dossier = extract_medical_info(parsed, text, edsnlp_result=edsnlp_result)
        assert len(dossier.traitements_sortie) >= 1
        paracetamol = next(
            (t for t in dossier.traitements_sortie if "Paracétamol" in t.medicament), None
        )
        assert paracetamol is not None
        assert paracetamol.code_atc == "N02BE01"

    def test_edsnlp_negation_for_complications(self):
        """edsnlp negation filters entities out of the complications list."""
        from src.medical.edsnlp_pipeline import EdsnlpResult, CIM10Entity

        parsed = self._minimal_crh_parsed()
        text = "Pas de fièvre ni d'infection. Bonne évolution."

        edsnlp_result = EdsnlpResult(
            cim10_entities=[
                CIM10Entity(texte="fièvre", code="R50.9", negation=True),
                CIM10Entity(texte="infection", code="A49.9", negation=True),
            ],
        )

        dossier = extract_medical_info(parsed, text, edsnlp_result=edsnlp_result)
        # Both entities are negated, so they must not show up as complications.
        complication_terms = [c.lower() for c in dossier.complications]
        assert "fièvre" not in complication_terms
        assert "infection" not in complication_terms
Reference in New Issue
Block a user