#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Anonymization core (v2.1) + optional ONNX NER (narrative text only)
------------------------------------------------------------------------
- Multi-pass text extraction: layout-aware PyMuPDF -> pdfplumber -> pdfminer (poor text or (cid:xx)) -> docTR OCR for scanned PDFs
- Regex rules (critical PII) + key:value lines (mask the value only) + YAML overrides
- **Selective** safety rescan (EMAIL/TEL/IBAN/NIR), never inside [TABLES]
- PDF redaction (vector/raster) via PyMuPDF
- **Optional** ONNX NER (CamemBERT family) applied **after** the rules, on narrative text
Dependencies: pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optional), transformers, optimum, onnxruntime; optional extras: python-doctr (OCR fallback), pyahocorasick (FINESS gazetteer)
"""
from __future__ import annotations
import io
import json
import logging
import os
import re
from concurrent.futures import ProcessPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any
import pdfplumber
from pdfminer.high_level import extract_text as pdfminer_extract_text
from pdfminer.layout import LAParams
from PIL import Image, ImageDraw
log = logging.getLogger(__name__)
# {page_idx: [(word_text, x0_norm, y0_norm, x1_norm, y1_norm), ...]}
# Normalized coordinates 0→1 (native docTR word.geometry format)
OcrWordMap = Dict[int, List[Tuple[str, float, float, float, float]]]
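# Illustrative sketch only: how the low-level pieces defined in this module compose
# for a single PDF. This is NOT the module's real entry point (defined further down);
# the file name, config path and per-line loop are simplified assumptions.
#
#   pages_text, tables_lines, ocr_used, ocr_word_map = extract_text_with_fallback_ocr(Path("exemple.pdf"))
#   cfg = load_dictionaries(Path("config/dictionaries.yml"))
#   audit: List[PiiHit] = []
#   masked_pages = [
#       "\n".join(_kv_value_only_mask(ln, audit, i, cfg) for ln in (page or "").splitlines())
#       for i, page in enumerate(pages_text)
#   ]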
try:
import fitz # PyMuPDF
except Exception:
fitz = None
try:
import yaml # PyYAML for dictionaries
except Exception:
yaml = None
try:
from doctr.models import ocr_predictor as _doctr_ocr_predictor
_DOCTR_AVAILABLE = True
except Exception:
_doctr_ocr_predictor = None # type: ignore
_DOCTR_AVAILABLE = False
try:
from detectors.hospital_filter import HospitalFilter
_HOSPITAL_FILTER_AVAILABLE = True
except Exception:
_HOSPITAL_FILTER_AVAILABLE = False
HospitalFilter = None # type: ignore
# NER manager (facultatif)
try:
from ner_manager_onnx import NerModelManager, NerThresholds
except Exception:
NerModelManager = None # type: ignore
NerThresholds = None # type: ignore
# EDS-Pseudo manager (facultatif)
try:
from eds_pseudo_manager import EdsPseudoManager
except Exception:
EdsPseudoManager = None # type: ignore
# VLM manager (facultatif)
try:
from vlm_manager import VlmManager
except Exception:
VlmManager = None # type: ignore
def _load_edsnlp_drug_names() -> set:
"""Charge les noms de médicaments mono-mot depuis edsnlp/resources/drugs.json.
Retourne un set lowercase. Fallback silencieux si edsnlp absent."""
try:
import edsnlp as _edsnlp
drugs_path = _edsnlp.BASE_DIR / "resources" / "drugs.json"
if not drugs_path.exists():
return set()
import json as _json
data = _json.loads(drugs_path.read_text(encoding="utf-8"))
result = set()
for _code, names in data.items():
for name in names:
if " " not in name and len(name) >= 4:
result.add(name.lower())
return result
except Exception:
return set()
def _load_bdpm_medication_names() -> set:
"""Charge les noms de médicaments depuis la base BDPM (data/bdpm/medication_names.txt).
Retourne un set lowercase. ~5700 noms commerciaux et DCI."""
bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medication_names.txt"
if not bdpm_path.exists():
return set()
try:
names = set()
for line in bdpm_path.read_text(encoding="utf-8").splitlines():
w = line.strip()
if w and len(w) >= 3:
names.add(w.lower())
return names
except Exception:
return set()
# ----------------- Gazetteers INSEE (prénoms + communes) -----------------
_INSEE_PRENOMS: set = set()
_INSEE_COMMUNES: set = set()
def _load_insee_gazetteers():
"""Charge les gazetteers INSEE (prénoms français + communes)."""
global _INSEE_PRENOMS, _INSEE_COMMUNES
data_dir = Path(__file__).parent / "data" / "insee"
# Prénoms (lowercase, >= 3 chars)
prenoms_path = data_dir / "prenoms_france.txt"
if prenoms_path.exists():
try:
_INSEE_PRENOMS = {
line.strip().lower() for line in prenoms_path.read_text(encoding="utf-8").splitlines()
if line.strip() and len(line.strip()) >= 3
}
log.info(f"Gazetteers INSEE prénoms: {len(_INSEE_PRENOMS)} entrées")
except Exception as e:
log.warning(f"Erreur chargement prénoms INSEE: {e}")
# Communes (uppercase, >= 3 chars)
communes_path = data_dir / "communes_france.txt"
if communes_path.exists():
try:
_INSEE_COMMUNES = {
line.strip().upper() for line in communes_path.read_text(encoding="utf-8").splitlines()
if line.strip() and len(line.strip()) >= 3
}
log.info(f"Gazetteers INSEE communes: {len(_INSEE_COMMUNES)} entrées")
except Exception as e:
log.warning(f"Erreur chargement communes INSEE: {e}")
_load_insee_gazetteers()
# ----------------- Gazetteer FINESS (établissements de santé) -----------------
_FINESS_NUMBERS: set = set() # numéros FINESS 9 chiffres
_FINESS_ETAB_NAMES: set = set() # noms d'établissements (lowercase)
_FINESS_TELEPHONES: set = set() # téléphones 10 chiffres
_FINESS_VILLES: set = set() # villes FINESS (uppercase)
_FINESS_AC = None # Automate Aho-Corasick pour noms distinctifs
try:
import ahocorasick as _ahocorasick
_AHO_AVAILABLE = True
except ImportError:
_ahocorasick = None
_AHO_AVAILABLE = False
def _normalize_for_matching(s: str) -> str:
"""Normalise pour matching gazetteer : lowercase, sans accents, espaces collapsés."""
import unicodedata
s = s.lower().strip()
s = unicodedata.normalize("NFD", s)
s = "".join(c for c in s if unicodedata.category(c) != "Mn")
s = re.sub(r"[^a-z0-9\s\-]", " ", s)
s = re.sub(r"\s+", " ", s).strip()
return s
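# Example behaviour (illustrative):
#   _normalize_for_matching("Côte Basque")      -> "cote basque"
#   _normalize_for_matching("Les  Girandières") -> "les girandieres"
# Accents are stripped via NFD decomposition and whitespace is collapsed, so this
# helper suits set/automaton lookups; it does not preserve character positions.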
def _load_finess_gazetteers():
"""Charge les gazetteers FINESS (établissements, numéros, téléphones, villes, Aho-Corasick)."""
global _FINESS_NUMBERS, _FINESS_ETAB_NAMES, _FINESS_TELEPHONES, _FINESS_VILLES, _FINESS_AC
data_dir = Path(__file__).parent / "data" / "finess"
# Numéros FINESS
finess_path = data_dir / "finess_numbers.txt"
if finess_path.exists():
try:
_FINESS_NUMBERS = {
line.strip() for line in finess_path.read_text(encoding="utf-8").splitlines()
if line.strip()
}
log.info(f"Gazetteer FINESS numéros: {len(_FINESS_NUMBERS)} entrées")
except Exception as e:
log.warning(f"Erreur chargement FINESS numéros: {e}")
# Noms d'établissements complets (pour debug/référence)
noms_path = data_dir / "etablissements_noms.txt"
if noms_path.exists():
try:
_FINESS_ETAB_NAMES = {
line.strip().lower() for line in noms_path.read_text(encoding="utf-8").splitlines()
if line.strip() and len(line.strip()) >= 6
}
log.info(f"Gazetteer FINESS noms: {len(_FINESS_ETAB_NAMES)} entrées")
except Exception as e:
log.warning(f"Erreur chargement FINESS noms: {e}")
# Noms distinctifs : chargement différé (Aho-Corasick construit au premier appel,
# car _MEDICAL_STOP_WORDS_SET n'est pas encore défini à ce stade du module)
# Villes FINESS
villes_path = data_dir / "villes_finess.txt"
if villes_path.exists():
try:
_FINESS_VILLES = {
line.strip() for line in villes_path.read_text(encoding="utf-8").splitlines()
if line.strip() and len(line.strip()) >= 3
}
log.info(f"Gazetteer FINESS villes: {len(_FINESS_VILLES)} entrées")
except Exception as e:
log.warning(f"Erreur chargement FINESS villes: {e}")
# Téléphones (pour validation)
tel_path = data_dir / "telephones.txt"
if tel_path.exists():
try:
_FINESS_TELEPHONES = {
line.strip() for line in tel_path.read_text(encoding="utf-8").splitlines()
if line.strip()
}
log.info(f"Gazetteer FINESS téléphones: {len(_FINESS_TELEPHONES)} entrées")
except Exception as e:
log.warning(f"Erreur chargement FINESS téléphones: {e}")
_load_finess_gazetteers()
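# Minimal sketch (assumption; the deferred builder actually used at runtime is
# defined later in this module): constructing the FINESS Aho-Corasick automaton
# once the medical stop words are known, with the standard pyahocorasick API.
#
#   def _build_finess_automaton(distinct_names: set) -> "_ahocorasick.Automaton":
#       ac = _ahocorasick.Automaton()
#       for raw_name in distinct_names:
#           norm = _normalize_for_matching(raw_name)
#           if norm and norm not in _MEDICAL_STOP_WORDS_SET:
#               ac.add_word(norm, norm)   # key = payload = normalized name
#       ac.make_automaton()               # finalize before .iter() queries
#       return ac
#
# Lookup side: ac.iter(normalized_line) yields (end_index, payload) for every match.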
# ----------------- Whitelists Médicales -----------------
_MEDICAL_STRUCTURAL_TERMS = set()
_MEDICATION_WHITELIST = set()
def load_medical_whitelists():
"""Charge les whitelists médicales (termes structurels + médicaments)."""
global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST
# 1. Charger les termes médicaux structurels
config_path = Path("config/medical_terms_whitelist.yml")
if config_path.exists() and yaml:
try:
with open(config_path, 'r', encoding='utf-8') as f:
data = yaml.safe_load(f)
terms = data.get('medical_structural_terms', [])
_MEDICAL_STRUCTURAL_TERMS = {t.lower() for t in terms}
log.info(f"Whitelist termes médicaux chargée: {len(_MEDICAL_STRUCTURAL_TERMS)} termes")
except Exception as e:
log.warning(f"Erreur chargement whitelist médicale: {e}")
# 2. Charger la whitelist des médicaments (edsnlp + BDPM + manuels)
_MEDICATION_WHITELIST = _load_edsnlp_drug_names()
_MEDICATION_WHITELIST.update(_load_bdpm_medication_names())
# Ajouter médicaments manquants
additional_meds = {
"idacio", "salazopyrine", "infliximab", "apranax",
"ketoprofene", "prevenar", "pneumovax", "bétadine"
}
_MEDICATION_WHITELIST.update(additional_meds)
log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)")
# Charger les whitelists au démarrage du module
load_medical_whitelists()
# ----------------- Defaults & Config -----------------
DEFAULTS_CFG = {
"version": 1,
"encoding": "utf-8",
"normalization": "NFKC",
"whitelist": {
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
"org_gpe_keep": False,
},
"blacklist": {
"force_mask_terms": [],
"force_mask_regex": [],
},
"kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
"regex_overrides": [
{
"name": "OGC_court",
"pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
"placeholder": "[OGC]",
"flags": ["IGNORECASE"],
}
],
"flags": {
"case_insensitive": True,
"unicode_word_boundaries": True,
"regex_engine": "python",
},
}
PLACEHOLDERS = {
"EMAIL": "[EMAIL]",
"TEL": "[TEL]",
"IBAN": "[IBAN]",
"NIR": "[NIR]",
"IPP": "[IPP]",
"FINESS": "[FINESS]",
"OGC": "[OGC]",
"NOM": "[NOM]",
"VILLE": "[VILLE]",
"ETAB": "[ETABLISSEMENT]",
"MASK": "[MASK]",
"DATE": "[DATE]",
"DATE_NAISSANCE": "[DATE_NAISSANCE]",
"ADRESSE": "[ADRESSE]",
"CODE_POSTAL": "[CODE_POSTAL]",
"AGE": "[AGE]",
"DOSSIER": "[DOSSIER]",
"NDA": "[NDA]",
"EPISODE": "[EPISODE]",
"RPPS": "[RPPS]",
}
CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"}
# Baseline regex
RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)")
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b")
RE_IPP = re.compile(r"\b(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
RE_CSULT = re.compile(r"\b(?:N°\s*Csult|N°\s*Interv)\s*[:\-]?\s*(\d{6,})\b", re.IGNORECASE)
RE_FINESS = re.compile(r"\b(?:N°\s*)?FINESS?\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
RE_RPPS = re.compile(r"\b(?:N°\s*)?RPPS\s*[:\-]?\s*(\d{8,11})\b", re.IGNORECASE)
# NIR: the department group accepts 2A/2B (Corsica) so that validate_nir's
# 2A->19 / 2B->18 substitution can actually be exercised.
RE_NIR = re.compile(
r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2])\s*(\d{2,3}|2[AB])\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
re.IGNORECASE,
)
def validate_nir(nir_raw: str) -> bool:
"""Vérifie la clé modulo 97 d'un NIR (13 chiffres + 2 clé). Supporte la Corse (2A/2B)."""
digits_only = re.sub(r"\s+", "", nir_raw)
if len(digits_only) < 15:
return False
body_str = digits_only[:13]
key_str = digits_only[13:15]
# Corse : 2A → 19, 2B → 18 (pour le calcul)
body_str_calc = body_str.upper().replace("2A", "19").replace("2B", "18")
try:
body_int = int(body_str_calc)
key_int = int(key_str)
except ValueError:
return False
return key_int == (97 - (body_int % 97))
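# Worked example (synthetic NIR, illustration only): body 1 85 05 78 006 084
# -> 1850578006084 % 97 == 6, expected key = 97 - 6 = 91, so
# validate_nir("1 85 05 78 006 084 91") is True and any other key is rejected.
# Corsican NIRs go through the 2A->19 / 2B->18 substitution before the same check.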
# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes
_MEDICAL_STOP_WORDS_SET = {
# Mots français courants (déterminants, prépositions, adverbes, etc.)
"pas", "mon", "bien", "ancien", "ancienne", "bon", "bonne", "tout", "tous",
"mais", "donc", "car", "que", "qui", "avec", "dans", "pour", "sur", "par",
"les", "des", "une", "est", "son", "ses", "nos", "aux", "cette", "ces",
"cher", "chez", "entre", "sans", "sous", "vers", "selon", "après", "avant",
"puis", "aussi", "très", "plus", "moins", "peu", "non", "oui", "quelques",
"mise", "début", "fin", "suite", "fait", "lieu", "cas", "jour", "jours",
"semaine", "semaines", "mois", "temps", "place", "nouvelle", "nouveau",
"franche", "légère", "quelque", "depuis", "comme", "encore", "votre",
"date", "note", "notes", "nom", "heure", "matin", "soir", "midi",
"signé", "réalisé", "courrier", "cabinet", "rue",
# Verbes / participes courants
"remontée", "associée", "réalisée", "débuté", "prolongé", "prolongée",
"prescrit", "prescrite", "présente", "présent", "absente", "absent",
"reprise", "introduction", "arrêt", "relais",
# Titres / rôles hospitaliers
"chef", "assistant", "assistante", "praticien", "praticienne",
"docteur", "professeur", "hospitalier", "hospitalière", "hospitaliers",
"spécialiste", "contractuel", "contractuelle", "titulaire",
"confrère", "consoeur", "coordonnateur", "coordonnatrice",
"médecin", "médical", "infirmier", "infirmière",
"praticiens", "patient", "patiente",
# Structure hospitalière
"service", "pôle", "clinique", "consultation", "secrétariat",
"hôpital", "hôpitaux", "centre", "établissement", "polyclinique",
# Villes / géographie (pas des noms de personnes)
"bordeaux", "bayonne", "paris", "lyon", "lille", "marseille",
"toulouse", "nantes", "montpellier", "pessac", "biarritz", "soustons",
"basque", "basques", "sud", "côte",
# Médicaments génériques et spécialités (DCI + noms commerciaux)
"colchicine", "aspirine", "cortancyl", "bisoprolol", "entresto",
"methotrexate", "eplerenone", "speciafoldine", "prednisone",
"corticoïdes", "cortisone",
"paracetamol", "metformine", "solupred", "novorapid", "abasaglar",
"lovenox", "methylprednisolone", "potassium", "humalog", "furosemide",
"insuline", "trulicity", "forxiga", "atorvastatine", "amlodipine",
"ondansetron", "eliquis", "nebivolol", "gaviscon", "loxen",
"morphine", "oxycodone", "kardegic", "tercian", "zopiclone",
"seresta", "tramadol", "alprazolam", "forlax", "levothyrox",
"bromazepam", "gliclazide", "zymad", "pravastatine", "spiriva",
"quetiapine", "sertraline", "crestor", "lercanidipine", "amoxicilline",
"opocalcium", "ferinject", "candesartan", "ceftriaxone", "calcidose",
"laroxyl", "brintellix", "ketoprofene", "adrenaline", "exacyl",
"terbutaline", "ipratropium", "actiskenan", "vialebex", "oxynormoro",
"lansoprazole", "perindopril", "sodium", "velmetia",
"doliprane", "dafalgan", "efferalgan", "spasfon", "vogalene",
"augmentin", "inexium", "omeprazole", "pantoprazole", "esomeprazole",
"ramipril", "lisinopril", "enalapril", "losartan", "valsartan",
"irbesartan", "olmesartan", "telmisartan", "hydrochlorothiazide",
"spironolactone", "furosemide", "lasilix", "aldactone",
"tahor", "crestor", "rosuvastatine", "simvastatine", "fluvastatine",
"xarelto", "pradaxa", "apixaban", "rivaroxaban", "dabigatran",
"plavix", "clopidogrel", "ticagrelor", "brilique",
"ventoline", "seretide", "symbicort", "salmeterol", "fluticasone",
"salbutamol", "tiotropium", "budesonide", "beclometasone",
"oxycodone", "oxynorm", "skenan", "actiskenan", "fentanyl",
"nubain", "nalbuphine", "nefopam", "acupan", "profenid",
"ibuprofene", "diclofenac", "naproxene", "celecoxib",
"gabapentine", "pregabaline", "lyrica", "neurontin",
"amitriptyline", "duloxetine", "venlafaxine", "fluoxetine",
"paroxetine", "escitalopram", "citalopram", "mirtazapine",
"olanzapine", "risperidone", "aripiprazole", "haloperidol",
"loxapine", "cyamemazine", "diazepam", "oxazepam", "lorazepam",
"clonazepam", "midazolam", "hydroxyzine", "atarax", "melatonine",
"stilnox", "zolpidem", "imovane",
"levothyroxine", "metformine", "glimepiride", "sitagliptine",
"januvia", "jardiance", "empagliflozine", "dapagliflozine",
"ozempic", "semaglutide", "dulaglutide", "liraglutide", "victoza",
"heparine", "enoxaparine", "tinzaparine", "innohep",
"warfarine", "coumadine", "fluindione", "previscan",
"ciprofloxacine", "levofloxacine", "ofloxacine", "metronidazole",
"vancomycine", "gentamicine", "tazocilline", "piperacilline",
"meropenem", "imipenem", "clindamycine", "doxycycline",
"azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
"polyionique", "propranolol", "apidra", "solostar",
# Noms et suffixes laboratoires pharmaceutiques
"arw", "myl", "myp", "arg", "teva", "bga", "agt",
"mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers",
"accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed",
"evolugen", "alter", "zydus", "medisol", "substipharm",
"sdz", "bgr", "egt", "rnb",
# Formes galéniques / voies d'administration
"cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
"flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
"unidose", "perf", "inh", "seringue", "aerosol", "sach", "pdr",
"orodisp", "capsule", "patch", "suppositoire", "gouttes",
# Termes de prescription / pharmacie
"prescription", "prescriptions", "dose", "fréquence", "statut",
"technique", "capteur", "bandelettes", "glycemiques", "glycemique",
"lancettes", "aiguilles", "fines", "micro", "pompe", "réserve",
"glycemie", "capillaire", "hgt",
# Termes médicaux / cliniques
"myocardite", "myosite", "corticothérapie", "biopsie", "pathologie",
"dysimmunitaire", "récidive", "récidivante", "traitement", "diagnostic",
"antécédents", "examen", "bilan", "résultats", "analyse",
"interne", "externe", "médecine", "chirurgie", "rhumatologie",
"dermatologie", "immunologie", "cardiologie", "pneumologie",
"neurologie", "gynécologie", "radiologie", "sénologie",
"douleur", "douleurs", "douloureux", "musculaire", "musculaires",
"thoracique", "thoraciques", "membres", "supérieurs", "inférieurs",
"normale", "normaux", "habituelle", "habituelles",
"synthèse", "hospitalisation", "syndrome", "vaccination", "ophtalmo",
"pelvien", "diabétique", "sommeil", "régime", "diet",
"desinfection", "environnement", "identification", "bracelet",
"toilettes", "accompagner", "installer", "transfusion",
"signes", "vitaux", "alimentaire", "avis", "zone",
"calcémie",
# Abréviations médicales
"irm", "ett", "ecg", "mtx", "fevg", "bdc", "crp", "sfu", "hdj",
"bnp", "asat", "alat", "cpk", "ctc", "hba", "hba1c",
"saos", "tsh", "inr", "vgm", "pnn", "plq", "hb",
"poc", "bax", "act", "bic", "cfx", "acc", "ado", "acf", "vfo",
"qvl", "cci", "pse", "pca", "chl", "crt", "bbm", "pds", "ren",
"vit", "zen",
"scanner", "radio", "écho", "échographie",
# Spécialités médicales (éviter faux positifs NOM)
"hépato-gastro-entérologue", "gastro-entérologue", "gastro-entérologie",
"proctologue", "oncologue", "anesthésiste", "pneumologue", "gérontologue",
"cardiologue", "néphrologue", "urologue", "gériatre",
"hépatologue", "endocrinologue", "stomatologue",
# Termes médicaux / titres fréquemment détectés comme NOM par le NER
"supplémentation", "supplementation", "endocrinologie", "monsieur", "madame",
"suivi", "sortie", "emog", "ophtalmo",
# Médicaments détectés comme NOM/PRENOM par EDS-Pseudo
"eliquis", "trulicity", "saos", "wind", "taxotere", "eupantol", "ezetimibe",
"lansoyl", "xatral", "xenetix", "trimbow", "buspirone", "cetirizine",
"depakote", "versatis", "durogesic", "montelukast", "metformine", "viatris",
"rosuvastatine", "gliclazide", "amlodipine", "perindopril", "nebivolol",
"pravastatine", "bisoprolol", "amoxicilline", "kardegic", "lovenox",
# Termes médicaux / soins / actes détectés comme NOM
"partielle", "cutanee", "cutané", "cutanée", "osseuse", "diabetique",
"diabétique", "transdermique", "transderm", "diarrhees", "diarrhées",
"ionogramme", "scintigraphie", "thoraco", "thorax", "négative", "negative",
"diététicienne", "pressurise", "pressuriser", "inhalee", "inhalée", "inhal",
# Mots courants français détectés comme NOM dans les trackare
"toilette", "repas", "poche", "installation", "education", "éducation",
"refection", "réfection", "complete", "complète", "regime", "régime",
"normal", "traité", "traite", "arrêté", "arrete", "volume",
"commentaires", "france", "covid", "framboise", "epoux", "époux",
# Abréviations médicales courtes (3-4 chars) détectées comme NOM
"ide", "ipp", "pcr", "tap", "gel", "ahl", "ssr", "hds", "tca", "etp",
"mcg", "sdz", "iao", "ser", "orod", "clav", "disp", "cart", "atcd", "mdrd",
"amox", "endoc", "microg", "item", "pyélo", "néphro",
# En-têtes de colonnes / mots structurels trackare
"observations", "observation", "commentaires", "commentaire",
"surveillance", "température", "temperature", "glycémie", "glycemie",
"diurèse", "diurese", "balance", "pouls", "systolique", "diastolique",
"saturation", "fréquence", "frequence", "respiratoire", "douleur",
"alertes", "alerte", "antécédents", "antecedents", "habitus",
"allergies", "prescriptions", "prescription", "administration",
"catégorie", "categorie", "expiration", "message",
"destination", "diagnostique", "diagnostiques",
"date", "note", "nom", "heure", "type", "code", "etat",
"comprime", "comprimé", "gelule", "gélule", "solution", "injectable",
# Médicaments supplémentaires détectés dans les trackare
"depakote", "versatis", "humalog", "forxiga", "durogesic",
"montelukast", "rosuvastatine",
# Abréviations pharma courtes
"cpr", "sol", "bic", "agt", "poche", "inhal",
# Termes chirurgicaux/cliniques FP
"cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée",
"gauche", "droit", "droite", "face", "profil",
# Faux positifs EDS supplémentaires
"psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
"axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
"10mg", "20mg", "40mg", "100mg", "300ui", "500ml", "innohep", "coaprovel",
"actiskenan", "simvastatine", "forlax",
# Mots temporels / contextuels détectés comme EDS_HOPITAL
"semaine", "jour", "matin", "soir", "nuit", "midi",
# Mots clés de contexte document
"compétences", "maladies", "inflammatoires", "systémiques", "rares",
"fret", "fax", "contexte", "résultat", "resultat", "résultats", "resultats",
"haute", "maison", "aide", "rpps", "poste", "fonct",
"sante", "santé", "etxe", "ttipi", "gastro", "concha",
"endoscopie", "endoscopique", "fibroscopie",
"indication", "conclusion", "technique", "anesthésie",
"digestif", "digestive", "digestives", "nutritive",
# Abréviations soins trackare détectées comme NOM (batch 20 OGC)
"soins", "lit", "jeun", "lever", "pose", "surv", "ggt", "vvp",
"verif", "crop", "evs", "maco", "pan", "cet", "trou", "nit", "nfs",
# Mots narratifs CRH capturés par fusion sidebar 2-colonnes
"evolution", "évolution", "explorations", "fermeture", "allergie", "allergies",
"lotissement", "cholangiographie", "cholecystectomie", "cholécystectomie",
"paracetamol", "paracétamol", "unité", "unite",
# FP résiduels batch 10 OGC (termes médicaux/instructions soins)
"glyc", "glycosurie", "vider", "forte",
# FP audit batch 59 OGC (mots courants/médicaux flagués comme NOM)
"oncologie", "confrères", "confrere", "doubles", "chers", "motif",
"responsable", "autre", "autres", "autonome", "autonomes",
"préparations", "preparations", "prévenir", "prevenir",
"acétylsalicylique", "acetylsalicylique", "angio",
"desc", "diu", "barreau",
"haitz", "alde",
# FP audit OGC 21 — termes médicaux/courants flagués NOM_GLOBAL
"alimentation", "augmentation", "amelioration", "amélioration",
"biliaire", "biliaires", "bili", "voies", "voie",
"apyrexie", "apyréxie", "apyrétique", "apyretique",
"clavulanique", "mecillinam", "sulfamides", "sulfamide",
"tazobactam", "temocilline", "ecoflac", "furanes", "furane",
"exilar", "lipruzet", "mopral",
"sensible", "sensibles", "dossier", "dossiers",
"entero", "entéro", "medecine", "bio",
"aviation", "contention", "isolement",
"elimination", "élimination", "infectieux",
"hémodynamique", "hemodynamique", "pancréatite", "pancreatite",
"cholecystite", "cholécystite", "cholécystectomie", "cholecystectomie",
"appendicectomie", "néoplasie", "neoplasie",
"ovarienne", "prandial", "fébrile", "febrile",
"eupnéique", "eupneique", "normocarde", "normotendue",
"variable", "dosage", "posologie",
# Abréviations diététiques/soins trackare
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
# FP audit OGC 17 CRH
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
"saint-palais", "tarnos", "hendaye", "dax", "orthez", "oloron", "pau", "cambo",
# Spécialités/services récurrents comme FP NOM
"cancérologie", "cancerologie", "réanimation", "reanimation",
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
"gériatrie", "geriatrie", "pédiatrie", "pediatrie",
"ophtalmologie", "stomatologie", "allergologie",
"kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie",
"orthopédie", "orthopedie", "traumatologie",
"palliatifs", "palliative", "palliatif",
"addictologie", "alcoologie", "tabacologie",
# FP soignants trackare (mots courants capturés par patterns Note d'évolution / Signé / Flacon)
"discussion", "echelle", "échelle", "scope", "tdm", "bouteille",
"evendol", "relais", "repas", "poursuite", "indication",
# FP pattern timestamp (termes ALL-CAPS capturés par "HH:MM NOM")
"eliminatin", "elimination", "élimination", "preremplie", "pré-remplie",
"thermie", "alim", "alimentation", "admin",
# Médicaments/tests labo capturés par patterns soignants
"biprofenid", "bi-profenid", "phosphatase", "phosphatases",
"ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol",
"ciprofloxacine", "lavement", "desinfection", "désinfection",
"avaler", "rachis", "lombaire", "thoraco-lombaire",
"cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique",
"thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire",
# Dosages et labos pharma (FP fréquents dans prescriptions Trackare)
"faible", "fort", "forte",
"myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg",
"arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris",
"abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal",
"entree", "entrée", "continu", "continue",
"morphine", "claforan", "skenan", "actiskenan",
# Fragments de noms de médicaments (pdfplumber split)
"sium", "pegic", "fenid", "profenid",
# Catégories cliniques Trackare (en-têtes de section masqués à tort)
"respi", "respiratoire", "nephro", "cardio", "neuro", "onco", "pulmo",
"hemato", "hémato", "infectieux", "thermie", "diurese", "diurèse",
"transit", "anemie", "anémie", "constantes", "examen",
"post-op", "postop", "pré-op", "preop", "chimio", "elim",
"toilette", "sommeil", "hypota", "hypotension", "spo2",
"urine", "urines", "sng",
"rénale", "renale", "rénal", "renal", "cardiaque",
# Termes structurels trackare
"transmissions", "transmission", "releve", "relevé",
"objectif", "objectifs", "evaluation", "évaluation",
"planification", "planifié", "planifiee",
# ── FP détectés automatiquement par audit_fp_detector.py ──
# Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms
"acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
"bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
"devenir", "diffusé", "douche", "entrée", "escarre", "espace",
"explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
"germes", "glace", "habillage", "liste", "maquillage", "matelas",
"mettre", "obésité", "ongles", "palais", "perlant", "pertes",
"pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
"tenue", "texte", "transaminases", "transit", "transmis", "urinal",
"vernis", "vessie", "vrac",
# Lot 2 : termes médicaux (préfixes/suffixes)
"anatomo-pathologique", "anemie", "anémie", "angioscanner",
"cétonurie", "cetonurie", "depilation", "dépilation",
"folique", "gastroentérologue", "gastroenterologue",
"microgrammes", "nalidixique", "naso-gastrique",
"angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
"cyto", "plaie-colle", "bionolyte",
# Lot 1 (103 tokens, confiance >= 0.5) ──
# Anatomie / clinique
"abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
"intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
"plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
# Pathologies / symptômes
"algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
"hemodialyse", "hemorragique", "hyperthermie", "hématologue",
# Médicaments / matériel médical
"ampoule", "antalgique", "antiseptique", "compresse", "flacon",
"oxygène", "pansement", "vitamine",
# Biologie / examens
"biochimie", "biologie", "fer",
# Actions / états cliniques
"ablation", "absence", "admission", "bloc", "changement", "cliniquement",
"cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
"intervention", "position", "rappel", "relation", "retour", "réalisation",
"résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
"urgent", "validation",
# Mots courants / contextuels
"angle", "bille", "boisson", "bureau", "cases", "circuit",
"concubin", "confortable", "demain", "densité", "dernière",
"distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
"hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
"personne", "premier", "quartier", "retraite", "route", "rés",
"trouve", "verrouillé", "villa", "étage",
# Termes médicaux courants faussement détectés comme NOM (Phase 2 audit mars 2026)
"ains", "ponction", "hanche", "burkitt", "orl", "gds", "oap", "tvp", "epp",
"bronchite", "accueil", "cadre", "transfert", "relecture", "examens",
"traitements", "traitement", "infectiologie", "cancérologie", "cancerologie",
"maternité", "orale", "sachet", "absence",
# FP audit 30 fichiers Phase 2 (mars 2026)
"bouffee", "bouffée", "discontinue", "respimat", "lyoc",
"probnp", "pro-bnp", "nt-probnp",
"bpco", "colle", "gsc", "masse",
"selle", "selles",
}
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
_MEDICAL_STOP_WORDS = (
r"(?:" + "|".join(re.escape(w) for w in _MEDICAL_STOP_WORDS_SET) + r")"
)
# Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point)
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
RE_PERSON_CONTEXT = re.compile(
r"(?:(?:\bDr\.?|\bDR\.?|\bDocteur|\bPr\.?|\bProfesseur|\bMme|\bMME|\bMadame|\bM\.|\bMr\.?|\bMonsieur"
r"|\bNom[ \t]*:[ \t]*"
r"|\bRédigé[ \t]+par|\bValidé[ \t]+par|\bSigné[ \t]+par|\bSaisi[ \t]+par|\bRéalisé[ \t]+par"
r")[ \t]+)"
rf"({_PERSON_TOKEN}(?:[ \t]+{_PERSON_TOKEN}){{0,2}})" # Max 3 mots, pas de newline
)
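# Example (illustrative): on "Validé par Jean DUPONT le 12/03", group(1) captures
# "Jean DUPONT" (at most three capitalised tokens, no newline crossing); the
# lowercase "le" ends the capture. Medical stop-word filtering is applied later,
# in _mask_line_by_regex, not by the regex itself.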
# Noms en MAJUSCULES dans des listes virgulées (ex: "le Dr X, Y, LAZARO")
RE_DR_COMMA_LIST = re.compile(
r"(?:Dr\.?|DR\.?|Docteur)\s+"
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+"
r"(?:\s*,\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+)+",
re.IGNORECASE,
)
# Token nom : mot commençant par une majuscule d'au moins 3 lettres
_NAME_TOKEN_RE = re.compile(r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']{2,}")
SPLITTER = re.compile(r"\s*[:|;\t]\s*")
# --- Extraction globale de noms depuis champs structurés ---
_UC_NAME_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
RE_EXTRACT_PATIENT = re.compile(
r"Patient\(?e?\)?\s*:\s*"
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)"
r"(?=\s+Né|\s+né|\s+N°|\s*$)",
re.MULTILINE,
)
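# Example (illustrative): "Patient : MARTIN Sophie Née le 01/01/1950" -> group(1)
# is "MARTIN Sophie"; the lookahead on "Né / né / N° / end of line" keeps the
# birth-date fragment out of the captured name.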
# Champs d'identité structurés (documents trackare / DPI)
RE_EXTRACT_NOM_NAISSANCE = re.compile(
r"Nom\s+de\s+naissance\s*:\s*"
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s+IPP|\s*$)",
re.MULTILINE,
)
RE_EXTRACT_NOM_PRENOM = re.compile(
r"Nom\s+et\s+Pr[ée]nom\s*:\s*"
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s+Date|\s+Né|\s*$)",
re.MULTILINE,
)
RE_EXTRACT_LIEU_NAISSANCE = re.compile(
r"Lieu\s+de\s+naissance\s*:\s*"
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)",
re.MULTILINE,
)
RE_EXTRACT_VILLE_RESIDENCE = re.compile(
r"Ville\s+de\s+r[ée]sidence\s*:\s*"
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)",
re.MULTILINE,
)
# Contacts structurés : Conjoint/Concubin/Epoux/Epouse/Parent + NOM PRENOM
RE_EXTRACT_CONTACT = re.compile(
r"(?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur)\s+"
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+)"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+))?",
)
RE_EXTRACT_REDIGE = re.compile(
r"(?:Rédigé|Validé|Signé|Saisi)[ \t]+par[ \t]+"
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# Token nom composé : JEAN-PIERRE, CAZELLES-BOUDIER, etc.
_UC_COMPOUND = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*"
RE_EXTRACT_MME_MR = re.compile(
r"(?:MME|Mme|Madame|Monsieur|Mr?\.?)\s+"
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?"
rf"((?:{_UC_COMPOUND})(?:\s+(?:{_UC_COMPOUND}))*)",
)
_INITIAL_OPT = r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?"
RE_EXTRACT_DR_DEST = re.compile(
r"(?:DR\.?|Dr\.?|Docteur)[ \t]+"
+ _INITIAL_OPT +
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# Noms du personnel médical après un rôle : "Aide : Marie-Paule BORDABERRY"
RE_EXTRACT_STAFF_ROLE = re.compile(
r"(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH?|Cadre[ \t]+Infirmier"
r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)[ \t]*:?[ \t]*"
r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:[ \t]*-[ \t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?[ \t]+)?"
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[ \t\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})",
)
# "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL"
RE_EXTRACT_PR = re.compile(
r"(?:Pr\.?|Professeur)[ \t]+"
+ _INITIAL_OPT +
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# "Opérateur : Docteur X. NOM", "Anesthésiste(s) Docteur J. NOM",
# "Opérateur : Dr J.-M. NOM", "Anesthésiste : NOM"
RE_EXTRACT_OPERATEUR = re.compile(
r"(?:Op[ée]rateur|Anesth[ée]siste\(?s?\)?|Chirurgien)[ \t]*:?[ \t]*"
r"(?:(?:Docteur|Dr\.?|Pr\.?)[ \t]+)?"
+ _INITIAL_OPT +
rf"((?:{_UC_COMPOUND})(?:[ \t]+(?:{_UC_COMPOUND})){{0,2}})",
)
# Téléphone avec extension slash : 05.59.44.38.32/34
RE_TEL_SLASH = re.compile(
r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?:/\d{1,4})(?!\d)"
)
CID_PATTERN = re.compile(r"\(cid:\d+\)")
# --- Nouvelles regex : dates, adresses, âges, dossiers ---
_MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)"
RE_DATE_NAISSANCE = re.compile(
r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})",
re.IGNORECASE,
)
RE_DATE = re.compile(
r"\b(\d{1,2})\s*[/.\-]\s*(\d{1,2})\s*[/.\-]\s*(\d{4})\b"
r"|"
r"\b(\d{1,2})\s+" + _MOIS_FR + r"\s+(\d{4})\b",
re.IGNORECASE,
)
RE_ADRESSE = re.compile(
r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*"
r"(?:rue|avenue|av\.?|boulevard|bd\.?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence"
r"|lotissement|lot\.?|cit[ée]|hameau|quartier|voie|parvis|esplanade|promenade|côte)"
r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}",
re.IGNORECASE,
)
RE_CODE_POSTAL = re.compile(
r"(?:(?:[Cc]ode\s*[Pp]ostal|CP)\s*[:\-]?\s*(\d{5}))"
r"|"
# 5 chiffres + nom de ville (Title Case ou MAJUSCULES), pas précédé d'un chiffre (évite RPPS)
# Exclure les unités médicales (UI, mg, ml, etc.) via negative lookahead
r"(?:(?<!\d)(\d{5})[ \t]+(?!UI\b|mg\b|ml\b|µg\b)[A-ZÉÈÀÙ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû]+"
r"(?:[\s\-][A-ZÉÈÀÙ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû]+)*"
r"(?:\s+CEDEX)?)",
)
RE_BP = re.compile(
r"(?:[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû\.\-]+\s+)?BP\s+\d+",
re.IGNORECASE,
)
RE_AGE = re.compile(
r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+(?:de\s+)?|(?:,\s*|\(\s*)"
r")(\d{1,3})\s*(?:ans|A)\b",
re.IGNORECASE,
)
# Établissements de santé : sigles longs peuvent être seuls, sigles courts (CH/CHS) nécessitent un nom
_ETAB_NAME = (r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)")
RE_ETABLISSEMENT = re.compile(
r"\b("
# Sigles longs : acceptés seuls ou avec nom
r"(?:EHPAD|SSR/USLD|SSR|USLD|HAD|CSAPA|CMPP|CMP|UGA|CHRU|CHU|HIA|CLCC|GHT|GCS)"
+ _ETAB_NAME + r"*"
r"|"
# Sigles courts (CH, CHS) : obligent un nom après pour éviter les faux positifs
r"(?:CHS|CH)" + _ETAB_NAME + r"+"
r")",
)
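# Examples (illustrative): "EHPAD Bayonne" and "CHU de Bordeaux" match; a bare "CH"
# (e.g. in a column header) does not, because the short acronyms CH/CHS require at
# least one following name token.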
RE_HOPITAL_VILLE = re.compile(
r"(?<![Ee]xamen )"
r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier"
r"|[Cc]entre\s+[Mm][ée]dical|[Cc]entre\s+[Dd]e\s+[Ss]oins|[Mm]aison\s+[Dd]e\s+[Ss]anté"
r"|[Mm]aison\s+[Dd]e\s+[Rr]etraite|[Rr]ésidence|[Ff]oyer|[Pp]harmacie)"
r"\s+(?:de\s+|d['']\s*|du\s+|des\s+)?(?:la\s+|le\s+|l['']\s*|les\s+)?"
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?(?:la\s+|le\s+|l['']\s*|les\s+)?"
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
)
RE_SERVICE = re.compile(
r"\b((?:[Ss]ervice|[Uu]nité|[Pp]ôle|[Dd]épartement)\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
)
RE_NUMERO_DOSSIER = re.compile(
r"(?:\bdossier|\bn°\s*dossier|\bNDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})"
r"|"
r"(?:\bréférence|\bréf\.)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})",
re.IGNORECASE,
)
RE_EPISODE = re.compile(
r"\s*[ÉéEe]pisode\s*[:\-]?\s*([A-Za-z0-9\-]{4,})"
r"|"
r"[ÉéEe]pisode\s*N[o°.]?\s*\.?\s*:?\s*(\d{5,})",
re.IGNORECASE,
)
@dataclass
class PiiHit:
page: int
kind: str
original: str
placeholder: str
bbox_hint: Optional[Tuple[float, float, float, float]] = None
@dataclass
class AnonResult:
text_out: str
tables_block: str
audit: List[PiiHit] = field(default_factory=list)
is_trackare: bool = False
# ----------------- Config loader -----------------
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
cfg = DEFAULTS_CFG.copy()
if config_path and config_path.exists() and yaml is not None:
try:
user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
for k, v in user.items():
cfg[k] = v
except Exception:
pass
return cfg
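# Hypothetical override file (illustration; the path is whatever the caller passes):
# each top-level key shallow-replaces the corresponding DEFAULTS_CFG entry.
#
#   # dictionaries.yml
#   blacklist:
#     force_mask_terms: ["Polyclinique Exemple"]
#     force_mask_regex: ['\bUF\s*\d{4}\b']
#   regex_overrides:
#     - name: "badge"
#       pattern: '\bBADGE\s*:?\s*\d{4,}\b'
#       placeholder: "[DOSSIER]"
#       flags: ["IGNORECASE"]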
# ----------------- Extraction -----------------
_doctr_model_cache = None
def _get_doctr_model():
global _doctr_model_cache
if _doctr_model_cache is None:
_doctr_model_cache = _doctr_ocr_predictor(
det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True
)
return _doctr_model_cache
def _extract_page_layout_aware(page) -> str:
"""Extrait le texte d'une page PyMuPDF en gérant les layouts multi-colonnes.
Détecte si la page a un sidebar/colonne gauche parallèle à un corps droit
(typique des CRH/CRO hospitaliers). Si oui, lit chaque colonne séparément
pour éviter l'entrelacement du texte.
"""
blocks = page.get_text("blocks")
text_blocks = []
for b in blocks:
x0, y0, x1, y1, text, _block_no, block_type = b
if block_type == 0 and text.strip():
text_blocks.append((x0, y0, x1, y1, text.strip()))
if not text_blocks:
return ""
page_w = page.rect.width
page_h = page.rect.height
# --- Détection de colonnes ---
# Cherche une ligne verticale split_x qui sépare les blocs en deux groupes
# parallèles (chevauchement vertical significatif).
best_split = None
best_score = -1
for split_x in range(int(page_w * 0.15), int(page_w * 0.45), 3):
left = [b for b in text_blocks if b[2] <= split_x + 5]
right = [b for b in text_blocks if b[0] >= split_x - 5]
crossing = [b for b in text_blocks if b[0] < split_x - 5 and b[2] > split_x + 5]
if len(left) < 3 or len(right) < 3:
continue
left_span = max(b[3] for b in left) - min(b[1] for b in left)
right_span = max(b[3] for b in right) - min(b[1] for b in right)
if left_span < page_h * 0.25 or right_span < page_h * 0.25:
continue
overlap_min = max(min(b[1] for b in left), min(b[1] for b in right))
overlap_max = min(max(b[3] for b in left), max(b[3] for b in right))
if overlap_max - overlap_min < page_h * 0.15:
continue
score = len(left) + len(right) - 5 * len(crossing)
if score > best_score:
best_score = score
best_split = split_x
if best_split is not None:
left_blocks = sorted(
[b for b in text_blocks if b[2] <= best_split + 5], key=lambda b: b[1]
)
right_blocks = sorted(
[b for b in text_blocks if b[0] >= best_split - 5], key=lambda b: b[1]
)
full_width = sorted(
[b for b in text_blocks if b[0] < best_split - 5 and b[2] > best_split + 5],
key=lambda b: b[1],
)
col_start_y = min(
min((b[1] for b in left_blocks), default=page_h),
min((b[1] for b in right_blocks), default=page_h),
)
headers = [b for b in full_width if b[1] < col_start_y + 5]
footers = [b for b in full_width if b[1] >= col_start_y + 5]
parts = []
for b in headers:
parts.append(b[4])
for b in left_blocks:
parts.append(b[4])
for b in right_blocks:
parts.append(b[4])
for b in footers:
parts.append(b[4])
return "\n".join(parts)
else:
sorted_blocks = sorted(text_blocks, key=lambda b: (b[1], b[0]))
return "\n".join(b[4] for b in sorted_blocks)
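# Worked example of the column scoring (illustrative numbers): on an A4 page
# (~595 pt wide), split_x is scanned from int(595*0.15)=89 to int(595*0.45)=267
# (exclusive) in 3 pt steps. A candidate leaving 8 blocks on the left, 12 on the
# right and 1 block straddling the split scores 8 + 12 - 5*1 = 15; the best split
# wins, otherwise the page falls back to a plain top-to-bottom / left-to-right
# block order.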
def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List[str]], bool, OcrWordMap]:
"""Extraction texte multi-passes avec fallback OCR (docTR).
Retourne (pages_text, tables_lines, ocr_used, ocr_word_map).
Passe 1 : PyMuPDF layout-aware (blocs avec détection de colonnes)
Passe 1b: pdfplumber si PyMuPDF échoue ou donne peu de texte
Passe 2 : pdfminer si CID ou texte pauvre
Passe 3 : OCR docTR si PDF scanné (très peu de texte)
Tables : toujours extraites via pdfplumber (indépendamment du texte).
"""
pages_text: List[str] = []
tables_lines: List[List[str]] = []
ocr_used = False
# --- Tables : toujours via pdfplumber ---
with pdfplumber.open(pdf_path) as pdf:
for p in pdf.pages:
rows: List[str] = []
try:
tables = p.extract_tables()
for tbl in tables or []:
for row in tbl:
clean = [c if c is not None else "" for c in row]
rows.append("\t".join(clean).strip())
except Exception:
pass
tables_lines.append(rows)
# --- Passe 1 : PyMuPDF layout-aware (détection multi-colonnes) ---
if fitz is not None:
try:
doc = fitz.open(str(pdf_path))
pages_text = [_extract_page_layout_aware(doc[i]) for i in range(len(doc))]
doc.close()
except Exception:
pass
# --- Passe 1b : pdfplumber si PyMuPDF n'a rien donné ---
total_chars = sum(len(x or "") for x in pages_text)
if total_chars < 500:
try:
with pdfplumber.open(pdf_path) as pdf:
pp_pages = [p.extract_text(x_tolerance=2.5, y_tolerance=4.0) or "" for p in pdf.pages]
if sum(len(x) for x in pp_pages) > total_chars:
pages_text = pp_pages
except Exception:
pass
# --- Passe 2 : pdfminer si CID ou texte pauvre ---
total_chars = sum(len(x or "") for x in pages_text)
need_fallback = total_chars < 500
if not need_fallback:
need_fallback = any(CID_PATTERN.search(x or "") for x in pages_text)
if need_fallback:
try:
text_all = pdfminer_extract_text(
str(pdf_path),
laparams=LAParams(char_margin=2.0, word_margin=0.1, line_margin=0.8, boxes_flow=0.5),
)
split = [x for x in text_all.split("\f") if x]
if split and sum(len(x) for x in split) > total_chars:
pages_text = split
except Exception:
pass
# --- Passe 3 : OCR docTR si PDF scanné (très peu de texte) ---
total_chars = sum(len(x or "") for x in pages_text)
ocr_word_map: OcrWordMap = {}
if total_chars < 200 and _DOCTR_AVAILABLE and fitz is not None:
try:
model = _get_doctr_model()
doc = fitz.open(str(pdf_path))
ocr_pages: List[str] = []
import numpy as np
for i in range(len(doc)):
pix = doc[i].get_pixmap(dpi=300)
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
result = model([np.array(img)])
page_text = ""
page_words: List[Tuple[str, float, float, float, float]] = []
for block in result.pages[0].blocks:
for line in block.lines:
for w in line.words:
(x0, y0), (x1, y1) = w.geometry
page_words.append((w.value, x0, y0, x1, y1))
page_text += " ".join(w.value for w in line.words) + "\n"
ocr_word_map[i] = page_words
ocr_pages.append(page_text)
doc.close()
if sum(len(p) for p in ocr_pages) > total_chars:
pages_text = ocr_pages
ocr_used = True
else:
ocr_word_map = {}
except Exception:
ocr_word_map = {}
return pages_text, tables_lines, ocr_used, ocr_word_map
# Alias pour compatibilité ascendante
def extract_text_three_passes(pdf_path: Path):
pages_text, tables_lines, _, _ = extract_text_with_fallback_ocr(pdf_path)
return pages_text, tables_lines
# ----------------- Helpers -----------------
def _compile_user_regex(pattern: str, flags_list: List[str]):
flags = 0
for f in flags_list or []:
u = f.upper()
if u == "IGNORECASE": flags |= re.IGNORECASE
if u == "MULTILINE": flags |= re.MULTILINE
if u == "DOTALL": flags |= re.DOTALL
return re.compile(pattern, flags)
def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
for ov in cfg.get("regex_overrides", []) or []:
pattern = ov.get("pattern"); placeholder = ov.get("placeholder", PLACEHOLDERS["MASK"]) ; name = ov.get("name", "override")
flags_list = ov.get("flags", [])
try:
rx = _compile_user_regex(pattern, flags_list)
except Exception:
continue
def _rep(m: re.Match):
audit.append(PiiHit(page_idx, name, m.group(0), placeholder))
return placeholder
line = rx.sub(_rep, line)
# force-mask literals
for term in (cfg.get("blacklist", {}).get("force_mask_terms", []) or []):
if not term: continue
word_rx = re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)
if word_rx.search(line):
audit.append(PiiHit(page_idx, "force_term", term, PLACEHOLDERS["MASK"]))
line = word_rx.sub(PLACEHOLDERS["MASK"], line)
# force-mask regex
for pat in (cfg.get("blacklist", {}).get("force_mask_regex", []) or []):
try:
rx = re.compile(pat, re.IGNORECASE)
except Exception:
continue
if rx.search(line):
audit.append(PiiHit(page_idx, "force_regex", pat, PLACEHOLDERS["MASK"]))
line = rx.sub(PLACEHOLDERS["MASK"], line)
return line
RE_BARE_9DIGITS = re.compile(r"\b(\d{9})\b")
def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
m = RE_FINESS.search(line)
if m:
val = m.group(1); audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"]))
return RE_FINESS.sub(lambda _: f"FINESS : {PLACEHOLDERS['FINESS']}", line)
# Détection FINESS par gazetteer : nombre 9 chiffres qui matche un vrai numéro FINESS
if _FINESS_NUMBERS:
for m9 in RE_BARE_9DIGITS.finditer(line):
if m9.group(1) in _FINESS_NUMBERS:
val = m9.group(1)
audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"]))
line = line.replace(val, PLACEHOLDERS["FINESS"], 1)
return line
m = RE_OGC.search(line)
if m:
val = m.group(1); audit.append(PiiHit(page_idx, "OGC", val, PLACEHOLDERS["OGC"]))
return RE_OGC.sub(lambda _: f"N° OGC : {PLACEHOLDERS['OGC']}", line)
m = RE_IPP.search(line)
if m:
val = m.group(1); audit.append(PiiHit(page_idx, "IPP", val, PLACEHOLDERS["IPP"]))
return RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line)
m = RE_CSULT.search(line)
if m:
val = m.group(1); audit.append(PiiHit(page_idx, "DOSSIER", val, PLACEHOLDERS["DOSSIER"]))
return RE_CSULT.sub(lambda _: f"N° : {PLACEHOLDERS['DOSSIER']}", line)
m = RE_RPPS.search(line)
if m:
val = m.group(1); audit.append(PiiHit(page_idx, "RPPS", val, PLACEHOLDERS["RPPS"]))
return RE_RPPS.sub(lambda _: f"RPPS : {PLACEHOLDERS['RPPS']}", line)
return line
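# Examples (illustrative): labelled administrative identifiers keep their label and
# lose their value:
#   _mask_admin_label("FINESS : 123456789", audit, 0) -> "FINESS : [FINESS]"
#   _mask_admin_label("N° OGC : 12-A", audit, 0)      -> "N° OGC : [OGC]"
# A bare 9-digit number is only masked when it appears in the FINESS gazetteer.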
def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
# user overrides & force-masks d'abord
line = _apply_overrides(line, audit, page_idx, cfg)
# EMAIL
def _repl_email(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
return PLACEHOLDERS["EMAIL"]
line = RE_EMAIL.sub(_repl_email, line)
# URL hospitalière (www.ch-xxx.fr, www.hopital-xxx.fr, etc.)
_re_url_hospital = re.compile(r"(?:https?://)?www\.[a-z0-9\-]+\.(?:fr|com|org)(?:/[^\s]*)?", re.IGNORECASE)
m_url = _re_url_hospital.search(line)
if m_url:
audit.append(PiiHit(page_idx, "ETAB", m_url.group(0), PLACEHOLDERS["ETAB"]))
line = line[:m_url.start()] + PLACEHOLDERS["ETAB"] + line[m_url.end():]
# TEL
def _repl_tel(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
return PLACEHOLDERS["TEL"]
line = RE_TEL_SLASH.sub(_repl_tel, line) # slash d'abord (plus spécifique)
line = RE_TEL.sub(_repl_tel, line)
line = RE_TEL_COMPACT.sub(_repl_tel, line)
# IBAN
def _repl_iban(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "IBAN", m.group(0), PLACEHOLDERS["IBAN"]))
return PLACEHOLDERS["IBAN"]
line = RE_IBAN.sub(_repl_iban, line)
# NIR (avec validation clé modulo 97)
def _repl_nir(m: re.Match) -> str:
raw = m.group(0)
if not validate_nir(raw):
return raw # faux positif, on ne masque pas
audit.append(PiiHit(page_idx, "NIR", raw, PLACEHOLDERS["NIR"]))
return PLACEHOLDERS["NIR"]
line = RE_NIR.sub(_repl_nir, line)
# DATE_NAISSANCE (plus spécifique, avant DATE générique)
def _repl_date_naissance(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "DATE_NAISSANCE", m.group(0), PLACEHOLDERS["DATE_NAISSANCE"]))
return PLACEHOLDERS["DATE_NAISSANCE"]
line = RE_DATE_NAISSANCE.sub(_repl_date_naissance, line)
# DATE générique — désactivé : seules les dates de naissance sont masquées
# def _repl_date(m: re.Match) -> str:
# audit.append(PiiHit(page_idx, "DATE", m.group(0), PLACEHOLDERS["DATE"]))
# return PLACEHOLDERS["DATE"]
# line = RE_DATE.sub(_repl_date, line)
# ADRESSE
def _repl_adresse(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "ADRESSE", m.group(0), PLACEHOLDERS["ADRESSE"]))
return PLACEHOLDERS["ADRESSE"]
line = RE_ADRESSE.sub(_repl_adresse, line)
# BOITE POSTALE (BP)
def _repl_bp(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "ADRESSE", m.group(0), PLACEHOLDERS["ADRESSE"]))
return PLACEHOLDERS["ADRESSE"]
line = RE_BP.sub(_repl_bp, line)
# CODE_POSTAL
def _repl_code_postal(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "CODE_POSTAL", m.group(0), PLACEHOLDERS["CODE_POSTAL"]))
return PLACEHOLDERS["CODE_POSTAL"]
line = RE_CODE_POSTAL.sub(_repl_code_postal, line)
# AGE
def _repl_age(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "AGE", m.group(0), PLACEHOLDERS["AGE"]))
return PLACEHOLDERS["AGE"]
line = RE_AGE.sub(_repl_age, line)
# NUMERO DOSSIER / NDA
def _repl_dossier(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "DOSSIER", m.group(0), PLACEHOLDERS["DOSSIER"]))
return PLACEHOLDERS["DOSSIER"]
line = RE_NUMERO_DOSSIER.sub(_repl_dossier, line)
# N° EPISODE / Episode N. (pieds de page Trackare)
def _repl_episode(m: re.Match) -> str:
val = m.group(1) or m.group(2) or m.group(0)
audit.append(PiiHit(page_idx, "EPISODE", val, PLACEHOLDERS["EPISODE"]))
# Reconstruire le remplacement en gardant le préfixe et masquant la valeur
full = m.group(0)
return full[:full.find(val)] + PLACEHOLDERS["EPISODE"]
line = RE_EPISODE.sub(_repl_episode, line)
# Établissements de santé (EHPAD Bayonne, SSR La Concha, Hôpital de Bayonne, etc.)
def _repl_etab(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "ETAB", m.group(0), PLACEHOLDERS["ETAB"]))
return PLACEHOLDERS["ETAB"]
line = RE_ETABLISSEMENT.sub(_repl_etab, line)
line = RE_HOPITAL_VILLE.sub(_repl_etab, line)
# Établissements par gazetteer Aho-Corasick FINESS (116K noms distinctifs)
if _FINESS_AC is not None:
old_line = line
line = _mask_finess_establishments(line)
if line != old_line:
# Enregistrer les hits dans l'audit
# (on ne peut pas facilement savoir quels noms ont matché,
# mais on log le fait qu'un match gazetteer a eu lieu)
audit.append(PiiHit(page_idx, "ETAB_FINESS", "gazetteer", PLACEHOLDERS["ETAB"]))
# Services hospitaliers (service de Cardiologie, unité de soins palliatifs, etc.)
def _repl_service(m: re.Match) -> str:
full_match = m.group(0)
# Vérifier si c'est un terme structurel à préserver
if full_match.lower() in _MEDICAL_STRUCTURAL_TERMS:
return full_match
# Vérifier le contexte avant (Chef de, Praticien, etc.)
start_pos = m.start()
context_before = line[max(0, start_pos-25):start_pos].lower()
# Patterns à préserver
preserve_patterns = ['chef de', 'praticien', 'ancien', 'assistant', 'médecin', 'interne']
if any(pattern in context_before for pattern in preserve_patterns):
return full_match
audit.append(PiiHit(page_idx, "ETAB", full_match, PLACEHOLDERS["MASK"]))
return PLACEHOLDERS["MASK"]
line = RE_SERVICE.sub(_repl_service, line)
# Ville en en-tête de courrier : "Bayonne, le 12/03/2024" → masquer la ville
# Le contexte "Mot, le [date]" est fiable (virgule obligatoire)
# Autorise les mots de liaison minuscules (de, du, la, sur, en, lès)
_re_ville_date = re.compile(
r"^(\s*)"
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç\-]+"
r"(?:\s+(?:de|du|la|sur|en|lès|les|l['']\s*)?"
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)*)"
r"(\s*,\s+le\s+\d{1,2})",
re.MULTILINE,
)
def _repl_ville_date(m: re.Match) -> str:
ville = m.group(2).strip()
audit.append(PiiHit(page_idx, "VILLE", ville, PLACEHOLDERS["VILLE"]))
return m.group(1) + PLACEHOLDERS["VILLE"] + m.group(3)
line = _re_ville_date.sub(_repl_ville_date, line)
# Champs structurés : Lieu de naissance, Ville de résidence (masquage direct, sans filtre stop words)
_re_lieu = re.compile(r"(Lieu\s+de\s+naissance\s*:\s*)(\S.+)")
def _repl_lieu(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "VILLE", m.group(2).strip(), PLACEHOLDERS["VILLE"]))
return m.group(1) + PLACEHOLDERS["VILLE"]
line = _re_lieu.sub(_repl_lieu, line)
_re_ville_res = re.compile(r"(Ville\s+de\s+r[ée]sidence\s*:\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+)")
def _repl_ville_res(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "VILLE", m.group(2).strip(), PLACEHOLDERS["VILLE"]))
return m.group(1) + PLACEHOLDERS["VILLE"]
line = _re_ville_res.sub(_repl_ville_res, line)
# PERSON uppercase avec contexte, whitelist/acronymes courts
wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])
_stop_rx = re.compile(_MEDICAL_STOP_WORDS, re.IGNORECASE)
def _clean_name_span(span: str) -> str:
"""Tronque le span au premier mot médical/stop word."""
tokens = span.split()
clean = []
for t in tokens:
if _stop_rx.fullmatch(t):
break
clean.append(t)
return " ".join(clean).strip(" .-'")
def _repl_person_ctx(m: re.Match) -> str:
span = m.group(1).strip(); raw = m.group(0)
if span in wl_sections or raw in wl_phrases: return raw
# Tronquer avant les mots médicaux
cleaned = _clean_name_span(span)
if not cleaned:
return raw
tokens = [t for t in cleaned.split() if t]
if len(tokens) == 1 and len(tokens[0]) <= 3: return raw
audit.append(PiiHit(page_idx, "NOM", cleaned, PLACEHOLDERS["NOM"]))
return raw.replace(cleaned, PLACEHOLDERS["NOM"])
line = RE_PERSON_CONTEXT.sub(_repl_person_ctx, line)
# Passe supplémentaire : noms dans des listes virgulées après "Dr"
# ex: "le Dr DUVAL, MACHELART, LAZARO" → masquer chaque nom
for m in RE_DR_COMMA_LIST.finditer(line):
fragment = m.group(0)
# Extraire les segments séparés par des virgules (sauf le premier qui inclut "Dr")
parts = [p.strip() for p in fragment.split(",")]
for part in parts:
# Extraire les tokens nom de chaque segment
for tok in _NAME_TOKEN_RE.findall(part):
if tok in wl_sections or len(tok) <= 2:
continue
if _stop_rx.fullmatch(tok):
continue
if tok not in line:
continue
# Vérifier qu'il n'est pas déjà masqué
if f"[{tok}]" in line or tok in {v for v in PLACEHOLDERS.values()}:
continue
audit.append(PiiHit(page_idx, "NOM", tok, PLACEHOLDERS["NOM"]))
line = re.sub(rf"\b{re.escape(tok)}\b", PLACEHOLDERS["NOM"], line)
return line
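# Example (illustrative, with DEFAULTS_CFG): rule-based masking of a narrative line.
#   _mask_line_by_regex("Contact : dupont@exemple.fr ou 06 12 34 56 78", audit, 0, DEFAULTS_CFG)
#       -> "Contact : [EMAIL] ou [TEL]"   (+ one EMAIL hit and one TEL hit in audit)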
def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
"""Masque les TEL et EMAIL même dans la partie 'clé' d'une ligne clé:valeur."""
def _repl_tel(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
return PLACEHOLDERS["TEL"]
key = RE_TEL_SLASH.sub(_repl_tel, key)
key = RE_TEL.sub(_repl_tel, key)
key = RE_TEL_COMPACT.sub(_repl_tel, key)
def _repl_email(m: re.Match) -> str:
audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
return PLACEHOLDERS["EMAIL"]
key = RE_EMAIL.sub(_repl_email, key)
return key
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
line = _mask_admin_label(line, audit, page_idx)
parts = SPLITTER.split(line, maxsplit=1)
if len(parts) == 2:
key, value = parts
masked_key = _mask_critical_in_key(key, audit, page_idx)
masked_val = _mask_line_by_regex(value, audit, page_idx, cfg)
return f"{masked_key.strip()} : {masked_val.strip()}"
else:
return _mask_line_by_regex(line, audit, page_idx, cfg)
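# Exemple indicatif (ligne et placeholder hypothétiques) : la clé ne passe que par
# _mask_critical_in_key (TEL/EMAIL), la valeur par l'ensemble des regex.
# >>> _kv_value_only_mask("Tél. professionnel : 05 59 44 35 35", [], 0, {})
# 'Tél. professionnel : [TEL]'   # attendu, en supposant PLACEHOLDERS["TEL"] == "[TEL]"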
# ----------------- Extraction globale de noms -----------------
def _is_trackare_document(text: str) -> bool:
"""Détecte si le document est un export Trackare/TrakCare (DPI structuré)."""
markers = ["Détails des patients", "Nom de naissance", "Dossier Patient"]
t = text[:3000].lower()
return sum(1 for m in markers if m.lower() in t) >= 2
def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set]:
"""Parse les champs structurés d'un document Trackare pour extraire les PII.
Retourne (name_tokens, pii_hits, force_names) : les noms à masquer, les hits
additionnels et les noms issus de contextes fiables qui bypassent les stop words."""
names: set = set()
hits: List[PiiHit] = []
force_names: set = set() # noms issus de contextes structurés (DR., Signé, etc.) → bypass stop words
def _add_name(s: str):
s = s.strip()
parts = s.split()
for tok in parts:
tok = tok.strip(" .-'(),")
if len(tok) >= 2 and tok[0].isupper():
names.add(tok)
# Garder aussi le nom composé complet (DI LULLO, LE MOIGNE, etc.)
if len(parts) >= 2:
compound = " ".join(t.strip(" .-'(),") for t in parts if len(t.strip(" .-'(),")) >= 2)
if len(compound) >= 5:
names.add(compound)
# Termes non-noms fréquents dans les contextes Signé/DR./Note d'évolution
_FORCE_EXCLUDE = _MEDICATION_WHITELIST | {
"elimination", "élimination", "forte", "intraveineuse", "lavage",
"sonde", "normal", "réalisé", "realise", "germes", "bbm", "arw",
"orale", "sachet", "injectable", "comprime", "comprimé", "gelule",
"gélule", "seringue", "poche", "flacon", "ampoule", "preremplie",
"préremplie",
}
def _add_name_force(tok: str):
"""Ajoute un nom depuis un contexte structuré fiable (DR., Signé direct, Note d'évolution).
Bypass les stop words généraux mais filtre médicaments et termes de soins courants."""
tok = tok.strip(" .-'(),")
if len(tok) < 3 or not tok[0].isupper():
return
if tok.lower() in _FORCE_EXCLUDE:
return
# Filtre supplémentaire : ne pas force-add les mots médicaux connus
if tok.lower() in _MEDICAL_STOP_WORDS_SET:
return
names.add(tok)
force_names.add(tok)
# --- Identité patient ---
# Nom de naissance: DIEGO (peut apparaître 2x : en-tête + récap tabulaire)
for m in re.finditer(r"Nom\s+de\s+naissance\s*:\s*(.+?)(?:\s+IPP\b|\s*$)", full_text, re.MULTILINE):
_add_name(m.group(1).strip())
# Nom et Prénom: DIEGO PATRICIA
for m in re.finditer(r"Nom\s+et\s+Pr[ée]nom\s*:\s*(.+?)(?:\s+Date\s+de\s+naissance|\s*$)", full_text, re.MULTILINE):
_add_name(m.group(1).strip())
# Prénom de naissance / Prénom utilisé : REGINA
for m in re.finditer(r"Pr[ée]nom\s+(?:de\s+naissance|utilis[ée])\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\s\-']+?)(?:\s*$)", full_text, re.MULTILINE):
_add_name(m.group(1).strip())
# Lieu de naissance: BAYONNE, biarritz, 64102, 99999 → masquer comme VILLE
for m in re.finditer(r"Lieu\s+de\s+naissance\s*:\s*(\S[^\n]*?)(?:\s*$)", full_text, re.MULTILINE):
val = m.group(1).strip()
if val:
hits.append(PiiHit(-1, "VILLE", val, PLACEHOLDERS["VILLE"]))
# Ajouter au set names seulement si alphabétique (pas les codes INSEE numériques)
if re.match(r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç]", val):
names.add(val)
# Ville de résidence: TARNOS → masquer comme VILLE
for m in re.finditer(r"Ville\s+de\s+r[ée]sidence\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû\s\-']+?)(?:\s*$)", full_text, re.MULTILINE):
val = m.group(1).strip()
hits.append(PiiHit(-1, "VILLE", val, PLACEHOLDERS["VILLE"]))
names.add(val)
# Code Postal (toutes occurrences)
for m in re.finditer(r"[Cc]ode\s*[Pp]ostal\s*:\s*(\d{5})", full_text):
hits.append(PiiHit(-1, "CODE_POSTAL", m.group(1), PLACEHOLDERS["CODE_POSTAL"]))
# N° épisode (= NDA, identifiant de séjour)
for m in re.finditer(r"Episode\s*N[o°.]?\s*\.?\s*:\s*(\d{5,})", full_text):
hits.append(PiiHit(-1, "EPISODE", m.group(1), PLACEHOLDERS.get("NDA", "[NDA]")))
# RPPS isolés (11 chiffres commençant par 1 ou 2, seul sur une ligne ou en fin de ligne)
for m in re.finditer(r"^\s*([12]\d{10})\s*$", full_text, re.MULTILINE):
hits.append(PiiHit(-1, "RPPS", m.group(1), PLACEHOLDERS["RPPS"]))
# Adresse patient (toutes les occurrences)
for m in re.finditer(r"Adresse\s*:\s*(.+?)(?:\s+Ville\s+de\s+r[ée]sidence|\s*$)", full_text, re.MULTILINE):
val = m.group(1).strip()
if len(val) > 3:
hits.append(PiiHit(-1, "ADRESSE", val, PLACEHOLDERS["ADRESSE"]))
# --- Pied de page : "Patient : NOM PRENOM - Date de naissance..." ---
for m in re.finditer(r"Patient\s*:\s*(.+?)\s*-\s*Date\s+de\s+naissance", full_text):
_add_name(m.group(1).strip())
# --- Médecin courant (toutes occurrences) ---
for m in re.finditer(r"Médecin\s+courant\s*:\s*(?:DR\.?\s*)?(.+?)(?:\s*$)", full_text, re.MULTILINE):
_add_name(m.group(1).strip())
# --- Médecin traitant (ligne après "Nom Adresse Téléphone") ---
for m in re.finditer(r"Médecin\s+traitant\s*\n.*?Nom\s+Adresse\s+Téléphone\s*\n\s*(?:DR\.?\s*)?(.+?)(?:\d{5}|\s*$)", full_text, re.MULTILINE):
_add_name(m.group(1).strip())
# --- Contacts structurés ---
# Pattern: Relation NOM PRENOM [ADRESSE] [TEL]
# Accepte les minuscules (Trackare écrit parfois "Conjoint vandestock michele")
# Capture jusqu'à 3 tokens pour les noms composés (le moigne christophe)
for m in re.finditer(
r"(?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur)\s+"
r"([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+)"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?",
full_text,
):
contact_parts = [g.strip(" .-'(),") for g in (m.group(1), m.group(2), m.group(3)) if g]
# Ajouter chaque token >= 3 chars (pas les articles courts comme "le", "di")
for tok in contact_parts:
if len(tok) >= 3 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
names.add(tok)
if tok[0].islower():
names.add(tok.capitalize())
# Ajouter aussi le composé complet (pour "le moigne", "di lullo")
if len(contact_parts) >= 2:
compound = " ".join(contact_parts)
if len(compound) >= 5:
names.add(compound)
# Version capitalisée pour propagation
names.add(" ".join(t.capitalize() for t in compound.split()))
# --- Prescripteurs / Exécutants (trackare) ---
for m in re.finditer(
r"(?:Prescripteur|Prescrit\s+par|Exécut[ée]\s+par|Réalisé\s+par)\s*:?\s*"
r"(?:(?:Dr|Pr)\.?\s+)?"
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+))?",
full_text,
):
_add_name(m.group(1))
if m.group(2):
_add_name(m.group(2))
# --- Médecins urgences (IAO, prise en charge, décision) ---
for m in re.finditer(r"IAO\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)", full_text):
_add_name(m.group(1))
for m in re.finditer(
r"Médecin\s+de\s+la\s+(?:prise\s+en\s+charge|décision)\s+médicale\s+"
r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)"
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+))?",
full_text,
):
_add_name(m.group(1))
if m.group(2):
_add_name(m.group(2))
# --- Noms soignants dans les Notes d'évolution / Notes IDE / Notes médicales ---
# Pattern: "Note IDE\nPrenom NOM" ou "Note d'évolution\nPrenom NOM"
for m in re.finditer(
r"Note\s+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])\s*\n\s*"
r"([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][a-zéèàùâêîôûäëïöüç]+)\s+"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)",
full_text
):
prenom, nom = m.group(1), m.group(2)
if prenom.lower() not in _MEDICAL_STOP_WORDS_SET:
_add_name(prenom)
if nom.lower() not in _MEDICAL_STOP_WORDS_SET:
_add_name(nom)
# --- Noms soignants multi-lignes : "Prénom\nNOM" dans les tableaux de prescriptions/soins ---
for m in re.finditer(
r'\b([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})\s*\n\s*([A-ZÉÈÀÙÂÊÎÔÛ]{3,})\b',
full_text
):
prenom, nom = m.group(1), m.group(2)
if prenom.lower() not in _MEDICAL_STOP_WORDS_SET and nom.lower() not in _MEDICAL_STOP_WORDS_SET:
_add_name(prenom)
_add_name(nom)
# --- Noms soignants sur la même ligne que "Note d'évolution" (ex: "Note d'évolution LACLAU-") ---
for m in re.finditer(
r"Note[ \t]+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])[ \t]+"
r"(?:DR\.?[ \t]+)?"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
full_text
):
for g in (m.group(1), m.group(2)):
if g:
tok = g.rstrip('-')
if len(tok) >= 3:
_add_name_force(tok)
# --- "Signé" suivi directement d'un nom de soignant (ex: "Signé LARRIEU-") ---
# IMPORTANT: [ \t]+ (pas \s+) pour éviter de capturer les médicaments sur la ligne suivante
for m in re.finditer(
r"Signé[ \t]+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
full_text
):
for g in (m.group(1), m.group(2)):
if g:
tok = g.rstrip('-')
if len(tok) >= 3:
_add_name_force(tok)
# --- "Signé —" + médicament + nom soignant (ex: "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") ---
for m in re.finditer(
r"Signé[ \t]+—[ \t]+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)[ \t]+[-]?[ \t]*"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})"
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
full_text
):
for g in (m.group(1), m.group(2)):
if g:
tok = g.rstrip('-')
if len(tok) >= 3 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
_add_name(tok)
# --- Noms soignants après conditionnement médicament (ex: "Flacon(s) LACROUTS") ---
for m in re.finditer(
r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?[ \t]+"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})",
full_text
):
tok = m.group(1).rstrip('-')
if len(tok) >= 3 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
_add_name(tok)
# --- "DR." / "DR" suivi d'un prénom seul (ex: "DR. Ute", "DR. Tam") dans les prescriptions ---
for m in re.finditer(
r"DR\.?[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})"
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
full_text
):
for g in (m.group(1), m.group(2)):
if g:
tok = g.strip()
if len(tok) >= 3:
_add_name_force(tok)
# --- Noms soignants après timestamps dans activités de soins (ex: "07:00 ETCHEBARNE") ---
# Format Trackare : actions de soins suivies de "HH:MM NOM" ou "HH : MM NOM"
# Pattern restrictif : nom ALL-CAPS de 4+ lettres, filtre stop words (pattern bruyant)
for m in re.finditer(
r"\d{1,2}[ \t]*:[ \t]*\d{2}[ \t]+"
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})"
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
full_text
):
for g in (m.group(1), m.group(2)):
if g:
tok = g.rstrip('-')
if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
_add_name(tok)
# Filtrer les tokens trop courts ou stop words
# Exceptions : force_names (contextes structurés) et city_tokens (villes extraites)
city_tokens = {h.original for h in hits if h.kind == "VILLE"}
filtered = set()
for tok in names:
if tok in city_tokens or tok in force_names:
filtered.add(tok)
continue
if len(tok) < 3:
continue
if tok.lower() in _MEDICAL_STOP_WORDS_SET:
continue
filtered.add(tok)
return filtered, hits, force_names
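# Esquisse d'utilisation (extrait Trackare fictif) : la fonction renvoie un triplet
# (noms à masquer, hits additionnels, noms "forcés" bypassant les stop words).
# >>> noms, hits, forces = _extract_trackare_identity(
# ...     "Nom de naissance : DIEGO\nLieu de naissance : BAYONNE")
# >>> "DIEGO" in noms and any(h.kind == "VILLE" for h in hits)   # attendu : True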
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, set]:
"""Pré-scan du document brut pour extraire les noms de personnes
depuis les champs structurés (Patient, Rédigé par, etc.).
Retourne (names, force_names) : ensemble de tokens à masquer,
et sous-ensemble qui bypass les stop words."""
wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])
names: set = set()
force_names: set = set()
def _add_compound(match_str: str):
"""Ajoute le nom composé complet en plus des tokens individuels (DI LULLO, LE MOIGNE)."""
parts = [t.strip(" .-'") for t in match_str.split() if len(t.strip(" .-'")) >= 2]
if len(parts) >= 2:
compound = " ".join(parts)
if len(compound) >= 5:
names.add(compound)
def _add_tokens(match_str: str):
_add_compound(match_str)
for token in match_str.split():
token = token.strip(" .-'")
if len(token) < 3:
continue
if token.upper() in wl_sections or token in wl_phrases:
continue
if token.lower() in _MEDICAL_STOP_WORDS_SET:
continue
names.add(token)
def _add_tokens_force_all(match_str: str):
"""Bypass stop words pour TOUS les tokens (contexte Patient: très fiable)."""
_add_compound(match_str)
for token in match_str.split():
token = token.strip(" .-'")
if len(token) < 2:
continue
if token.upper() in wl_sections or token in wl_phrases:
continue
names.add(token)
force_names.add(token)
def _add_tokens_force_first(match_str):
"""Comme _add_tokens mais force le 1er token (contexte Dr/Mme fort)."""
_add_compound(match_str)
tokens = match_str.split()
for i, token in enumerate(tokens):
token = token.strip(" .-'")
if len(token) < 2:
continue
if token.upper() in wl_sections or token in wl_phrases:
continue
if token.lower() in _MEDICAL_STOP_WORDS_SET:
continue
if i == 0:
# Premier token après Dr/Mme : contexte fiable
names.add(token)
else:
if len(token) < 3:
continue
names.add(token)
for m in RE_EXTRACT_PATIENT.finditer(full_text):
_add_tokens_force_all(m.group(1))
for m in RE_EXTRACT_REDIGE.finditer(full_text):
_add_tokens(m.group(1))
for m in RE_EXTRACT_MME_MR.finditer(full_text):
_add_tokens_force_first(m.group(1))
for m in RE_EXTRACT_DR_DEST.finditer(full_text):
_add_tokens_force_first(m.group(1))
# Champs d'identité structurés (trackare / DPI)
for m in RE_EXTRACT_NOM_NAISSANCE.finditer(full_text):
_add_tokens(m.group(1))
for m in RE_EXTRACT_NOM_PRENOM.finditer(full_text):
_add_tokens(m.group(1))
for m in RE_EXTRACT_LIEU_NAISSANCE.finditer(full_text):
_add_tokens(m.group(1))
for m in RE_EXTRACT_VILLE_RESIDENCE.finditer(full_text):
_add_tokens(m.group(1))
# Contacts structurés (conjoint, concubin, etc.)
for m in RE_EXTRACT_CONTACT.finditer(full_text):
_add_tokens(m.group(1))
if m.group(2):
_add_tokens(m.group(2))
# Personnel médical avec rôle (Aide, Cadre Infirmier, Prescripteur, etc.)
for m in RE_EXTRACT_STAFF_ROLE.finditer(full_text):
_add_tokens(m.group(1))
# Pr / Professeur + nom(s)
for m in RE_EXTRACT_PR.finditer(full_text):
_add_tokens_force_first(m.group(1))
# Opérateur / Anesthésiste / Chirurgien + nom(s)
for m in RE_EXTRACT_OPERATEUR.finditer(full_text):
_add_tokens_force_first(m.group(1))
# Extraction des noms dans les listes virgulées après Dr/Docteur
# ex: "le Dr DUVAL, MACHELART, CHARLANNE, LAZARO, il a été proposé"
for m in RE_DR_COMMA_LIST.finditer(full_text):
fragment = m.group(0)
parts = [p.strip() for p in fragment.split(",")]
for part in parts:
for tok in _NAME_TOKEN_RE.findall(part):
tok = tok.strip(" .-'")
if len(tok) < 3:
continue
if tok.upper() in wl_sections or tok in wl_phrases:
continue
if tok.lower() in _MEDICAL_STOP_WORDS_SET:
continue
names.add(tok)
# Pour les noms composés avec tiret (ex: "LACLAU-LACROUTS"),
# ajouter aussi les parties individuelles pour capturer les occurrences standalone.
# _apply_extracted_names traite le composé en premier (plus long) puis les parties.
compound_names = {n for n in names if "-" in n}
for compound in compound_names:
for part in compound.split("-"):
part = part.strip()
if len(part) >= 3 and part.lower() not in _MEDICAL_STOP_WORDS_SET:
names.add(part)
return names, force_names
def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_names: Optional[set] = None) -> str:
"""Remplace globalement chaque nom extrait dans le texte."""
placeholder = PLACEHOLDERS["NOM"]
_force = force_names or set()
safe_names = {n for n in names if len(n) >= 3 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)}
# Ajouter un hit global (page=-1) par nom pour la redaction PDF raster
# (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page)
for token in sorted(safe_names, key=len, reverse=True):
audit.append(PiiHit(-1, "NOM_GLOBAL", token, placeholder))
for token in sorted(safe_names, key=len, reverse=True):
pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
new_text = []
last_end = 0
for m in pattern.finditer(text):
# Ne pas remplacer si déjà dans un placeholder
ctx_start = max(0, m.start() - 1)
ctx_end = min(len(text), m.end() + 1)
if "[" in text[ctx_start:m.start()] or "]" in text[m.end():ctx_end]:
continue
# Ne pas remplacer si le token fait partie d'un mot composé (tiret + lettre)
# Ex: "NOCENT-EJNAINI" → ne pas remplacer NOCENT seul
# Mais "LACLAU-" (tiret de troncature) → remplacer
if m.start() > 0 and text[m.start() - 1] == "-":
if m.start() >= 2 and text[m.start() - 2].isalpha():
continue
if m.end() < len(text) and text[m.end()] == "-":
if m.end() + 1 < len(text) and text[m.end() + 1].isalpha():
continue
# DÉSACTIVÉ: NOM_EXTRACTED génère 3,846 FP (77.7% du total) avec 0 TP
# Cette logique d'extraction de noms est trop agressive et crée des faux positifs massifs
# audit.append(PiiHit(-1, "NOM_EXTRACTED", m.group(0), placeholder))
new_text.append(text[last_end:m.start()])
new_text.append(placeholder)
last_end = m.end()
new_text.append(text[last_end:])
text = "".join(new_text)
return text
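# Exemple indicatif (en supposant PLACEHOLDERS["NOM"] == "[NOM]") : remplacement global
# par frontières de mots, sans toucher aux sous-parties de noms composés suivies d'une lettre.
# >>> _apply_extracted_names("Vu par DUPONT et DUPONT-MARTIN.", {"DUPONT"}, [])
# 'Vu par [NOM] et DUPONT-MARTIN.'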
def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
"""Applique les PiiHit non-NOM dans le texte (NDA footers, EPISODE, RPPS, FINESS, etc.).
Ces hits sont détectés par _extract_trackare_identity ou la phase 0c
mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt."""
_APPLY_KINDS = {"EPISODE", "RPPS", "FINESS"}
# Collecter les valeurs à remplacer, groupées par placeholder
replacements: Dict[str, str] = {} # original → placeholder
for h in audit:
if h.kind in _APPLY_KINDS and h.original and len(h.original.strip()) >= 4:
replacements[h.original.strip()] = h.placeholder
# Remplacer les plus longs d'abord (éviter les remplacements partiels)
for original in sorted(replacements, key=len, reverse=True):
placeholder = replacements[original]
escaped = re.escape(original)
# Word boundary pour ne pas casser les mots (ex: ONDANSETRON)
text = re.sub(rf"\b{escaped}\b", placeholder, text)
# Aussi gérer les formats avec astérisques (*640000162*)
text = re.sub(rf"\*{escaped}\*", placeholder, text)
return text
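# Exemple indicatif (hit EPISODE hypothétique, en supposant le placeholder "[NDA]") :
# le numéro est remplacé en mot isolé comme dans sa forme code-barres *N*.
# >>> hit = PiiHit(-1, "EPISODE", "640000162", "[NDA]")
# >>> _apply_trackare_hits_to_text("Episode N° : 640000162 / *640000162*", [hit])
# 'Episode N° : [NDA] / *[NDA]*'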
# ----------------- Anonymisation (regex) -----------------
def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult:
audit: List[PiiHit] = []
# Phase 0 : extraction globale des noms depuis les champs structurés
full_raw = "\n".join(pages_text) + "\n" + "\n".join(
"\n".join(rows) for rows in tables_lines
)
extracted_names, doc_force_names = _extract_document_names(full_raw, cfg)
# Phase 0b : si document Trackare, extraction renforcée des PII structurés
is_trackare = _is_trackare_document(full_raw)
trackare_force_names: set = set()
if is_trackare:
trackare_names, trackare_hits, trackare_force_names = _extract_trackare_identity(full_raw)
extracted_names.update(trackare_names)
audit.extend(trackare_hits)
# Fusionner les force_names des deux sources
all_force_names = doc_force_names | trackare_force_names
# Phase 0c : détection FINESS multiline (label et numéro sur lignes séparées,
# avec possiblement 0-2 lignes intermédiaires masquées ou vides)
_RE_FINESS_MULTILINE = re.compile(
r"(?:N°\s*)?[Ff]iness?\s*\n(?:[^\n]*\n){0,2}\s*\*?(\d{9})\*?", re.MULTILINE
)
for m in _RE_FINESS_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "FINESS", m.group(1), PLACEHOLDERS["FINESS"]))
# Phase 0d : date de naissance multiline (label et date sur lignes séparées)
# Ex: "Né(e) le :\n07/04/1943" ou "Date de naissance\n01/02/1950"
_RE_DATE_NAISSANCE_MULTILINE = re.compile(
r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n\s*"
r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})",
re.IGNORECASE,
)
for m in _RE_DATE_NAISSANCE_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "DATE_NAISSANCE", m.group(1), PLACEHOLDERS["DATE_NAISSANCE"]))
# Phase 0e : IPP multiline (N°Ipp :\n20023294 ou I.P.P. :\nS1032021)
_RE_IPP_MULTILINE = re.compile(
r"(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*\n\s*([A-Za-z0-9]{6,})\b",
re.IGNORECASE,
)
for m in _RE_IPP_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
# Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164)
_RE_DEMANDE_MULTILINE = re.compile(
r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
re.IGNORECASE,
)
for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
# Phase 1 : masquage ligne par ligne (regex classiques)
out_pages: List[str] = []
for i, page_txt in enumerate(pages_text):
lines = [ln for ln in (page_txt or "").splitlines()]
masked = [_kv_value_only_mask(ln, audit, i, cfg) for ln in lines]
out_pages.append("\n".join(masked))
table_blocks: List[str] = []
for i, rows in enumerate(tables_lines):
mbuf: List[str] = []
for r in rows:
masked = _kv_value_only_mask(r, audit, i, cfg)
mbuf.append(masked)
if mbuf:
table_blocks.append("\n".join(mbuf))
tables_block = "\n\n".join(table_blocks)
text_out = "\f".join(out_pages) # séparateur de pages
# NOTE: on n'ajoute PAS le bloc [TABLES] au text_out.
# pdfplumber extrait souvent le contenu principal comme "table", créant un doublon
# intégral du texte. Ce doublon échappait au NER et au rescan (protégé par les
# marqueurs [TABLES]), et le NER EDS-pseudo corrompait les marqueurs en changeant
# la longueur du texte → fuite PII massive (dates de naissance, adresses, noms).
# Les PII détectés dans les tables sont toujours dans l'audit (Phase 1 regex).
# Phase 2 : application globale des noms extraits (rattrapage)
if extracted_names:
text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=all_force_names)
# Phase 2b : application globale des PiiHit (EPISODE, RPPS, FINESS)
text_out = _apply_trackare_hits_to_text(text_out, audit)
return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit, is_trackare=is_trackare)
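# Esquisse d'utilisation (entrées hypothétiques) : les phases 0 à 2b s'enchaînent sur
# les pages texte et les lignes de tables extraites en amont par extract_text_with_fallback_ocr.
# >>> res = anonymise_document_regex(["Patient : DIEGO Regina"], [[]], cfg={})
# >>> res.text_out    # narratif pseudonymisé (pages séparées par \f)
# >>> res.audit       # liste de PiiHit alimentée par chaque phase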
# ----------------- NER ONNX sur narratif -----------------
def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
# remplace via regex sur les 'word' détectés (approche pragmatique)
keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", False))
def repl_once(s: str, old: str, new: str) -> str:
return re.sub(rf"\b{re.escape(old)}\b", new, s)
out = text
for e in ents:
w = e.get("word") or ""; grp = (e.get("entity_group") or e.get("entity") or "").upper()
if not w or "[" in w or "]" in w: # ignore placeholders
continue
if len(w) <= 2: # trop court
continue
if grp in {"PER", "PERSON"}:
audit.append(PiiHit(-1, "NER_PER", w, PLACEHOLDERS["NOM"]))
out = repl_once(out, w, PLACEHOLDERS["NOM"])
elif grp in {"ORG"}:
if keep_org_gpe:
continue
audit.append(PiiHit(-1, "NER_ORG", w, PLACEHOLDERS["ETAB"]))
out = repl_once(out, w, PLACEHOLDERS["ETAB"])
elif grp in {"LOC"}:
if keep_org_gpe:
continue
audit.append(PiiHit(-1, "NER_LOC", w, PLACEHOLDERS["VILLE"]))
out = repl_once(out, w, PLACEHOLDERS["VILLE"])
elif grp in {"DATE"}:
# facultatif : si vous masquez déjà les dates via règles, laissez tel quel
continue
return out
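# Exemple indicatif (entité HF hypothétique, en supposant PLACEHOLDERS["NOM"] == "[NOM]") :
# une entité PER est masquée ; ORG/LOC ne le sont que si org_gpe_keep est faux.
# >>> ents = [{"word": "Durand", "entity_group": "PER", "score": 0.98}]
# >>> _mask_with_hf("Vu par le Dr Durand.", ents, {}, [])
# 'Vu par le Dr [NOM].'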
def apply_hf_ner_on_narrative(text_out: str, cfg: Dict[str, Any], manager: Optional[NerModelManager], thresholds: Optional[NerThresholds]) -> Tuple[str, List[PiiHit]]:
if manager is None or not manager.is_loaded():
return text_out, []
# isoler [TABLES]
pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
keep = []
last = 0
cleaned = ""
for m in pattern.finditer(text_out):
cleaned += text_out[last:m.start()]
keep.append((len(cleaned), len(cleaned) + len(m.group(0)), m.group(0)))
cleaned += "\x00" * len(m.group(0))
last = m.end()
cleaned += text_out[last:]
# par pages (séparées par \f) → par paragraphes
pages = cleaned.split("\f")
hits: List[PiiHit] = []
rebuilt_pages: List[str] = []
for pg in pages:
paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
ents_per_para = manager.infer_paragraphs(paras, thresholds=thresholds)
# remplace entités
buf = []
for para, ents in zip(paras, ents_per_para):
masked = _mask_with_hf(para, ents, cfg, hits)
buf.append(masked)
rebuilt_pages.append("\n\n".join(buf))
rebuilt = "\f".join(rebuilt_pages)
# réinsérer [TABLES]
rebuilt_list = list(rebuilt)
for start, end, payload in keep:
rebuilt_list[start:end] = list(payload)
final = "".join(rebuilt_list)
return final, hits
# ----------------- NER EDS-Pseudo sur narratif -----------------
def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
"""Masque les entités détectées par EDS-Pseudo en utilisant le mapping eds_mapped_key."""
def repl_once(s: str, old: str, new: str) -> str:
return re.sub(rf"\b{re.escape(old)}\b", new, s)
out = text
for e in ents:
w = e.get("word") or ""
mapped_key = e.get("eds_mapped_key", "")
if not w or "[" in w or "]" in w:
continue
if len(w) <= 2:
continue
# Filtrer les faux positifs NOM/PRENOM (médicaments, acronymes médicaux)
label = e.get("entity_group", "EDS")
if label in ("NOM", "PRENOM", "HOPITAL", "VILLE"):
if w.lower() in _MEDICAL_STOP_WORDS_SET:
continue
# Filtrer aussi les tokens multi-mots dont un composant est un stop word
if " " in w and any(part.lower() in _MEDICAL_STOP_WORDS_SET for part in w.split()):
continue
# Filtrer les dosages détectés comme noms (ex: "10MG", "300UI", "1 000")
if re.match(r"^\d[\d\s]*(?:mg|MG|ml|ML|UI|µg|mcg|g|kg|%)?$", w.strip()):
continue
# CORRECTION 1.2: Filtrer les médicaments détectés comme NOM/PRENOM
if label in ("NOM", "PRENOM"):
# Vérifier si c'est un médicament connu
if w.lower() in _MEDICATION_WHITELIST:
continue
# Chantier 3+4+5 : Confiance NER + vote croisé GLiNER + CamemBERT-bio + gazetteers INSEE
# Sécurité d'abord : haute confiance NER → toujours masquer
# GLiNER/CamemBERT peuvent rejeter SEULEMENT si confiance NER basse
gliner_vote = e.get("gliner_confirmed") # True=PII, False=médical, None=neutre
camembert_vote = e.get("camembert_confirmed") # True=PII confirmé, False=non détecté, None=neutre
if label in ("NOM", "PRENOM"):
score = e.get("score", 1.0)
# Gazetteer INSEE : prénom connu = renforcement confiance (ne pas filtrer)
is_known_prenom = w.lower() in _INSEE_PRENOMS
if isinstance(score, float) and score < 0.70 and not is_known_prenom:
# Basse confiance NER + pas un prénom connu
if gliner_vote is False and camembert_vote is not True:
continue # GLiNER dit "médical" + CamemBERT ne confirme pas → skip
if score < 0.30 and camembert_vote is not True:
continue # Très basse confiance + CamemBERT ne confirme pas → skip
# Chantier 2 : Safe patterns contextuels (Philter-style)
# Token suivi/précédé de dosages ou formes pharma → jamais un nom de personne
pos = text.find(w)
if pos >= 0:
# Contexte MÊME LIGNE seulement ([ \t] pas \n)
line_start = text.rfind('\n', 0, pos)
line_start = 0 if line_start < 0 else line_start + 1
line_end = text.find('\n', pos + len(w))
line_end = len(text) if line_end < 0 else line_end
ctx_before = text[max(line_start, pos - 30):pos]
ctx_after = text[pos + len(w):min(line_end, pos + len(w) + 30)]
# Safe pattern: précédé ou suivi d'un dosage (mg, mL, UI, comprimé, etc.)
_RE_DOSAGE = r"\d+[ \t]*(?:mg|ml|ui|µg|mcg|g|kg|cp|cpr|gel|amp|fl|dos|inh)\b"
if re.search(_RE_DOSAGE, ctx_before, re.IGNORECASE):
continue
if re.search(_RE_DOSAGE, ctx_after, re.IGNORECASE):
continue
# Safe pattern: suivi d'une forme pharmaceutique
_RE_PHARMA_FORM = r"^\s*(?:comprim[ée]s?|g[ée]lules?|sachets?|ampoules?|flacons?|solutions?|injectable|suppo(?:sitoire)?s?|sirop|pommade|cr[eè]me|gouttes?|patch|inhal)"
if re.search(_RE_PHARMA_FORM, ctx_after, re.IGNORECASE):
continue
# Safe pattern: précédé de "taux de", "score de", "dosage de"
if re.search(r"(?:taux|score|dosage|indice|index|grade|stade|type)\s+(?:de\s+)?$", ctx_before, re.IGNORECASE):
continue
elif label == "HOPITAL":
_STRUCTURAL_WORDS = {"SERVICE", "POLE", "PÔLE", "UNITE", "UNITÉ", "SECTEUR"}
if len(w) < 5:
continue
if w.upper() in _STRUCTURAL_WORDS:
continue
placeholder = PLACEHOLDERS.get(mapped_key, PLACEHOLDERS["MASK"])
audit.append(PiiHit(-1, f"EDS_{label}", w, placeholder))
out = repl_once(out, w, placeholder)
return out
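# Exemple indicatif (entité EDS hypothétique) : une entité NOM à basse confiance,
# rejetée par GLiNER et non confirmée par CamemBERT, n'est pas masquée (le contexte
# dosage "1000 mg" suffirait de toute façon à l'écarter via les safe patterns).
# >>> ents = [{"word": "Doliprane", "entity_group": "NOM", "eds_mapped_key": "NOM",
# ...          "score": 0.42, "gliner_confirmed": False, "camembert_confirmed": None}]
# >>> _mask_with_eds_pseudo("Doliprane 1000 mg matin et soir", ents, {}, [])
# 'Doliprane 1000 mg matin et soir'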
def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager",
gliner_mgr: Any = None,
camembert_mgr: Any = None) -> Tuple[str, List[PiiHit]]:
"""Applique EDS-Pseudo sur le narratif avec validation croisée GLiNER optionnelle."""
if manager is None or not manager.is_loaded():
return text_out, []
# isoler [TABLES]
pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
keep = []
last = 0
cleaned = ""
for m in pattern.finditer(text_out):
cleaned += text_out[last:m.start()]
keep.append((len(cleaned), len(cleaned) + len(m.group(0)), m.group(0)))
cleaned += "\x00" * len(m.group(0))
last = m.end()
cleaned += text_out[last:]
# par pages → par paragraphes
pages = cleaned.split("\f")
hits: List[PiiHit] = []
rebuilt_pages: List[str] = []
for pg in pages:
paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
ents_per_para = manager.infer_paragraphs(paras)
# Chantier 4 : Validation croisée GLiNER (vote majoritaire)
if gliner_mgr is not None and hasattr(gliner_mgr, 'validate_entities') and gliner_mgr.is_loaded():
for i, (para, ents) in enumerate(zip(paras, ents_per_para)):
ents_per_para[i] = gliner_mgr.validate_entities(para, ents, threshold=0.4)
# Chantier 5 : Validation croisée CamemBERT-bio (vote NER fine-tuné)
if camembert_mgr is not None and hasattr(camembert_mgr, 'validate_eds_entities') and camembert_mgr.is_loaded():
for i, (para, ents) in enumerate(zip(paras, ents_per_para)):
ents_per_para[i] = camembert_mgr.validate_eds_entities(para, ents, threshold=0.3)
buf = []
for para, ents in zip(paras, ents_per_para):
masked = _mask_with_eds_pseudo(para, ents, cfg, hits)
buf.append(masked)
rebuilt_pages.append("\n\n".join(buf))
rebuilt = "\f".join(rebuilt_pages)
# réinsérer [TABLES]
rebuilt_list = list(rebuilt)
for start, end, payload in keep:
rebuilt_list[start:end] = list(payload)
final = "".join(rebuilt_list)
return final, hits
# ----------------- FINESS Aho-Corasick establishment matching -----------------
def _build_finess_ac():
"""Construit l'automate Aho-Corasick FINESS (appelé en lazy au premier besoin)."""
global _FINESS_AC
if not _AHO_AVAILABLE:
return
data_dir = Path(__file__).parent / "data" / "finess"
dist_path = data_dir / "etablissements_distinctifs.txt"
if not dist_path.exists():
return
# Mots génériques qui ne doivent jamais être matchés seuls
_ac_generic_blacklist = {
# Types d'établissements
"clinique", "pharmacie", "hopital", "centre", "foyer",
"residence", "maison", "cabinet", "service", "laboratoire",
"institut", "association", "fondation", "mutuelle", "polyclinique",
"dispensaire", "hospice", "annexe", "antenne", "site",
# Mots français courants qui sont aussi des noms d'établissements
"collegiale", "collegial", "cathedral", "cathedrale",
"providence", "esperance", "renaissance", "liberation",
"republique", "fraternite", "solidarite", "independance",
"beauregard", "bellevue", "belvedere",
"promenade", "esplanade", "corniche", "prefecture",
"croissant", "confluence", "bienvenue",
"chartreuse", "commanderie", "chapelle", "basilique",
"departement", "departementale", "communautaire",
}
try:
ac = _ahocorasick.Automaton()
count = 0
for line in dist_path.read_text(encoding="utf-8").splitlines():
name = line.strip()
if not name:
continue
# Exclure les mots génériques seuls
if name in _ac_generic_blacklist:
continue
words = name.split()
# Exclure les 2-mots dont le 1er est générique ET le 2e < 5 chars
if len(words) == 2 and words[0] in _ac_generic_blacklist and len(words[1]) < 5:
continue
# Filtrer : >= 8 chars et >= 2 mots, OU >= 10 chars pour 1 mot
# Les noms courts sont gérés par RE_HOPITAL_VILLE
if len(words) >= 2 and len(name) >= 8:
ac.add_word(name, name)
count += 1
elif (len(words) == 1 and len(name) >= 10
and name not in _ac_generic_blacklist
and name not in _MEDICAL_STOP_WORDS_SET
and _normalize_for_matching(name) not in _MEDICAL_STOP_WORDS_SET):
ac.add_word(name, name)
count += 1
ac.make_automaton()
_FINESS_AC = ac
log.info(f"Gazetteer FINESS Aho-Corasick: {count} patterns chargés")
except Exception as e:
log.warning(f"Erreur construction FINESS Aho-Corasick: {e}")
def _normalize_positional(text: str) -> str:
"""Normalise en préservant la longueur : lowercase + accents → base char.
Chaque caractère accentué est remplacé par sa version sans accent.
Les caractères non-alphanumériques restent tels quels (même position).
Longueur de sortie == longueur d'entrée.
"""
import unicodedata
out = []
for ch in text:
# Lowercase
ch = ch.lower()
# Décomposer et retirer les accents
decomposed = unicodedata.normalize("NFD", ch)
base = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
out.append(base if base else ch)
return "".join(out)
def _mask_finess_establishments(text: str) -> str:
"""Masque les noms d'établissements FINESS détectés par Aho-Corasick.
Scanne le texte normalisé (position-preserving: même longueur) et remplace
les occurrences trouvées dans le texte original par [ETABLISSEMENT].
Seuls les matches sur des frontières de mots sont acceptés.
"""
global _FINESS_AC
if _FINESS_AC is None:
_build_finess_ac()
if _FINESS_AC is None:
return text
normalized = _normalize_positional(text)
placeholder = PLACEHOLDERS["ETAB"]
# Collecter les matches Aho-Corasick (position fin, nom)
matches = []
for end_idx, name in _FINESS_AC.iter(normalized):
start_idx = end_idx - len(name) + 1
# Vérifier frontières de mots (pas au milieu d'un mot)
if start_idx > 0 and normalized[start_idx - 1].isalnum():
continue
if end_idx + 1 < len(normalized) and normalized[end_idx + 1].isalnum():
continue
# Vérifier que ce n'est pas déjà dans un placeholder
ctx_before = text[max(0, start_idx - 1):start_idx]
ctx_after = text[end_idx + 1:min(len(text), end_idx + 2)]
if "[" in ctx_before or "]" in ctx_after:
continue
matches.append((start_idx, end_idx + 1, name))
if not matches:
return text
# Trier par position, dédupliquer (garder le plus long en cas de chevauchement)
matches.sort(key=lambda x: (x[0], -(x[1] - x[0])))
deduped = []
last_end = 0
for start, end, name in matches:
if start >= last_end:
deduped.append((start, end, name))
last_end = end
# Reconstruire le texte avec les remplacements (positions 1:1 avec l'original)
result = []
last_pos = 0
for start, end, name in deduped:
if start > len(text) or end > len(text):
continue
result.append(text[last_pos:start])
result.append(placeholder)
last_pos = end
result.append(text[last_pos:])
return "".join(result)
# ----------------- Selective safety rescan -----------------
def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
"""Rescan de sécurité : re-détecte les PII critiques qui auraient échappé au premier passage."""
# enlève TABLES du scope
def strip_tables(s: str):
kept = []
out = []
i = 0
pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
for m in pattern.finditer(s):
out.append(s[i:m.start()])
# conserver le match complet (marqueurs inclus) pour réinsérer sans laisser de résidu \x00
start = len("".join(out))
kept.append((start, start + len(m.group(0)), m.group(0)))
out.append("\x00" * (m.end() - m.start()))
i = m.end()
out.append(s[i:])
return "".join(out), kept
protected, kept = strip_tables(text)
# PII critiques (comme avant)
protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
protected = RE_TEL_SLASH.sub(PLACEHOLDERS["TEL"], protected)
protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
# NIR avec validation
def _rescan_nir(m: re.Match) -> str:
return PLACEHOLDERS["NIR"] if validate_nir(m.group(0)) else m.group(0)
protected = RE_NIR.sub(_rescan_nir, protected)
# Nouvelles regex : dates de naissance, dates, adresses, codes postaux
protected = RE_DATE_NAISSANCE.sub(PLACEHOLDERS["DATE_NAISSANCE"], protected)
# protected = RE_DATE.sub(PLACEHOLDERS["DATE"], protected) # désactivé
protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected)
protected = RE_BP.sub(PLACEHOLDERS["ADRESSE"], protected)
protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected)
# N° Episode
protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected)
# N° RPPS
protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected)
# FINESS par gazetteer (nombres 9 chiffres matchant un vrai numéro FINESS)
if _FINESS_NUMBERS:
def _rescan_finess(m: re.Match) -> str:
return PLACEHOLDERS["FINESS"] if m.group(1) in _FINESS_NUMBERS else m.group(0)
protected = RE_BARE_9DIGITS.sub(_rescan_finess, protected)
# Établissements (regex)
protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
# Établissements (gazetteer Aho-Corasick FINESS — 116K noms distinctifs)
if _FINESS_AC is not None:
protected = _mask_finess_establishments(protected)
# Services hospitaliers
protected = RE_SERVICE.sub(PLACEHOLDERS["MASK"], protected)
# Lieu de naissance / Ville de résidence (accepte tout : villes, codes INSEE, minuscules)
_re_lieu_rescan = re.compile(r"(Lieu\s+de\s+naissance\s*:\s*)(\S.+)")
protected = _re_lieu_rescan.sub(lambda m: m.group(1) + PLACEHOLDERS["VILLE"], protected)
_re_ville_rescan = re.compile(r"(Ville\s+de\s+r[ée]sidence\s*:\s*)(\S.+)")
protected = _re_ville_rescan.sub(lambda m: m.group(1) + PLACEHOLDERS["VILLE"], protected)
# Personnes contextuelles (avec whitelist)
wl_sections = set()
wl_phrases = set()
if cfg:
wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])
def _rescan_person(m: re.Match) -> str:
span = m.group(1).strip(); raw = m.group(0)
if span in wl_sections or raw in wl_phrases:
return raw
tokens = [t for t in span.split() if t]
if len(tokens) == 1 and len(tokens[0]) <= 3:
return raw
# Filtrer les termes médicaux (stop words)
clean = [t for t in tokens if t.lower() not in _MEDICAL_STOP_WORDS_SET]
if not clean:
return raw
return raw.replace(span, PLACEHOLDERS["NOM"])
protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected)
res = list(protected)
for start, end, payload in kept:
res[start:end] = list(payload)
return "".join(res)
# ----------------- PDF Redaction -----------------
def _search_ocr_words_fuzzy_digits(ocr_words: List[Tuple[str, float, float, float, float]],
token: str, page_rect, min_ratio: float = 0.7) -> list:
"""Matching flou pour identifiants numériques manuscrits.
Compare les séquences de chiffres entre le token VLM et les mots OCR.
Accepte une correspondance si ≥ min_ratio des chiffres matchent."""
token_digits = re.sub(r"[^0-9]", "", token)
if len(token_digits) < 4:
return []
rects = []
for (word, x0n, y0n, x1n, y1n) in ocr_words:
word_digits = re.sub(r"[^0-9]", "", word)
if len(word_digits) < 3:
continue
# Match exact des chiffres (après nettoyage)
if word_digits == token_digits:
rects.append(fitz.Rect(
x0n * page_rect.width, y0n * page_rect.height,
x1n * page_rect.width, y1n * page_rect.height,
))
continue
# Match partiel : le token est contenu dans le mot OCR ou vice-versa
if token_digits in word_digits or word_digits in token_digits:
if min(len(token_digits), len(word_digits)) / max(len(token_digits), len(word_digits)) >= min_ratio:
rects.append(fitz.Rect(
x0n * page_rect.width, y0n * page_rect.height,
x1n * page_rect.width, y1n * page_rect.height,
))
continue
# Match par distance : comparer caractère par caractère (Hamming-like)
if abs(len(word_digits) - len(token_digits)) <= 2:
shorter, longer = (word_digits, token_digits) if len(word_digits) <= len(token_digits) else (token_digits, word_digits)
matches = sum(1 for a, b in zip(shorter, longer) if a == b)
if matches / len(longer) >= min_ratio:
rects.append(fitz.Rect(
x0n * page_rect.width, y0n * page_rect.height,
x1n * page_rect.width, y1n * page_rect.height,
))
return rects
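# Exemple indicatif (coordonnées OCR normalisées fictives) : un identifiant manuscrit lu
# "0612845678" par l'OCR matche le token "06 12 34 56 78" (9 chiffres identiques sur 10 >= 0.7).
# >>> words = [("0612845678", 0.10, 0.10, 0.40, 0.12)]
# >>> _search_ocr_words_fuzzy_digits(words, "06 12 34 56 78", page.rect)   # attendu : 1 rectangle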
def _search_ocr_words(ocr_words: List[Tuple[str, float, float, float, float]], token: str, page_rect) -> list:
"""Cherche un token dans les mots OCR d'une page.
Pour les tokens multi-mots, cherche chaque mot individuellement.
Retourne des fitz.Rect en coordonnées PDF points."""
rects = []
tokens_to_search = token.split() if " " in token else [token]
for t in tokens_to_search:
t_lower = t.lower().strip()
if not t_lower:
continue
for (word, x0n, y0n, x1n, y1n) in ocr_words:
if word.lower().strip(".,;:!?()") == t_lower:
rects.append(fitz.Rect(
x0n * page_rect.width,
y0n * page_rect.height,
x1n * page_rect.width,
y1n * page_rect.height,
))
return rects
def _search_whole_word(page, token: str) -> list:
"""Cherche un token comme mot entier (pas substring) via get_text('words').
Évite les faux positifs de page.search_for() qui fait du substring matching."""
rects = []
token_lower = token.lower().strip()
for w in page.get_text("words"):
# w = (x0, y0, x1, y1, word, block_no, line_no, word_no)
word_text = w[4].strip(".,;:!?()[]{}\"'«»-–—/\\")
if word_text.lower() == token_lower:
rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))
return rects
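# Exemple indicatif : contrairement à page.search_for("LUC") qui matcherait aussi la
# sous-chaîne dans "GLUCOSE", seules les occurrences du mot entier sont retenues ici.
# >>> _search_whole_word(page, "LUC")   # rectangles du mot isolé uniquement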
def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, ocr_word_map: Optional[OcrWordMap] = None) -> None:
if fitz is None:
raise RuntimeError("PyMuPDF non disponible installez pymupdf.")
doc = fitz.open(str(original_pdf))
# index hits par page; page==-1 → rechercher sur toutes pages
by_page: Dict[int, List[PiiHit]] = {}
for h in audit:
by_page.setdefault(h.page, []).append(h)
# Kinds à ne pas chercher dans le PDF (dates masquées uniquement dans le texte,
# pas dans le PDF où elles rendent les tableaux illisibles)
_VECTOR_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"}
# Kinds dont les tokens courts (< 5) risquent le substring matching via page.search_for()
_VECTOR_SHORT_TOKEN_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
"EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL"}
for pno in range(len(doc)):
page = doc[pno]
hits = by_page.get(pno, []) + by_page.get(-1, [])
if not hits:
continue
# Dédupliquer les tokens : (token, kind) → rechercher une seule fois par page
seen_tokens: set = set()
all_rects = []
for h in hits:
token = h.original.strip()
if not token:
continue
if h.kind in _VECTOR_SKIP_KINDS:
continue
# Clé de déduplication : le token lui-même (même token cherché une seule fois)
dedup_key = token
if dedup_key in seen_tokens:
continue
seen_tokens.add(dedup_key)
if h.kind in _VECTOR_SHORT_TOKEN_KINDS and len(token) < 5:
if token.lower() not in _MEDICAL_STOP_WORDS_SET:
rects = _search_whole_word(page, token)
if not rects and ocr_word_map and pno in ocr_word_map:
rects = _search_ocr_words(ocr_word_map[pno], token, page.rect)
all_rects.extend(rects)
continue
rects = page.search_for(token)
if not rects and h.kind in {"NIR", "IBAN", "TEL"}:
compact = re.sub(r"\s+", "", token)
if compact != token:
rects = page.search_for(compact)
if not rects and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}:
for word in token.split():
word = word.strip(" .-'")
if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET:
continue
if not word[0].isupper():
continue
rects.extend(page.search_for(word))
if not rects and ocr_word_map and pno in ocr_word_map:
rects = _search_ocr_words(ocr_word_map[pno], token, page.rect)
all_rects.extend(rects)
# Appliquer toutes les annotations d'un coup (évite de ralentir search_for)
for r in all_rects:
page.add_redact_annot(r, fill=(0, 0, 0))
try:
page.apply_redactions()
except Exception:
pass
doc.save(str(out_pdf), deflate=True, garbage=4, clean=True, incremental=False)
doc.close()
def _rasterize_page(args):
"""Worker parallèle : rasterise une page + dessine les rectangles noirs."""
pdf_path_str, pno, rects_tuples, dpi, ogc_label = args
doc = fitz.open(pdf_path_str)
src = doc[pno]
rect_w, rect_h = src.rect.width, src.rect.height
zoom = dpi / 72.0
pix = src.get_pixmap(matrix=fitz.Matrix(zoom, zoom), annots=False)
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
draw = ImageDraw.Draw(img)
shrink = 1.5
for (x0, y0, x1, y1) in rects_tuples:
rx0 = x0 * zoom + shrink
ry0 = y0 * zoom
rx1 = x1 * zoom - shrink
ry1 = y1 * zoom
if rx1 > rx0:
draw.rectangle([rx0, ry0, rx1, ry1], fill=(0, 0, 0))
if ogc_label:
from PIL import ImageFont
font_size = int(14 * zoom)
try:
font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
except Exception:
font = ImageFont.load_default()
text = ogc_label if ogc_label.upper().startswith("OGC") else f"OGC: {ogc_label}"
bbox = draw.textbbox((0, 0), text, font=font)
tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
margin = int(10 * zoom)
x = img.width - tw - margin
y = margin
draw.rectangle([x - 4, y - 2, x + tw + 4, y + th + 2], fill=(255, 255, 255))
draw.text((x, y), text, fill=(0, 0, 0), font=font)
buf = io.BytesIO()
img.save(buf, format="PNG")
doc.close()
return pno, buf.getvalue(), rect_w, rect_h
def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 300, ogc_label: Optional[str] = None, ocr_word_map: Optional[OcrWordMap] = None) -> None:
if fitz is None:
raise RuntimeError("PyMuPDF non disponible installez pymupdf.")
doc = fitz.open(str(original_pdf))
all_rects: Dict[int, List["fitz.Rect"]] = {}
_RASTER_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"}
_RASTER_SHORT_TOKEN_KINDS = {"NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
"EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL"}
_VLM_NUMERIC_KINDS = {"VLM_NUM_PATIENT", "VLM_NUM_LOT", "VLM_NUM_ORD", "VLM_NDA",
"VLM_NIR", "VLM_IPP", "VLM_RPPS"}
by_page: Dict[int, List[PiiHit]] = {}
for h in audit:
by_page.setdefault(h.page, []).append(h)
for pno in range(len(doc)):
page = doc[pno]
rects = []
seen_tokens: set = set()
hits = by_page.get(pno, []) + by_page.get(-1, [])
# Masquage total si FULL_PAGE_MASK détecté (page manuscrite non déchiffrable)
if any(h.kind == "FULL_PAGE_MASK" and h.page == pno for h in hits):
margin = 5 # points — liseré fin autour du masque
rects.append(fitz.Rect(margin, margin, page.rect.width - margin, page.rect.height - margin))
all_rects[pno] = rects
continue
for h in hits:
token = h.original.strip()
if not token or h.kind in _RASTER_SKIP_KINDS:
continue
if token in seen_tokens:
continue
seen_tokens.add(token)
if h.kind in _RASTER_SHORT_TOKEN_KINDS and len(token) < 5:
if token.lower() not in _MEDICAL_STOP_WORDS_SET:
found_short = _search_whole_word(page, token)
if not found_short and ocr_word_map and pno in ocr_word_map:
found_short = _search_ocr_words(ocr_word_map[pno], token, page.rect)
rects.extend(found_short)
continue
found = page.search_for(token)
if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}:
compact = re.sub(r"\s+", "", token)
found = page.search_for(compact)
if not found and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM",
"VLM_NOM", "VLM_ETAB", "VLM_SERVICE"}:
for word in token.split():
word = word.strip(" .-'")
if len(word) < 3 or word.lower() in _MEDICAL_STOP_WORDS_SET:
continue
found.extend(page.search_for(word))
# Fallback OCR pour chaque mot
if not found and ocr_word_map and pno in ocr_word_map:
found.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
if not found and ocr_word_map and pno in ocr_word_map:
found = _search_ocr_words(ocr_word_map[pno], token, page.rect)
# Matching flou pour identifiants numériques VLM (manuscrit)
if not found and h.kind in _VLM_NUMERIC_KINDS and ocr_word_map and pno in ocr_word_map:
found = _search_ocr_words_fuzzy_digits(ocr_word_map[pno], token, page.rect)
rects.extend(found)
all_rects[pno] = rects
# Phase 2 : rasterisation parallèle (ProcessPoolExecutor)
n_pages = len(doc)
rects_as_tuples = {
pno: [(r.x0, r.y0, r.x1, r.y1) for r in rects]
for pno, rects in all_rects.items()
}
doc.close() # fermer AVANT le fork
n_workers = min(n_pages, os.cpu_count() or 4)
tasks = [
(str(original_pdf), pno, rects_as_tuples.get(pno, []), dpi, ogc_label)
for pno in range(n_pages)
]
with ProcessPoolExecutor(max_workers=n_workers) as pool:
results = sorted(pool.map(_rasterize_page, tasks), key=lambda x: x[0])
# Assemblage final (séquentiel, rapide)
out = fitz.open()
for pno, png_bytes, w, h in results:
dst = out.new_page(width=w, height=h)
dst.insert_image(fitz.Rect(0, 0, w, h), stream=png_bytes)
out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
out.close()
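# Esquisse d'utilisation (chemins et label hypothétiques) : rasterisation 300 dpi avec
# caviardage noir ; les hits page=-1 sont recherchés sur toutes les pages.
# >>> redact_pdf_raster(Path("entree.pdf"), anon.audit, Path("sortie_caviardee.pdf"),
# ...                   dpi=300, ogc_label="OGC 21", ocr_word_map=ocr_word_map)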
# ----------------- VLM pour PDFs scannés -----------------
def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: OcrWordMap, vlm_manager) -> None:
"""Utilise un VLM (Ollama) pour détecter visuellement les PII sur chaque page d'un PDF scanné.
Les entités détectées sont ajoutées à anon.audit et au texte pseudonymisé.
Auto-rotation : si une page a peu de mots OCR, essaie 4 orientations."""
from vlm_manager import VLM_CATEGORY_MAP
doc = fitz.open(str(pdf_path))
# Collecter les PII déjà détectés pour contexte VLM
existing_pii = list({h.original.strip() for h in anon.audit if h.original.strip()})
# Catégories contenant des identifiants numériques (matching flou)
_NUMERIC_CATS = {"NUMERO_PATIENT", "NUMERO_LOT", "NUMERO_ORDONNANCE", "NUMERO_SEJOUR",
"NDA", "NIR", "IPP", "RPPS"}
# Catégories à splitter en mots (noms, services, établissements)
_SPLIT_CATS = {"NOM", "PRENOM", "ETABLISSEMENT", "SERVICE"}
for pno in range(len(doc)):
pix = doc[pno].get_pixmap(dpi=150)
img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
# Détection page manuscrite : peu de mots OCR = scan manuscrit / mal orienté
ocr_count = len(ocr_word_map.get(pno, []))
is_handwritten_page = ocr_count < 100
# Pages manuscrites : masquage total direct (VLM trop lent/hallucinatoire)
if is_handwritten_page and ocr_count > 0:
anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
placeholder=PLACEHOLDERS["MASK"]))
log.info("VLM page %d : masquage total direct (OCR=%d mots)", pno, ocr_count)
continue
# Pages lisibles : analyse VLM
best_entities = []
try:
best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
existing_pii=existing_pii[:20])
except Exception:
best_entities = []
for ent in best_entities:
cat = ent.get("categorie", "").upper()
texte = ent.get("texte", "").strip()
conf = ent.get("confiance", 0.0)
if not texte or conf < 0.3:
continue
if cat not in VLM_CATEGORY_MAP:
continue
kind, placeholder_key = VLM_CATEGORY_MAP[cat]
placeholder = PLACEHOLDERS.get(placeholder_key, PLACEHOLDERS["MASK"])
if cat in _SPLIT_CATS:
# Splitter en mots pour meilleur matching OCR
for word in texte.split():
word = word.strip(" .-'(),")
if len(word) < 2 or word.lower() in _MEDICAL_STOP_WORDS_SET:
continue
anon.audit.append(PiiHit(page=pno, kind=kind, original=word, placeholder=placeholder))
else:
anon.audit.append(PiiHit(page=pno, kind=kind, original=texte, placeholder=placeholder))
# Pour les identifiants numériques, ajouter aussi le token nettoyé (chiffres seuls)
if cat in _NUMERIC_CATS:
digits_only = re.sub(r"[^0-9]", "", texte)
if digits_only and digits_only != texte:
anon.audit.append(PiiHit(page=pno, kind=kind, original=digits_only, placeholder=placeholder))
# Remplacer dans le texte pseudonymisé si trouvé
try:
anon.text_out = re.sub(rf"\b{re.escape(texte)}\b", placeholder, anon.text_out)
except re.error:
anon.text_out = anon.text_out.replace(texte, placeholder)
doc.close()
# ----------------- Orchestration -----------------
def process_pdf(
pdf_path: Path,
out_dir: Path,
make_vector_redaction: bool = True,
also_make_raster_burn: bool = False,
config_path: Optional[Path] = None,
use_hf: bool = False,
ner_manager=None,
ner_thresholds=None,
ogc_label: Optional[str] = None,
vlm_manager=None,
gliner_manager=None,
camembert_manager=None,
) -> Dict[str, str]:
out_dir.mkdir(parents=True, exist_ok=True)
cfg = load_dictionaries(config_path)
pages_text, tables_lines, ocr_used, ocr_word_map = extract_text_with_fallback_ocr(pdf_path)
# 1) Regex rules
anon = anonymise_document_regex(pages_text, tables_lines, cfg)
# 1b) VLM (optionnel) — sur les PDFs scannés uniquement
if ocr_used and vlm_manager is not None and VlmManager is not None:
try:
if vlm_manager.is_loaded():
_apply_vlm_on_scanned_pdf(pdf_path, anon, ocr_word_map, vlm_manager)
except Exception:
pass # dégradation gracieuse
# 2) NER (optionnel) — sur le narratif
final_text = anon.text_out
hf_hits: List[PiiHit] = []
if use_hf and ner_manager is not None and ner_manager.is_loaded():
# Détecter le type de manager et appeler la bonne fonction
if EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager):
final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager, gliner_mgr=gliner_manager, camembert_mgr=camembert_manager)
else:
final_text, hf_hits = apply_hf_ner_on_narrative(final_text, cfg, ner_manager, ner_thresholds)
anon.audit.extend(hf_hits)
# 3) Rescan selectif
final_text = selective_rescan(final_text, cfg=cfg)
# 3b) Nettoyage post-masquage : codes postaux orphelins (5 chiffres collés à un placeholder)
# et téléphones fragmentés sur plusieurs lignes
_re_cp_orphan = re.compile(r"(\[(?:ADRESSE|NOM|VILLE)\])\s*(\d{5})\b")
def _clean_cp_orphan(m):
anon.audit.append(PiiHit(-1, "CODE_POSTAL", m.group(2), PLACEHOLDERS["CODE_POSTAL"]))
return m.group(1) + PLACEHOLDERS["CODE_POSTAL"]
final_text = _re_cp_orphan.sub(_clean_cp_orphan, final_text)
# Téléphones fragmentés : "0X XX XX XX\nXX" coupé en fin de ligne (ligne suivante immédiate)
_re_tel_frag = re.compile(r"((?:\+33\s?|0)\d(?:[ .-]?\d){6,7})\s*\n\s*(\d{2}(?!\d))")
def _clean_tel_frag(m):
full = m.group(1).replace(" ", "").replace(".", "").replace("-", "") + m.group(2)
if len(full.replace("+33", "0")) == 10:
anon.audit.append(PiiHit(-1, "TEL", m.group(0).strip(), PLACEHOLDERS["TEL"]))
return PLACEHOLDERS["TEL"] + "\n"
return m.group(0)
final_text = _re_tel_frag.sub(_clean_tel_frag, final_text)
# Téléphones incomplets en fin de ligne (8 ou 9 chiffres au format 0X XX XX XX) : masquer la partie visible
_re_tel_partial = re.compile(r"(?<!\d)((?:\+33\s?|0)\d(?:[ .-]?\d){5,7})(?!\d)\s*$", re.MULTILINE)
def _clean_tel_partial(m):
digits = re.sub(r"[ .\-]", "", m.group(1))
if 8 <= len(digits) <= 9:
anon.audit.append(PiiHit(-1, "TEL", m.group(0).strip(), PLACEHOLDERS["TEL"]))
return PLACEHOLDERS["TEL"]
return m.group(0)
final_text = _re_tel_partial.sub(_clean_tel_partial, final_text)
# 4) Consolidation : propager les PII détectés sur toutes les pages (page=-1)
# pour que la redaction PDF les cherche partout (sidebar répété, etc.)
# 4a) Noms : extraire les tokens individuels
_nom_kinds = {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}
_global_name_tokens: set = set()
for h in anon.audit:
if h.kind not in _nom_kinds:
continue
for word in h.original.split():
word = word.strip(" .-'")
if len(word) < 3:
continue
if word.lower() in _MEDICAL_STOP_WORDS_SET:
continue
if not word[0].isupper():
continue
_global_name_tokens.add(word)
# 4a-bis) Noms compagnons : si un token connu est suivi/précédé d'un mot majuscule inconnu
# dans le texte brut, c'est aussi un nom (ex: "Diego OLIVER" → OLIVER est un nom)
_COMPANION_BLACKLIST = {
"ZONE", "PARTI", "PLAN", "MAIN", "FORT", "FORTE", "BILAN",
"MISE", "NOTE", "AIDE", "BASE", "FACE", "DOSE", "TIGE",
"VOIE", "ONDE", "SOIN", "DEMI", "MODE", "CURE", "PAGE",
# Spécialités/services
"CANCEROLOGIE", "ONCOLOGIE", "REANIMATION", "RADIOLOGIE",
"CARDIOLOGIE", "NEUROLOGIE", "PNEUMOLOGIE", "UROLOGIE",
"GERIATRIE", "PEDIATRIE", "NEPHROLOGIE", "HEMATOLOGIE",
"OPHTALMOLOGIE", "STOMATOLOGIE", "ALLERGOLOGIE",
"RHUMATOLOGIE", "DERMATOLOGIE", "IMMUNOLOGIE",
# Termes médicaux/courants FP OGC 21
"ALIMENTATION", "AUGMENTATION", "AMELIORATION",
"BILIAIRES", "BILIAIRE", "VOIES", "BILI",
"MEDECINE", "ENTERO", "DOSSIER", "AVIATION",
"SULFAMIDES", "CLAVULANIQUE", "MECILLINAM",
"TAZOBACTAM", "TEMOCILLINE", "ECOFLAC", "FURANES",
"CONTENTION", "ISOLEMENT", "ELIMINATION",
"PANCREATITE", "INFECTIEUX", "HEMODYNAMIQUE",
"SENSIBLE", "VARIABLE", "DOSAGE", "CAT",
}
raw_full = "\n\n".join(pages_text)
_companion_tokens: set = set()
for token in _global_name_tokens:
# Token connu suivi d'un mot ALL-CAPS
for m in re.finditer(rf"\b{re.escape(token)}\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\b", raw_full):
candidate = m.group(1)
if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
and candidate not in _global_name_tokens
and candidate not in _COMPANION_BLACKLIST):
_companion_tokens.add(candidate)
# Mot ALL-CAPS suivi du token connu
for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\s+{re.escape(token)}\b", raw_full):
candidate = m.group(1)
if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
and candidate not in _global_name_tokens
and candidate not in _COMPANION_BLACKLIST):
_companion_tokens.add(candidate)
_global_name_tokens.update(_companion_tokens)
# Retirer les sous-parties COURTES de noms composés (JEAN si JEAN-PIERRE existe)
# Garder les parties longues (>=5 chars) car le texte peut les scinder sur des lignes séparées
_compound = {t for t in _global_name_tokens if "-" in t}
_parts_to_drop = set()
for comp in _compound:
for part in comp.split("-"):
part = part.strip()
if len(part) >= 2 and len(part) < 5 and part in _global_name_tokens:
_parts_to_drop.add(part)
_global_name_tokens -= _parts_to_drop
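    # Example: if both "JEAN-PIERRE" and the standalone "JEAN" were collected, "JEAN" (4 chars)
    # is dropped; a longer part such as "PIERRE" would be kept even when it appears on its own.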
    # 4a-ter) Final filtering of the global tokens: reject words that do not look like proper names
    # - Common French words (lowercase initials are already filtered upstream)
    # - ALL-CAPS tokens <= 4 chars confirmed by a single source only
_nom_kind_counts: Dict[str, set] = {}
for h in anon.audit:
if h.kind in _nom_kinds:
for word in h.original.split():
word = word.strip(" .-'")
if word:
_nom_kind_counts.setdefault(word, set()).add(h.kind)
_filtered_global: set = set()
for token in _global_name_tokens:
        # Short ALL-CAPS token (<=4) with a single source → probably an abbreviation
if token.isupper() and len(token) <= 4 and len(_nom_kind_counts.get(token, set())) < 2:
continue
_filtered_global.add(token)
_global_name_tokens = _filtered_global
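    # Example: a short all-caps token such as "CHU", reported by a single source, is dropped as a
    # likely abbreviation, while a longer token such as "DURAND" passes through.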
    # DISABLED: NOM_GLOBAL produced 670 FPs for 0 TPs (100% false positives)
    # Global propagation of names is too aggressive
# for token in _global_name_tokens:
# anon.audit.append(PiiHit(page=-1, kind="NOM_GLOBAL", original=token, placeholder=PLACEHOLDERS["NOM"]))
    # 4b) SELECTIVE global propagation: only for critical PII
    # Critical PII (DATE_NAISSANCE, NIR, IPP, EMAIL) are propagated to every page
    # to avoid leaks on multi-page documents (e.g. CRO)
_CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS", "DOSSIER"}
_global_pii: Dict[str, set] = {}
for h in anon.audit:
        # Collect ALL types for analysis, but only propagate the critical ones
if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
"VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP", "DOSSIER",
"force_term", "force_regex", "FINESS"}:
            # Special handling for DATE_NAISSANCE: extract the bare date and generate every variation
if h.kind == "DATE_NAISSANCE":
                # Extract the bare date (DD/MM/YYYY or DD/MM/YY)
date_match = re.search(r'(\d{1,2})[/.\-\s]+(\d{1,2})[/.\-\s]+(\d{2,4})', h.original)
if date_match:
day, month, year = date_match.groups()
                    # Normalize the components (zero-pad if needed)
day = day.zfill(2)
month = month.zfill(2)
                    # Generate every separator variation
date_variations = [
f"{day}/{month}/{year}",
f"{day}.{month}.{year}",
f"{day}-{month}-{year}",
f"{day} {month} {year}",
]
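                    # Example: "3/7/1985" is normalized to day="03", month="07" and expands to
                    # "03/07/1985", "03.07.1985", "03-07-1985" and "03 07 1985".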
for var in date_variations:
_global_pii.setdefault(h.kind, set()).add(var)
else:
                    # Fallback: add the value as-is when nothing matches
_global_pii.setdefault(h.kind, set()).add(h.original.strip())
else:
_global_pii.setdefault(h.kind, set()).add(h.original.strip())
    # Propagate ONLY the critical PII (avoids the 951 FPs caused by the other types)
for kind, values in _global_pii.items():
if kind not in _CRITICAL_PII_TYPES:
continue # Skip non-critical PII (TEL, ADRESSE, etc.)
placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"])
for val in values:
            if not val or len(val) < 3:  # Skip values that are too short
continue
anon.audit.append(PiiHit(page=-1, kind=f"{kind}_GLOBAL", original=val, placeholder=placeholder))
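    # Net effect: a critical value (e.g. a birth date) detected once is also redacted wherever it
    # reappears on other pages of the document.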
log.info("Propagation globale sélective : %d types critiques propagés",
sum(1 for k in _global_pii.keys() if k in _CRITICAL_PII_TYPES))
    # 4e) Apply the global tokens to the pseudonymized text
_GLOBAL_SKIP_KINDS = {"EDS_DATE_GLOBAL"}
for h in anon.audit:
if h.page != -1:
continue
        if not h.kind.endswith("_GLOBAL"):
continue
if h.kind in _GLOBAL_SKIP_KINDS:
continue
token = h.original.strip()
if not token or len(token) < 3:
continue
        # Trackare guard: a very short NOM_GLOBAL (<=3) risks masking diagnosis codes
if anon.is_trackare and h.kind == "NOM_GLOBAL" and len(token) <= 3:
continue
try:
            # Special handling for DATE_NAISSANCE_GLOBAL: handle format variations and context
if h.kind == "DATE_NAISSANCE_GLOBAL":
                # Extract the date components (DD/MM/YYYY or variations)
date_match = re.search(r'(\d{1,2})[/.\-\s]+(\d{1,2})[/.\-\s]+(\d{2,4})', token)
if date_match:
day, month, year = date_match.groups()
                    # Flexible pattern that accepts every separator
                    # [\s/.\-]+ accepts: space, slash, dot, dash (one or more)
date_pattern = rf'{day}[\s/.\-]+{month}[\s/.\-]+{year}'
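                    # Example: for "03/07/1985" the pattern is r"03[\s/.\-]+07[\s/.\-]+1985",
                    # which also matches "03.07.1985", "03-07-1985" or "03 07 1985".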
                    # Multi-pass replacement to cover every case
                    # Pass 1: with the "Né(e) le" context (case-insensitive)
final_text = re.sub(
rf'Né(?:e)?\s+le\s+{date_pattern}',
h.placeholder,
final_text,
flags=re.IGNORECASE
)
                    # Pass 2: without context (bare date)
final_text = re.sub(
rf'\b{date_pattern}\b',
h.placeholder,
final_text,
flags=re.IGNORECASE
)
continue
            # Special handling for force_term: case-insensitive replacement with word boundaries
if h.kind == "force_term_GLOBAL":
                # Escape special characters while keeping the replacement flexible
pat = re.escape(token)
final_text = re.sub(rf'\b{pat}\b', h.placeholder, final_text, flags=re.IGNORECASE)
continue
            # Standard handling for the other types
pat = re.escape(token)
            # Compound names: tolerate line breaks/spaces around the hyphen
if "-" in token:
pat = pat.replace(r"\-", r"\-\s*")
            # Dates: tolerate separator variations. re.escape() (Python 3.7+) escapes "." but not
            # "/", so replace the raw slash; replace "/" first so the inserted character class
            # (which itself contains "/") is not rewritten by the "\." pass.
            if "/" in token or "." in token:
                pat = pat.replace("/", r"[\s/.\-]").replace(r"\.", r"[\s/.\-]")
final_text = re.sub(rf"\b{pat}\b", h.placeholder, final_text, flags=re.IGNORECASE)
except re.error:
final_text = final_text.replace(token, h.placeholder)
    # Record OCR usage in the audit trail
if ocr_used:
anon.audit.insert(0, PiiHit(page=-1, kind="OCR_USED", original="docTR", placeholder=""))
    # Filter hospital-related false positives
if _HOSPITAL_FILTER_AVAILABLE:
try:
hospital_filter = HospitalFilter()
original_count = len(anon.audit)
            # Convert the PiiHit entries to dicts for the filter
detections = [
{
'kind': hit.kind,
'original': hit.original,
'page': hit.page
}
for hit in anon.audit
]
            # Filter (pass the is_trackare flag)
filtered_detections = hospital_filter.filter_detections(
detections,
pdf_path.name,
is_trackare=anon.is_trackare
)
            # Rebuild the anon.audit list
filtered_audit = []
for det in filtered_detections:
                # Find the matching original PiiHit
for hit in anon.audit:
if (hit.kind == det['kind'] and
hit.original == det['original'] and
hit.page == det['page']):
filtered_audit.append(hit)
break
anon.audit = filtered_audit
filtered_count = original_count - len(anon.audit)
if filtered_count > 0:
log.info("Filtre hospitalier : %d faux positifs éliminés", filtered_count)
except Exception as e:
log.warning("Erreur lors du filtrage hospitalier : %s", e)
    # Safety net: remove any residual [TABLES] block (should no longer happen)
final_text = re.sub(r"\n*\[TABLES\].*?\[/TABLES\]\n*", "\n", final_text, flags=re.DOTALL)
    # Double-bracket cleanup: [[PLACEHOLDER]] → [PLACEHOLDER] (artifact when the original PDF
    # already had brackets around the masked value)
_RE_BRACKET_CLEAN = re.compile(
r"\[+(\[(?:NOM|TEL|EMAIL|VILLE|ADRESSE|CODE_POSTAL|FINESS|ETABLISSEMENT|MASK|IPP|"
r"DOSSIER|NDA|EPISODE|RPPS|DATE_NAISSANCE|AGE|NIR|IBAN|OGC)\])\]+"
)
final_text = _RE_BRACKET_CLEAN.sub(r"\1", final_text)
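    # Example: "[[NOM]]" or "[[[DATE_NAISSANCE]]]" collapses back to "[NOM]" / "[DATE_NAISSANCE]".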
    # Output files
base = pdf_path.stem
txt_path = out_dir / f"{base}.pseudonymise.txt"
audit_path = out_dir / f"{base}.audit.jsonl"
txt_path.write_text(final_text, encoding="utf-8")
    # Drop the global-propagation entries (page=-1) before writing the audit file:
    # they are used for replacements in the text but are not real detections
audit_for_file = [hit for hit in anon.audit if hit.page != -1]
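    # Each audit line is one PiiHit serialized as JSON, e.g. (illustrative; the exact keys follow
    # PiiHit.__dict__): {"page": 2, "kind": "TEL", "original": "06 12 34 56 78", "placeholder": "[TEL]"}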
with audit_path.open("w", encoding="utf-8") as f:
for hit in audit_for_file:
f.write(json.dumps(hit.__dict__, ensure_ascii=False) + "\n")
outputs = {"text": str(txt_path), "audit": str(audit_path)}
# PDFs
if make_vector_redaction and fitz is not None:
vec_path = out_dir / f"{base}.redacted_vector.pdf"
try:
redact_pdf_vector(pdf_path, anon.audit, vec_path, ocr_word_map=ocr_word_map)
outputs["pdf_vector"] = str(vec_path)
except Exception:
pass
if also_make_raster_burn and fitz is not None:
ras_path = out_dir / f"{base}.redacted_raster.pdf"
redact_pdf_raster(pdf_path, anon.audit, ras_path, ogc_label=ogc_label, ocr_word_map=ocr_word_map)
outputs["pdf_raster"] = str(ras_path)
return outputs
def process_pdfs_batch(
pdf_paths: List[Path],
out_dir: Path,
    max_workers: Optional[int] = None,
**kwargs,
) -> List[Dict[str, str]]:
"""Traite plusieurs PDFs en parallèle (ProcessPoolExecutor).
Ne fonctionne que quand ner_manager=None (les modèles NER ne sont pas
picklables). Quand NER est actif, les PDFs restent séquentiels mais
bénéficient de la parallélisation page-level de redact_pdf_raster().
"""
if not pdf_paths:
return []
if max_workers is None:
max_workers = min(len(pdf_paths), os.cpu_count() or 4)
out_dir.mkdir(parents=True, exist_ok=True)
    # A locally defined closure cannot be pickled for ProcessPoolExecutor workers,
    # so submit the module-level process_pdf directly with its arguments.
    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        futures = [pool.submit(process_pdf, p, out_dir, **kwargs) for p in pdf_paths]
        return [f.result() for f in futures]
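# Minimal usage sketch (illustrative, assuming a local "in/" folder of PDFs and the regex-only
# pipeline, i.e. no ner_manager passed through kwargs):
#   pdfs = sorted(Path("in").glob("*.pdf"))
#   results = process_pdfs_batch(pdfs, Path("out"), make_vector_redaction=True)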
if __name__ == "__main__":
import argparse
ap = argparse.ArgumentParser(description="Anonymiser PDF (regex + NER ONNX optionnel)")
ap.add_argument("pdf", type=str)
ap.add_argument("--out", type=str, default="out")
ap.add_argument("--no-vector", action="store_true")
ap.add_argument("--raster", action="store_true")
ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
args = ap.parse_args()
manager = None
if args.hf and NerModelManager is not None:
manager = NerModelManager(cache_dir=Path("models"))
manager.load(args.model)
outs = process_pdf(
Path(args.pdf),
Path(args.out),
make_vector_redaction=not args.no_vector,
also_make_raster_burn=args.raster,
config_path=Path(args.config),
use_hf=bool(args.hf),
ner_manager=manager,
ner_thresholds=NerThresholds() if NerThresholds else None,
)
print(json.dumps(outs, indent=2, ensure_ascii=False))
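# Example CLI invocations (illustrative file name):
#   python anonymizer_core_refactored_onnx.py rapport.pdf --out out --raster
#   python anonymizer_core_refactored_onnx.py rapport.pdf --hf --model cmarkea/distilcamembert-base-ner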