- Page scannée entièrement noire (OGC 258) : les images couvrant > 70%
de la page ne sont plus noircies (document scanné ≠ logo/signature)
- Labels DPI "Nom [■] naissance" : tokens < 3 chars ("N", "S") exclus
du raster pour éviter les FP sur les mots courts des labels
- Stop-words enrichis : betascrub, hibiscrub, fresubin, nutrison,
résorbable, nombreuses, internationale, capsule, alfa, prothèses
- FINESS blacklist : "internationale", "international", "intercommunal"
- "classification [ETABLISSEMENT] de l'infection" → corrigé
Score évaluation maintenu à 100.0/100 (A+)
Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
4116 lines
192 KiB
Python
4116 lines
192 KiB
Python
#!/usr/bin/env python3
|
||
# -*- coding: utf-8 -*-
|
||
"""
|
||
Core d'anonymisation (v2.1) + NER ONNX (optionnel, narratif uniquement)
|
||
------------------------------------------------------------------------
|
||
- Extraction 2 passes (pdfplumber -> pdfminer) + fallback 3e passe PyMuPDF si texte pauvre ou (cid:xx)
|
||
- Règles regex (PII critiques) + clé:valeur (masquer valeur seulement) + overrides YAML
|
||
- Rescan sécurité **sélectif** (EMAIL/TEL/IBAN/NIR), jamais dans [TABLES]
|
||
- Redaction PDF (vector/raster) via PyMuPDF
|
||
- NER ONNX **optionnel** (CamemBERT family) appliqué **après** les règles, sur le narratif
|
||
|
||
Dépendances : pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optionnel), transformers, optimum, onnxruntime
|
||
"""
|
||
from __future__ import annotations
|
||
import io
|
||
import json
|
||
import logging
|
||
import os
|
||
import re
|
||
from concurrent.futures import ProcessPoolExecutor
|
||
|
||
log = logging.getLogger(__name__)
|
||
from dataclasses import dataclass, field
|
||
from pathlib import Path
|
||
from typing import List, Dict, Tuple, Optional, Any
|
||
|
||
# {page_idx: [(word_text, x0_norm, y0_norm, x1_norm, y1_norm), ...]}
|
||
# Coordonnées normalisées 0→1 (format natif docTR word.geometry)
|
||
OcrWordMap = Dict[int, List[Tuple[str, float, float, float, float]]]
|
||
|
||
import pdfplumber
|
||
from pdfminer.high_level import extract_text as pdfminer_extract_text
|
||
from pdfminer.layout import LAParams
|
||
from PIL import Image, ImageDraw
|
||
|
||
try:
|
||
import fitz # PyMuPDF
|
||
except Exception:
|
||
fitz = None
|
||
|
||
try:
|
||
import yaml # PyYAML for dictionaries
|
||
except Exception:
|
||
yaml = None
|
||
|
||
try:
|
||
from doctr.models import ocr_predictor as _doctr_ocr_predictor
|
||
_DOCTR_AVAILABLE = True
|
||
except Exception:
|
||
_doctr_ocr_predictor = None # type: ignore
|
||
_DOCTR_AVAILABLE = False
|
||
|
||
try:
|
||
from detectors.hospital_filter import HospitalFilter
|
||
_HOSPITAL_FILTER_AVAILABLE = True
|
||
except Exception:
|
||
_HOSPITAL_FILTER_AVAILABLE = False
|
||
HospitalFilter = None # type: ignore
|
||
|
||
# NER manager (facultatif)
|
||
try:
|
||
from ner_manager_onnx import NerModelManager, NerThresholds
|
||
except Exception:
|
||
NerModelManager = None # type: ignore
|
||
NerThresholds = None # type: ignore
|
||
|
||
# EDS-Pseudo manager (facultatif)
|
||
try:
|
||
from eds_pseudo_manager import EdsPseudoManager
|
||
except Exception:
|
||
EdsPseudoManager = None # type: ignore
|
||
|
||
# VLM manager (facultatif)
|
||
try:
|
||
from vlm_manager import VlmManager
|
||
except Exception:
|
||
VlmManager = None # type: ignore
|
||
|
||
|
||
def _load_edsnlp_drug_names() -> set:
|
||
"""Charge les noms de médicaments mono-mot depuis edsnlp/resources/drugs.json.
|
||
Retourne un set lowercase. Fallback silencieux si edsnlp absent."""
|
||
try:
|
||
import edsnlp as _edsnlp
|
||
drugs_path = _edsnlp.BASE_DIR / "resources" / "drugs.json"
|
||
if not drugs_path.exists():
|
||
return set()
|
||
import json as _json
|
||
data = _json.loads(drugs_path.read_text(encoding="utf-8"))
|
||
result = set()
|
||
for _code, names in data.items():
|
||
for name in names:
|
||
if " " not in name and len(name) >= 4:
|
||
result.add(name.lower())
|
||
return result
|
||
except Exception:
|
||
return set()
|
||
|
||
|
||
def _load_bdpm_medication_names() -> set:
|
||
"""Charge les noms de médicaments depuis la base BDPM (data/bdpm/medication_names.txt).
|
||
Retourne un set lowercase. ~5700 noms commerciaux et DCI."""
|
||
bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medication_names.txt"
|
||
if not bdpm_path.exists():
|
||
return set()
|
||
try:
|
||
names = set()
|
||
for line in bdpm_path.read_text(encoding="utf-8").splitlines():
|
||
w = line.strip()
|
||
if w and len(w) >= 3:
|
||
names.add(w.lower())
|
||
return names
|
||
except Exception:
|
||
return set()
|
||
|
||
|
||
# ----------------- INSEE gazetteers (first names + communes) -----------------
# Populated at import time by _load_insee_gazetteers(); stay empty when the
# data files are absent.
_INSEE_PRENOMS: set = set()    # French first names, lowercase, >= 3 chars
_INSEE_COMMUNES: set = set()   # French commune names, uppercase, >= 3 chars
|
||
|
||
def _load_insee_gazetteers():
|
||
"""Charge les gazetteers INSEE (prénoms français + communes)."""
|
||
global _INSEE_PRENOMS, _INSEE_COMMUNES
|
||
data_dir = Path(__file__).parent / "data" / "insee"
|
||
|
||
# Prénoms (lowercase, >= 3 chars)
|
||
prenoms_path = data_dir / "prenoms_france.txt"
|
||
if prenoms_path.exists():
|
||
try:
|
||
_INSEE_PRENOMS = {
|
||
line.strip().lower() for line in prenoms_path.read_text(encoding="utf-8").splitlines()
|
||
if line.strip() and len(line.strip()) >= 3
|
||
}
|
||
log.info(f"Gazetteers INSEE prénoms: {len(_INSEE_PRENOMS)} entrées")
|
||
except Exception as e:
|
||
log.warning(f"Erreur chargement prénoms INSEE: {e}")
|
||
|
||
# Communes (uppercase, >= 3 chars)
|
||
communes_path = data_dir / "communes_france.txt"
|
||
if communes_path.exists():
|
||
try:
|
||
_INSEE_COMMUNES = {
|
||
line.strip().upper() for line in communes_path.read_text(encoding="utf-8").splitlines()
|
||
if line.strip() and len(line.strip()) >= 3
|
||
}
|
||
log.info(f"Gazetteers INSEE communes: {len(_INSEE_COMMUNES)} entrées")
|
||
except Exception as e:
|
||
log.warning(f"Erreur chargement communes INSEE: {e}")
|
||
|
||
# Populate the INSEE gazetteers once at module import.
_load_insee_gazetteers()
|
||
|
||
|
||
# ----------------- FINESS gazetteer (healthcare facilities) -----------------
# Populated at import time by _load_finess_gazetteers(); stay at their
# defaults when the data files are absent.
_FINESS_NUMBERS: set = set()  # FINESS numbers, 9 digits
_FINESS_ETAB_NAMES: set = set()  # facility names (lowercase)
_FINESS_TELEPHONES: set = set()  # phone numbers, 10 digits
_FINESS_VILLES: set = set()  # FINESS cities (uppercase)
_FINESS_AC = None  # Aho-Corasick automaton for distinctive facility names
_FINESS_ADDR_AC = None  # Aho-Corasick automaton for addresses (street names)
_VILLE_AC = None  # Aho-Corasick automaton for cities (INSEE + FINESS)
|
||
|
||
# Communes too ambiguous to mask (homonyms of common words, too short, etc.)
_VILLE_BLACKLIST = {
    # Directions / generic geographic words
    "SAINT", "NORD", "SUD", "EST", "OUEST",
    "CENTRE", "SERVICE", "BOURG",
    # Communes that are homonyms of common French words
    "ORANGE", "TOURS", "NICE", "SENS", "VITRE",
    "ROMANS", "MENTON", "SALON", "VIENNE",
    "BREST",  # too short and ambiguous
    "HYERES",  # close to medical terms
    "AGEN", "AUCH", "ALBI",
    "BLOIS", "LAON", "LENS",
    "GIEN", "GRAY",
    "AIRE", "LURE", "SETE", "DOLE",
    "VIRE", "LUNEL", "MURET", "MORET",
    "COEUR", "FOIX", "GIVET",
    "EVIAN", "MAURE", "MENDE",
    "JOUE", "MEAUX", "REDON",
    "CREIL", "CERGY",
    # 4-5 letter communes that are homonyms of very common words
    "VERS", "MONT", "MARS", "PORT", "PONT", "FORT",
    "BOIS", "ISLE", "LACS", "MURS", "OUST", "PREY",
    "VAUX", "VERT", "FAUX", "REZE",
    "BILLE", "PLACE", "VILLE", "COURS", "GRAND",
    "ROUGE", "RICHE", "NUITS", "SORE", "SARE",
    "TRANS", "RANS", "MARSA",
    # Common French words (6+ letters) that are also communes
    "CHARGE", "SIGNES", "BARRES", "FOSSES", "GARDES",
    "MARCHE", "LIGNES", "MOULIN", "PIERRE", "CHAISE",
    "SOURCE", "VALLEE", "MAISON", "BEAUNE", "CORPS",
    "PUITS", "CROIX", "LIGNE", "QUATRE", "PRISON",
    # Very common first names (also communes)
    "MARIE", "PIERRE", "JEAN", "PAUL", "ANNE",
    # Ambiguous compound expressions (also INSEE communes)
    "LONG", "RECY", "PLAN", "MARCHE", "SALLE",
    "CONTRE", "MERE", "ONDRES", "VEBRE",
    # Structural / medical words
    "PARIS",  # ubiquitous, source of false positives
    "FRANCE", "EUROPE",
    # Ambiguous terms (also INSEE communes) - trackare/DPI
    "COURANT",  # "Médecin courant" ≠ city
    # Body parts homonymous with communes (FP "prurit invalidant (COU, décolleté)")
    "COU", "DOS", "SEIN", "BRAS",
}
|
||
|
||
try:
|
||
import ahocorasick as _ahocorasick
|
||
_AHO_AVAILABLE = True
|
||
except ImportError:
|
||
_ahocorasick = None
|
||
_AHO_AVAILABLE = False
|
||
|
||
def _normalize_for_matching(s: str) -> str:
|
||
"""Normalise pour matching gazetteer : lowercase, sans accents, espaces collapsés."""
|
||
import unicodedata
|
||
s = s.lower().strip()
|
||
s = unicodedata.normalize("NFD", s)
|
||
s = "".join(c for c in s if unicodedata.category(c) != "Mn")
|
||
s = re.sub(r"[^a-z0-9\s\-]", " ", s)
|
||
s = re.sub(r"\s+", " ", s).strip()
|
||
return s
|
||
|
||
|
||
def _load_finess_gazetteers():
|
||
"""Charge les gazetteers FINESS (établissements, numéros, téléphones, villes, Aho-Corasick)."""
|
||
global _FINESS_NUMBERS, _FINESS_ETAB_NAMES, _FINESS_TELEPHONES, _FINESS_VILLES, _FINESS_AC
|
||
data_dir = Path(__file__).parent / "data" / "finess"
|
||
|
||
# Numéros FINESS
|
||
finess_path = data_dir / "finess_numbers.txt"
|
||
if finess_path.exists():
|
||
try:
|
||
_FINESS_NUMBERS = {
|
||
line.strip() for line in finess_path.read_text(encoding="utf-8").splitlines()
|
||
if line.strip()
|
||
}
|
||
log.info(f"Gazetteer FINESS numéros: {len(_FINESS_NUMBERS)} entrées")
|
||
except Exception as e:
|
||
log.warning(f"Erreur chargement FINESS numéros: {e}")
|
||
|
||
# Noms d'établissements complets (pour debug/référence)
|
||
noms_path = data_dir / "etablissements_noms.txt"
|
||
if noms_path.exists():
|
||
try:
|
||
_FINESS_ETAB_NAMES = {
|
||
line.strip().lower() for line in noms_path.read_text(encoding="utf-8").splitlines()
|
||
if line.strip() and len(line.strip()) >= 6
|
||
}
|
||
log.info(f"Gazetteer FINESS noms: {len(_FINESS_ETAB_NAMES)} entrées")
|
||
except Exception as e:
|
||
log.warning(f"Erreur chargement FINESS noms: {e}")
|
||
|
||
# Noms distinctifs : chargement différé (Aho-Corasick construit au premier appel,
|
||
# car _MEDICAL_STOP_WORDS_SET n'est pas encore défini à ce stade du module)
|
||
|
||
# Villes FINESS
|
||
villes_path = data_dir / "villes_finess.txt"
|
||
if villes_path.exists():
|
||
try:
|
||
_FINESS_VILLES = {
|
||
line.strip() for line in villes_path.read_text(encoding="utf-8").splitlines()
|
||
if line.strip() and len(line.strip()) >= 3
|
||
}
|
||
log.info(f"Gazetteer FINESS villes: {len(_FINESS_VILLES)} entrées")
|
||
except Exception as e:
|
||
log.warning(f"Erreur chargement FINESS villes: {e}")
|
||
|
||
# Téléphones (pour validation)
|
||
tel_path = data_dir / "telephones.txt"
|
||
if tel_path.exists():
|
||
try:
|
||
_FINESS_TELEPHONES = {
|
||
line.strip() for line in tel_path.read_text(encoding="utf-8").splitlines()
|
||
if line.strip()
|
||
}
|
||
log.info(f"Gazetteer FINESS téléphones: {len(_FINESS_TELEPHONES)} entrées")
|
||
except Exception as e:
|
||
log.warning(f"Erreur chargement FINESS téléphones: {e}")
|
||
|
||
# Populate the FINESS gazetteers once at module import.
_load_finess_gazetteers()
|
||
|
||
|
||
# ----------------- Medical whitelists -----------------
# Populated at import time by load_medical_whitelists().
_MEDICAL_STRUCTURAL_TERMS = set()  # structural medical terms, lowercase
_MEDICATION_WHITELIST = set()  # medication names, lowercase
|
||
|
||
def load_medical_whitelists():
    """Load the medical whitelists (structural terms + medication names) into module globals."""
    global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST

    # 1. Structural medical terms from the YAML config (only if PyYAML is available).
    config_path = Path("config/medical_terms_whitelist.yml")
    if config_path.exists() and yaml:
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            _MEDICAL_STRUCTURAL_TERMS = {
                t.lower() for t in data.get('medical_structural_terms', [])
            }
            log.info(f"Whitelist termes médicaux chargée: {len(_MEDICAL_STRUCTURAL_TERMS)} termes")
        except Exception as e:
            log.warning(f"Erreur chargement whitelist médicale: {e}")

    # 2. Medication whitelist: edsnlp resources + BDPM base + manual additions.
    meds = _load_edsnlp_drug_names()
    meds.update(_load_bdpm_medication_names())
    # Medications missing from both sources.
    meds.update({
        "idacio", "salazopyrine", "infliximab", "apranax",
        "ketoprofene", "prevenar", "pneumovax", "bétadine",
    })
    _MEDICATION_WHITELIST = meds
    log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)")
|
||
|
||
# Charger les whitelists au démarrage du module
|
||
# Load the whitelists once at module import.
load_medical_whitelists()
|
||
|
||
|
||
# ----------------- Defaults & Config -----------------
# Default configuration; individual entries can be overridden (cf. the YAML
# overrides mentioned in the module docstring).
DEFAULTS_CFG = {
    "version": 1,
    "encoding": "utf-8",
    "normalization": "NFKC",
    # Terms that must never be masked (section titles, role labels).
    "whitelist": {
        "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
        "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
        "org_gpe_keep": False,
    },
    # Terms / patterns that must always be masked.
    "blacklist": {
        "force_mask_terms": [],
        "force_mask_regex": [],
    },
    # key:value labels whose label part is preserved (only the value is masked).
    "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
    # Extra regex rules applied on top of the baseline patterns.
    "regex_overrides": [
        {
            "name": "OGC_court",
            "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
            "placeholder": "[OGC]",
            "flags": ["IGNORECASE"],
        }
    ],
    "flags": {
        "case_insensitive": True,
        "unicode_word_boundaries": True,
        "regex_engine": "python",
    },
}
|
||
|
||
# Mapping from PII category to the replacement placeholder inserted in the text.
PLACEHOLDERS = {
    "EMAIL": "[EMAIL]",
    "TEL": "[TEL]",
    "IBAN": "[IBAN]",
    "NIR": "[NIR]",
    "IPP": "[IPP]",
    "FINESS": "[FINESS]",
    "OGC": "[OGC]",
    "NOM": "[NOM]",
    "VILLE": "[VILLE]",
    "ETAB": "[ETABLISSEMENT]",
    "MASK": "[MASK]",
    "DATE": "[DATE]",
    "DATE_NAISSANCE": "[DATE_NAISSANCE]",
    "ADRESSE": "[ADRESSE]",
    "CODE_POSTAL": "[CODE_POSTAL]",
    "AGE": "[AGE]",
    "DOSSIER": "[DOSSIER]",
    "NDA": "[NDA]",
    "EPISODE": "[EPISODE]",
    "RPPS": "[RPPS]",
}
|
||
|
||
# PII categories treated as critical.
CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"}
|
||
|
||
# Baseline regexes (critical PII)
RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
RE_URL = re.compile(r"https?://[A-Za-z0-9._~:/?#\[\]@!$&'()*+,;=\-%]+", re.IGNORECASE)
# French phone numbers: +33 / 0 prefix then 9 digits, optional separators between digits.
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?(?:\(0\))?\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)")
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b")
# Labeled identifiers: group 1 captures the value to mask, the label is preserved.
RE_IPP = re.compile(r"\b(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
RE_CSULT = re.compile(r"\b(?:N°\s*Csult|N°\s*Interv)\s*[:\-]?\s*(\d{6,})\b", re.IGNORECASE)
RE_FINESS = re.compile(r"\b(?:N°\s*)?FINESS?\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
RE_RPPS = re.compile(r"\b(?:N°\s*)?RPPS\s*[:\-]?\s*(\d{8,11})\b", re.IGNORECASE)
# NIR (French social-security number), whitespace-tolerant; the third group also
# admits 2A/2B (Corsica). validate_nir() below checks the modulo-97 control key.
RE_NIR = re.compile(
    r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2]|2[AB])\s*(\d{2,3})\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
    re.IGNORECASE,
)
|
||
|
||
|
||
def validate_nir(nir_raw: str) -> bool:
    """Check the modulo-97 control key of a NIR (13 digits + 2-digit key).

    Supports Corsican department codes: 2A/2B are substituted by 19/18
    respectively before computing the key.
    """
    compact = re.sub(r"\s+", "", nir_raw)
    if len(compact) < 15:
        return False
    body, key = compact[:13], compact[13:15]
    # Corsica: 2A -> 19, 2B -> 18 (for the checksum computation only)
    normalized_body = body.upper().replace("2A", "19").replace("2B", "18")
    try:
        body_value = int(normalized_body)
        key_value = int(key)
    except ValueError:
        return False
    return key_value == 97 - (body_value % 97)
|
||
|
||
# Mots médicaux/techniques/courants qui ne sont pas des noms de personnes
|
||
_MEDICAL_STOP_WORDS_SET = {
|
||
# Mots français courants (déterminants, prépositions, adverbes, etc.)
|
||
"pas", "mon", "bien", "ancien", "ancienne", "bon", "bonne", "tout", "tous",
|
||
"mais", "donc", "car", "que", "qui", "avec", "dans", "pour", "sur", "par",
|
||
"les", "des", "une", "est", "son", "ses", "nos", "aux", "cette", "ces",
|
||
"cher", "chez", "entre", "sans", "sous", "vers", "selon", "après", "avant",
|
||
"puis", "aussi", "très", "plus", "moins", "peu", "non", "oui", "quelques",
|
||
"mise", "début", "fin", "suite", "fait", "lieu", "cas", "jour", "jours",
|
||
"semaine", "semaines", "mois", "temps", "place", "nouvelle", "nouveau",
|
||
"franche", "légère", "quelque", "depuis", "comme", "encore", "votre",
|
||
"date", "note", "notes", "nom", "heure", "matin", "soir", "midi",
|
||
"signé", "réalisé", "courrier", "cabinet", "rue",
|
||
# Verbes / participes courants
|
||
"remontée", "associée", "réalisée", "débuté", "prolongé", "prolongée",
|
||
"prescrit", "prescrite", "présente", "présent", "absente", "absent",
|
||
"reprise", "introduction", "arrêt", "relais",
|
||
# Titres / rôles hospitaliers
|
||
"chef", "assistant", "assistante", "praticien", "praticienne",
|
||
"docteur", "professeur", "hospitalier", "hospitalière", "hospitaliers",
|
||
"spécialiste", "contractuel", "contractuelle", "titulaire",
|
||
"confrère", "consoeur", "coordonnateur", "coordonnatrice",
|
||
"médecin", "médical", "infirmier", "infirmière",
|
||
"praticiens", "patient", "patiente",
|
||
# Structure hospitalière
|
||
"service", "pôle", "clinique", "consultation", "secrétariat",
|
||
"hôpital", "hôpitaux", "centre", "établissement", "polyclinique",
|
||
# Villes / géographie (pas des noms de personnes)
|
||
"bordeaux", "bayonne", "paris", "lyon", "lille", "marseille",
|
||
"toulouse", "nantes", "montpellier", "pessac", "biarritz", "soustons",
|
||
"basque", "basques", "sud", "côte",
|
||
# Médicaments génériques et spécialités (DCI + noms commerciaux)
|
||
"colchicine", "aspirine", "cortancyl", "bisoprolol", "entresto",
|
||
"methotrexate", "eplerenone", "speciafoldine", "prednisone",
|
||
"corticoïdes", "cortisone",
|
||
"paracetamol", "metformine", "solupred", "novorapid", "abasaglar",
|
||
"lovenox", "methylprednisolone", "potassium", "humalog", "furosemide",
|
||
"insuline", "trulicity", "forxiga", "atorvastatine", "amlodipine",
|
||
"ondansetron", "eliquis", "nebivolol", "gaviscon", "loxen",
|
||
"morphine", "oxycodone", "kardegic", "tercian", "zopiclone",
|
||
"seresta", "tramadol", "alprazolam", "forlax", "levothyrox",
|
||
"bromazepam", "gliclazide", "zymad", "pravastatine", "spiriva",
|
||
"quetiapine", "sertraline", "crestor", "lercanidipine", "amoxicilline",
|
||
"opocalcium", "ferinject", "candesartan", "ceftriaxone", "calcidose",
|
||
"laroxyl", "brintellix", "ketoprofene", "adrenaline", "exacyl",
|
||
"terbutaline", "ipratropium", "actiskenan", "vialebex", "oxynormoro",
|
||
"lansoprazole", "perindopril", "sodium", "velmetia",
|
||
"doliprane", "dafalgan", "efferalgan", "spasfon", "vogalene",
|
||
"augmentin", "inexium", "omeprazole", "pantoprazole", "esomeprazole",
|
||
"ramipril", "lisinopril", "enalapril", "losartan", "valsartan",
|
||
"irbesartan", "olmesartan", "telmisartan", "hydrochlorothiazide",
|
||
"spironolactone", "furosemide", "lasilix", "aldactone",
|
||
"tahor", "crestor", "rosuvastatine", "simvastatine", "fluvastatine",
|
||
"xarelto", "pradaxa", "apixaban", "rivaroxaban", "dabigatran",
|
||
"plavix", "clopidogrel", "ticagrelor", "brilique",
|
||
"ventoline", "seretide", "symbicort", "salmeterol", "fluticasone",
|
||
"salbutamol", "tiotropium", "budesonide", "beclometasone",
|
||
"oxycodone", "oxynorm", "skenan", "actiskenan", "fentanyl",
|
||
"nubain", "nalbuphine", "nefopam", "acupan", "profenid",
|
||
"ibuprofene", "diclofenac", "naproxene", "celecoxib",
|
||
"gabapentine", "pregabaline", "lyrica", "neurontin",
|
||
"amitriptyline", "duloxetine", "venlafaxine", "fluoxetine",
|
||
"paroxetine", "escitalopram", "citalopram", "mirtazapine",
|
||
"olanzapine", "risperidone", "aripiprazole", "haloperidol",
|
||
"loxapine", "cyamemazine", "diazepam", "oxazepam", "lorazepam",
|
||
"clonazepam", "midazolam", "hydroxyzine", "atarax", "melatonine",
|
||
"stilnox", "zolpidem", "imovane",
|
||
"levothyroxine", "metformine", "glimepiride", "sitagliptine",
|
||
"januvia", "jardiance", "empagliflozine", "dapagliflozine",
|
||
"ozempic", "semaglutide", "dulaglutide", "liraglutide", "victoza",
|
||
"heparine", "enoxaparine", "tinzaparine", "innohep",
|
||
"warfarine", "coumadine", "fluindione", "previscan",
|
||
"ciprofloxacine", "levofloxacine", "ofloxacine", "metronidazole",
|
||
"vancomycine", "gentamicine", "tazocilline", "piperacilline",
|
||
"meropenem", "imipenem", "clindamycine", "doxycycline",
|
||
"azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
|
||
"polyionique", "propranolol", "apidra", "solostar",
|
||
# Noms et suffixes laboratoires pharmaceutiques
|
||
"arw", "myl", "myp", "arg", "teva", "bga", "agt",
|
||
"mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers",
|
||
"accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed",
|
||
"evolugen", "alter", "zydus", "medisol", "substipharm",
|
||
"sdz", "bgr", "egt", "rnb",
|
||
# Formes galéniques / voies d'administration
|
||
"cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
|
||
"flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
|
||
"unidose", "perf", "inh", "seringue", "aerosol", "sach", "pdr",
|
||
"orodisp", "capsule", "patch", "suppositoire", "gouttes",
|
||
# Termes de prescription / pharmacie
|
||
"prescription", "prescriptions", "dose", "fréquence", "statut",
|
||
"technique", "capteur", "bandelettes", "glycemiques", "glycemique",
|
||
"lancettes", "aiguilles", "fines", "micro", "pompe", "réserve",
|
||
"glycemie", "capillaire", "hgt",
|
||
# Termes médicaux / cliniques
|
||
"myocardite", "myosite", "corticothérapie", "biopsie", "pathologie",
|
||
"dysimmunitaire", "récidive", "récidivante", "traitement", "diagnostic",
|
||
"antécédents", "examen", "bilan", "résultats", "analyse",
|
||
"interne", "externe", "médecine", "chirurgie", "rhumatologie",
|
||
"dermatologie", "immunologie", "cardiologie", "pneumologie",
|
||
"neurologie", "gynécologie", "radiologie", "sénologie",
|
||
"douleur", "douleurs", "douloureux", "musculaire", "musculaires",
|
||
"thoracique", "thoraciques", "membres", "supérieurs", "inférieurs",
|
||
"normale", "normaux", "habituelle", "habituelles",
|
||
"synthèse", "hospitalisation", "syndrome", "vaccination", "ophtalmo",
|
||
"pelvien", "diabétique", "sommeil", "régime", "diet",
|
||
"desinfection", "environnement", "identification", "bracelet",
|
||
"toilettes", "accompagner", "installer", "transfusion",
|
||
"signes", "vitaux", "alimentaire", "avis", "zone",
|
||
"calcémie",
|
||
# Abréviations médicales
|
||
"irm", "ett", "ecg", "mtx", "fevg", "bdc", "crp", "sfu", "hdj",
|
||
"bnp", "asat", "alat", "cpk", "ctc", "hba", "hba1c",
|
||
"saos", "tsh", "inr", "vgm", "pnn", "plq", "hb",
|
||
"poc", "bax", "act", "bic", "cfx", "acc", "ado", "acf", "vfo",
|
||
"qvl", "cci", "pse", "pca", "chl", "crt", "bbm", "pds", "ren",
|
||
"vit", "zen",
|
||
"scanner", "radio", "écho", "échographie",
|
||
# Spécialités médicales (éviter faux positifs NOM)
|
||
"hépato-gastro-entérologue", "gastro-entérologue", "gastro-entérologie",
|
||
"proctologue", "oncologue", "anesthésiste", "pneumologue", "gérontologue",
|
||
"cardiologue", "néphrologue", "urologue", "gériatre",
|
||
"hépatologue", "endocrinologue", "stomatologue",
|
||
# Termes médicaux / titres fréquemment détectés comme NOM par le NER
|
||
"supplémentation", "supplementation", "endocrinologie", "monsieur", "madame",
|
||
"suivi", "sortie", "emog", "ophtalmo",
|
||
# Médicaments détectés comme NOM/PRENOM par EDS-Pseudo
|
||
"eliquis", "trulicity", "saos", "wind", "taxotere", "eupantol", "ezetimibe",
|
||
"lansoyl", "xatral", "xenetix", "trimbow", "buspirone", "cetirizine",
|
||
"depakote", "versatis", "durogesic", "montelukast", "metformine", "viatris",
|
||
"rosuvastatine", "gliclazide", "amlodipine", "perindopril", "nebivolol",
|
||
"pravastatine", "bisoprolol", "amoxicilline", "kardegic", "lovenox",
|
||
# Termes médicaux / soins / actes détectés comme NOM
|
||
"partielle", "cutanee", "cutané", "cutanée", "osseuse", "diabetique",
|
||
"diabétique", "transdermique", "transderm", "diarrhees", "diarrhées",
|
||
"ionogramme", "scintigraphie", "thoraco", "thorax", "négative", "negative",
|
||
"diététicienne", "pressurise", "pressuriser", "inhalee", "inhalée", "inhal",
|
||
# Mots courants français détectés comme NOM dans les trackare
|
||
"toilette", "repas", "poche", "installation", "education", "éducation",
|
||
"refection", "réfection", "complete", "complète", "regime", "régime",
|
||
"normal", "traité", "traite", "arrêté", "arrete", "volume",
|
||
"commentaires", "france", "covid", "framboise", "epoux", "époux",
|
||
# Abréviations médicales courtes (3-4 chars) détectées comme NOM
|
||
"ide", "ipp", "pcr", "tap", "gel", "ahl", "ssr", "hds", "tca", "etp",
|
||
"mcg", "sdz", "iao", "ser", "orod", "clav", "disp", "cart", "atcd", "mdrd",
|
||
"amox", "endoc", "microg", "item", "pyélo", "néphro",
|
||
# En-têtes de colonnes / mots structurels trackare
|
||
"observations", "observation", "commentaires", "commentaire",
|
||
"surveillance", "température", "temperature", "glycémie", "glycemie",
|
||
"diurèse", "diurese", "balance", "pouls", "systolique", "diastolique",
|
||
"saturation", "fréquence", "frequence", "respiratoire", "douleur",
|
||
"alertes", "alerte", "antécédents", "antecedents", "habitus",
|
||
"allergies", "prescriptions", "prescription", "administration",
|
||
"catégorie", "categorie", "expiration", "message",
|
||
"destination", "diagnostique", "diagnostiques",
|
||
"date", "note", "nom", "heure", "type", "code", "etat",
|
||
"comprime", "comprimé", "gelule", "gélule", "solution", "injectable",
|
||
# Médicaments supplémentaires détectés dans les trackare
|
||
"depakote", "versatis", "humalog", "forxiga", "durogesic",
|
||
"montelukast", "rosuvastatine",
|
||
# Abréviations pharma courtes
|
||
"cpr", "sol", "bic", "agt", "poche", "inhal",
|
||
# Termes chirurgicaux/cliniques FP
|
||
"cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée",
|
||
"gauche", "droit", "droite", "face", "profil",
|
||
# Faux positifs EDS supplémentaires
|
||
"psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
|
||
"axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
|
||
"10mg", "20mg", "40mg", "100mg", "300ui", "500ml", "innohep", "coaprovel",
|
||
"actiskenan", "simvastatine", "forlax",
|
||
# Mots temporels / contextuels détectés comme EDS_HOPITAL
|
||
"semaine", "jour", "matin", "soir", "nuit", "midi",
|
||
# Mots clés de contexte document
|
||
"compétences", "maladies", "inflammatoires", "systémiques", "rares",
|
||
"fret", "fax", "contexte", "résultat", "resultat", "résultats", "resultats",
|
||
"haute", "maison", "aide", "rpps", "poste", "fonct",
|
||
"sante", "santé", "etxe", "ttipi", "gastro", "concha",
|
||
"endoscopie", "endoscopique", "fibroscopie",
|
||
"indication", "conclusion", "technique", "anesthésie",
|
||
"digestif", "digestive", "digestives", "nutritive",
|
||
# Abréviations soins trackare détectées comme NOM (batch 20 OGC)
|
||
"soins", "lit", "jeun", "lever", "pose", "surv", "ggt", "vvp",
|
||
# Verbes d'instructions soins (aussi des patronymes INSEE → FP)
|
||
"coucher", "manger", "marcher", "sortir",
|
||
"verif", "crop", "evs", "maco", "pan", "cet", "trou", "nit", "nfs",
|
||
# Mots narratifs CRH capturés par fusion sidebar 2-colonnes
|
||
"evolution", "évolution", "explorations", "fermeture", "allergie", "allergies",
|
||
"lotissement", "cholangiographie", "cholecystectomie", "cholécystectomie",
|
||
"paracetamol", "paracétamol", "unité", "unite",
|
||
# FP résiduels batch 10 OGC (termes médicaux/instructions soins)
|
||
"glyc", "glycosurie", "vider", "forte",
|
||
# FP audit batch 59 OGC (mots courants/médicaux flagués comme NOM)
|
||
"oncologie", "confrères", "confrere", "doubles", "chers", "motif",
|
||
"responsable", "autre", "autres", "autonome", "autonomes",
|
||
"préparations", "preparations", "prévenir", "prevenir",
|
||
"acétylsalicylique", "acetylsalicylique", "angio",
|
||
"desc", "diu", "barreau",
|
||
"haitz", "alde",
|
||
# FP audit OGC 21 — termes médicaux/courants flagués NOM_GLOBAL
|
||
"alimentation", "augmentation", "amelioration", "amélioration",
|
||
"biliaire", "biliaires", "bili", "voies", "voie",
|
||
"apyrexie", "apyréxie", "apyrétique", "apyretique",
|
||
"clavulanique", "mecillinam", "sulfamides", "sulfamide",
|
||
"tazobactam", "temocilline", "ecoflac", "furanes", "furane",
|
||
"exilar", "lipruzet", "mopral",
|
||
"sensible", "sensibles", "dossier", "dossiers",
|
||
"entero", "entéro", "medecine", "bio",
|
||
"aviation", "contention", "isolement",
|
||
"elimination", "élimination", "infectieux",
|
||
"hémodynamique", "hemodynamique", "pancréatite", "pancreatite",
|
||
"cholecystite", "cholécystite", "cholécystectomie", "cholecystectomie",
|
||
"appendicectomie", "néoplasie", "neoplasie",
|
||
"ovarienne", "prandial", "fébrile", "febrile",
|
||
"eupnéique", "eupneique", "normocarde", "normotendue",
|
||
"variable", "dosage", "posologie",
|
||
# Abréviations diététiques/soins trackare
|
||
"bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
|
||
# FP audit OGC 17 CRH
|
||
"mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
|
||
"strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
|
||
"saint-palais", "tarnos", "hendaye", "dax", "orthez", "oloron", "pau", "cambo",
|
||
# Spécialités/services récurrents comme FP NOM
|
||
"cancérologie", "cancerologie", "réanimation", "reanimation",
|
||
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
|
||
"gériatrie", "geriatrie", "pédiatrie", "pediatrie",
|
||
"ophtalmologie", "stomatologie", "allergologie",
|
||
"kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie",
|
||
"orthopédie", "orthopedie", "traumatologie",
|
||
"palliatifs", "palliative", "palliatif",
|
||
"addictologie", "alcoologie", "tabacologie",
|
||
# FP soignants trackare (mots courants capturés par patterns Note d'évolution / Signé / Flacon)
|
||
"discussion", "echelle", "échelle", "scope", "tdm", "bouteille",
|
||
"evendol", "relais", "repas", "poursuite", "indication",
|
||
# FP pattern timestamp (termes ALL-CAPS capturés par "HH:MM NOM")
|
||
"eliminatin", "elimination", "élimination", "preremplie", "pré-remplie",
|
||
"thermie", "alim", "alimentation", "admin",
|
||
# Médicaments/tests labo capturés par patterns soignants
|
||
"biprofenid", "bi-profenid", "phosphatase", "phosphatases",
|
||
"ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol",
|
||
"ciprofloxacine", "lavement", "desinfection", "désinfection",
|
||
"avaler", "rachis", "lombaire", "thoraco-lombaire",
|
||
"cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique",
|
||
"thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire",
|
||
# Dosages et labos pharma (FP fréquents dans prescriptions Trackare)
|
||
"faible", "fort", "forte",
|
||
"myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg",
|
||
"arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris",
|
||
"abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal",
|
||
"entree", "entrée", "continu", "continue",
|
||
"morphine", "claforan", "skenan", "actiskenan",
|
||
# Fragments de noms de médicaments (pdfplumber split)
|
||
"sium", "pegic", "fenid", "profenid",
|
||
# Catégories cliniques Trackare (en-têtes de section masqués à tort)
|
||
"respi", "respiratoire", "nephro", "cardio", "neuro", "onco", "pulmo",
|
||
"hemato", "hémato", "infectieux", "thermie", "diurese", "diurèse",
|
||
"transit", "anemie", "anémie", "constantes", "examen",
|
||
"post-op", "postop", "pré-op", "preop", "chimio", "elim",
|
||
"toilette", "sommeil", "hypota", "hypotension", "spo2",
|
||
"urine", "urines", "sng",
|
||
"rénale", "renale", "rénal", "renal", "cardiaque",
|
||
# Termes structurels trackare
|
||
"transmissions", "transmission", "releve", "relevé",
|
||
"objectif", "objectifs", "evaluation", "évaluation",
|
||
"planification", "planifié", "planifiee",
|
||
# ── FP détectés automatiquement par audit_fp_detector.py ──
|
||
# Lot 2 : tokens basse confiance (DICT_FR seul) clairement non-noms
|
||
"acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
|
||
"bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
|
||
"devenir", "diffusé", "douche", "entrée", "escarre", "espace",
|
||
"explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
|
||
"germes", "glace", "habillage", "liste", "maquillage", "matelas",
|
||
"mettre", "obésité", "ongles", "palais", "perlant", "pertes",
|
||
"pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
|
||
"tenue", "texte", "transaminases", "transit", "transmis", "urinal",
|
||
"vernis", "vessie", "vrac",
|
||
# Lot 2 : termes médicaux (préfixes/suffixes)
|
||
"anatomo-pathologique", "anemie", "anémie", "angioscanner",
|
||
"cétonurie", "cetonurie", "depilation", "dépilation",
|
||
"folique", "gastroentérologue", "gastroenterologue",
|
||
"microgrammes", "nalidixique", "naso-gastrique",
|
||
"angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
|
||
"cyto", "plaie-colle", "bionolyte",
|
||
# Lot 1 (103 tokens, confiance >= 0.5) ──
|
||
# Anatomie / clinique
|
||
"abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
|
||
"intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
|
||
"plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
|
||
# Pathologies / symptômes
|
||
"algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
|
||
"hemodialyse", "hemorragique", "hyperthermie", "hématologue",
|
||
# Médicaments / matériel médical
|
||
"ampoule", "antalgique", "antiseptique", "compresse", "flacon",
|
||
"oxygène", "pansement", "vitamine",
|
||
# Biologie / examens
|
||
"biochimie", "biologie", "fer",
|
||
# Actions / états cliniques
|
||
"ablation", "absence", "admission", "bloc", "changement", "cliniquement",
|
||
"cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
|
||
"intervention", "position", "rappel", "relation", "retour", "réalisation",
|
||
"résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
|
||
"urgent", "validation",
|
||
# Mots courants / contextuels
|
||
"angle", "bille", "boisson", "bureau", "cases", "circuit",
|
||
"concubin", "confortable", "demain", "densité", "dernière",
|
||
"distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
|
||
"hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
|
||
"personne", "premier", "quartier", "retraite", "route", "rés",
|
||
"trouve", "verrouillé", "villa", "étage",
|
||
# Termes médicaux courants faussement détectés comme NOM (Phase 2 audit mars 2026)
|
||
"ains", "ponction", "hanche", "burkitt", "orl", "gds", "oap", "tvp", "epp",
|
||
"bronchite", "accueil", "cadre", "transfert", "relecture", "examens",
|
||
"traitements", "traitement", "infectiologie", "cancérologie", "cancerologie",
|
||
"maternité", "orale", "sachet", "absence",
|
||
# FP audit 30 fichiers Phase 2 (mars 2026)
|
||
"bouffee", "bouffée", "discontinue", "respimat", "lyoc",
|
||
"probnp", "pro-bnp", "nt-probnp",
|
||
"bpco", "colle", "gsc", "masse",
|
||
"selle", "selles",
|
||
# Acronymes médicaux courts (3 lettres) souvent FP comme NOM
|
||
"epo", "irc", "sib", "inr", "iec", "ira", "ait", "avc",
|
||
"imc", "ipp", "ivo", "amp", "ivg", "img", "had", "ssr",
|
||
"hta", "ecg", "irm", "tep", "crp", "nfs", "bhc", "vgm",
|
||
"vni", "aeg", "bas", "snv", "hba", "ide", "dci",
|
||
# Termes pharmaceutiques FP comme NOM (audit 30 fichiers mars 2026)
|
||
"buvable", "buvables", "nominal", "nominaux",
|
||
"acide", "principale", "principal", "principaux",
|
||
"hyaluronique", "valproique", "valproïque", "tranexamique", "tranéxamique",
|
||
"clavulanique", "nalidixique",
|
||
"grancher", # Centre de réadaptation (nom d'établissement homonyme)
|
||
"experf", # Prestataire HAD (nom commercial homonyme)
|
||
# Noms de services hospitaliers (FP comme [NOM])
|
||
"ortho", "mobile", "polyvalente", "polyvalent",
|
||
"geriatrie", "gériatrie", "ambulatoire", "provisoire",
|
||
"intraveineuse", "intraveineux", "sous-cutanee", "sous-cutané",
|
||
# Noms de services hospitaliers (aussi patronymes INSEE → FP récurrents)
|
||
"viscerale", "viscérale", "vasculaire", "vasculaires",
|
||
"conventionnelle", "conventionnel",
|
||
"polyvalente", "polyvalent",
|
||
"infectieuse", "infectieuses",
|
||
# Termes soins infirmiers / activités de la vie quotidienne (FP trackare doc 216)
|
||
"aide", "partielle", "partiel", "complete", "complète", "complet",
|
||
"contention", "lavabo", "blader", "scan", "post", "lunettes",
|
||
"deshabillage", "déshabillage", "habillage",
|
||
"surveillance", "surv", "refection", "réfection",
|
||
"miction", "toilette", "douche", "changes",
|
||
"installation", "transfert", "mobilisation",
|
||
"alimentation", "hydratation", "collation",
|
||
"stimulation", "prevention", "prévention",
|
||
# Termes pharmaceutiques/matériel médical FP (retour relecteur 2026-03-16)
|
||
"chlorure",
|
||
# Dispositifs médicaux (FP "OXYGENE LUNETTES" → [NOM])
|
||
"canule", "canules", "masque", "sonde", "sondes",
|
||
# Termes chirurgicaux FP comme [NOM] (retour relecteur 2026-03-17)
|
||
"totale", "total", "partielle", "partiel",
|
||
"prothese", "prothèse", "protheses", "prothèses", "unicompartimentale",
|
||
# Antiseptiques / produits de soins (FP trackare prescriptions)
|
||
"betascrub", "hibiscrub", "betadine", "biseptine", "chlorhexidine",
|
||
# Nutrition entérale / compléments
|
||
"fresubin", "nutrison", "sondalis", "isosource", "novasource",
|
||
# Termes médicaux FP dans bactério / texte libre
|
||
"nombreuses", "nombreux", "plusieurs", "quelques",
|
||
"internationale", "international",
|
||
"resorbable", "résorbable", "resorbables", "résorbables",
|
||
"alfa", "capsule", "capsules",
|
||
}
|
||
# Automatic enrichment with the ~4000 drug names shipped with edsnlp.
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())

# BDPM enrichment: ~7300 commercial drug names + INN / active substances.
_bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt"
if _bdpm_path.exists():
    # Keep non-empty, non-comment lines only.
    _bdpm_words = [
        token
        for token in (raw.strip() for raw in _bdpm_path.read_text(encoding="utf-8").splitlines())
        if token and not token.startswith("#")
    ]
    _MEDICAL_STOP_WORDS_SET.update(_bdpm_words)
    log.info("BDPM stop-words chargés : %d mots", len(_bdpm_words))
|
||
|
||
# Alternation over every medical stop-word. The words are sorted longest-first
# (ties broken alphabetically) so that, in regex alternation — which matches
# the leftmost alternative that succeeds — a longer entry is always tried
# before any entry that is a prefix of it. The sort also makes the compiled
# pattern deterministic: plain set iteration order varies with PYTHONHASHSEED.
_MEDICAL_STOP_WORDS = (
    r"(?:"
    + "|".join(
        re.escape(w)
        for w in sorted(_MEDICAL_STOP_WORDS_SET, key=lambda w: (-len(w), w))
    )
    + r")"
)
|
||
# A person-name token: starts with an uppercase letter, then letters, hyphens
# or apostrophes (no whitespace, no dot).
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"

# Person name introduced by an honorific or role context ("Dr X", "Nom : X",
# "Rédigé par X", ...). Captures at most 3 name tokens; `[ \t]` separators
# keep the match from crossing a newline.
RE_PERSON_CONTEXT = re.compile(
    r"(?:(?:\bDr\.?|\bDR\.?|\bDocteur|\bPr\.?|\bProfesseur|\bMme|\bMME|\bMadame|\bM\.|\bMr\.?|\bMonsieur"
    r"|\bNom[ \t]*:[ \t]*"
    r"|\bRédigé[ \t]+par|\bValidé[ \t]+par|\bSigné[ \t]+par|\bSaisi[ \t]+par|\bRéalisé[ \t]+par"
    r")[ \t]+)"
    rf"({_PERSON_TOKEN}(?:[ \t]+{_PERSON_TOKEN}){{0,2}})"  # max 3 words, no newline
)

# Uppercase names in comma-separated lists (e.g. "le Dr X, Y, LAZARO").
RE_DR_COMMA_LIST = re.compile(
    r"(?:Dr\.?|DR\.?|Docteur)\s+"
    r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+"
    r"(?:\s*,\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+)+",
    re.IGNORECASE,
)
# Name token: a capitalised word of at least 3 letters.
_NAME_TOKEN_RE = re.compile(r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']{2,}")
# Field splitter for key/value style lines: colon, pipe, semicolon or tab.
SPLITTER = re.compile(r"\s*[:|;\t]\s*")
|
||
|
||
# --- Global extraction of names from structured fields ---
# Uppercase-initial name token (letters, hyphens, apostrophes).
_UC_NAME_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
# "Patient(e) : NOM PRENOM" — captures the name tokens, stopped by a
# lookahead for "Né…", "né…", "N°…" or end of line.
RE_EXTRACT_PATIENT = re.compile(
    r"Patient\(?e?\)?\s*:\s*"
    rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)"
    r"(?=\s+Né|\s+né|\s+N°|\s*$)",
    re.MULTILINE,
)
# Structured identity fields (trackare / hospital EHR documents)
RE_EXTRACT_NOM_NAISSANCE = re.compile(
    r"Nom\s+de\s+naissance\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s+IPP|\s*$)",
    re.MULTILINE,
)
# "Nom et Prénom : X" — stopped by "Date", "Né" or end of line.
RE_EXTRACT_NOM_PRENOM = re.compile(
    r"Nom\s+et\s+Pr[ée]nom\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s+Date|\s+Né|\s*$)",
    re.MULTILINE,
)
# "Nom utilisé : X" (usual / married name field).
RE_EXTRACT_NOM_UTILISE = re.compile(
    r"Nom\s+utilis[ée]\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)",
    re.MULTILINE,
)
# "Prénom de naissance : X" / "Prénom utilisé : X".
RE_EXTRACT_PRENOM = re.compile(
    r"Pr[ée]nom\s+(?:de\s+naissance|utilis[ée])\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)",
    re.MULTILINE,
)
# "Lieu de naissance : X" — first letter may be lowercase.
RE_EXTRACT_LIEU_NAISSANCE = re.compile(
    r"Lieu\s+de\s+naissance\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)",
    re.MULTILINE,
)
# "Ville de résidence : X".
RE_EXTRACT_VILLE_RESIDENCE = re.compile(
    r"Ville\s+de\s+r[ée]sidence\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)",
    re.MULTILINE,
)
# Structured contacts: Conjoint/Concubin/Epoux/Epouse/Parent… + NOM [PRENOM]
# (group 2, the first name, is optional).
RE_EXTRACT_CONTACT = re.compile(
    r"(?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur)\s+"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+)"
    r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+))?",
)
# Authoring context: "Rédigé/Validé/Signé/Saisi par NOM [PRENOM [PRENOM]]".
RE_EXTRACT_REDIGE = re.compile(
    r"(?:Rédigé|Validé|Signé|Saisi)[ \t]+par[ \t]+"
    rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# Compound uppercase name token: JEAN-PIERRE, CAZELLES-BOUDIER, etc.
_UC_COMPOUND = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*"
# Honorific + optional initial(s) + up to 5 name tokens ("Mme J. DUPONT …").
RE_EXTRACT_MME_MR = re.compile(
    r"(?:MMES|MME|Mmes|Mme|Madame|Mesdames|Monsieur|Messieurs|Mrs|Mr\.?)\s+"
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?"
    rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,4}})",
)
|
||
# Comma-separated lists after an honorific: "Mmes Anorga, Goyenaga, Martinez et Murcy"
_CNAME = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']"
RE_CIVILITE_COMMA_LIST = re.compile(
    r"(?:Mmes|Mme|Mesdames|Mrs|Mr|Messieurs|Monsieur|Madame|Dr\.?|Docteur)\s+"
    + _CNAME + r"+"
    + r"(?:\s*,\s*" + _CNAME + r"+)*"
    + r"(?:\s*,?\s*\bet\s+" + _CNAME + r"+)?",
    re.IGNORECASE,
)
# Optional leading initial(s), e.g. "J." or "J.-M.".
_INITIAL_OPT = r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?"
# "Dr [J.[-M.]] NOM [NOM [NOM]]" — addressee of a letter.
RE_EXTRACT_DR_DEST = re.compile(
    r"\b(?:DR\.?|Dr\.?|Docteur)[ \t]+"
    + _INITIAL_OPT +
    rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# Medical staff names after a role: "Aide : Marie-Paule BORDABERRY"
# (optional Title-Case first name, then 1–3 ALL-CAPS surname tokens).
RE_EXTRACT_STAFF_ROLE = re.compile(
    r"\b(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH|Cadre[ \t]+Infirmier"
    r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)\b[ \t]*:?[ \t]*"
    r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:[ \t]*-[ \t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?[ \t]+)?"
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[ \t\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})",
)
# "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL"
RE_EXTRACT_PR = re.compile(
    r"(?:Pr\.?|Professeur)[ \t]+"
    + _INITIAL_OPT +
    rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# "Opérateur : Docteur X. NOM", "Anesthésiste(s) Docteur J. NOM",
# "Opérateur : Dr J.-M. NOM", "Anesthésiste : NOM"
RE_EXTRACT_OPERATEUR = re.compile(
    r"(?:Op[ée]rateur|Anesth[ée]siste\(?s?\)?|Chirurgien)[ \t]*:?[ \t]*"
    r"(?:(?:Docteur|Dr\.?|Pr\.?)[ \t]+)?"
    + _INITIAL_OPT +
    rf"((?:{_UC_COMPOUND})(?:[ \t]+(?:{_UC_COMPOUND})){{0,2}})",
)
# "Courrier Epi - NOM, PRENOM" header (discharge letters)
RE_EXTRACT_COURRIER = re.compile(
    r"Courrier\s+(?:Epi|Ep[ée]ph[ée]m[eé]ride|Hospit)\s*[\-–]\s*"
    rf"((?:{_UC_NAME_TOKEN})(?:\s*,\s*(?:{_UC_NAME_TOKEN}))*)",
)
# "CABINET ETXEBARNONDOA", "Cabinet Médical DUPONT", "CABINET MEDICAL DU DR MACHIN"
RE_EXTRACT_CABINET = re.compile(
    r"\bCABINET\s+(?:M[ÉEe]DICAL\s+)?(?:DU\s+)?(?:DR\.?\s+)?"
    rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
    re.IGNORECASE,
)
# Phone number carrying a slash extension: 05.59.44.38.32/34
RE_TEL_SLASH = re.compile(
    r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?:/\d{1,4})(?!\d)"
)

# Leftover glyph references from broken PDF font maps, e.g. "(cid:42)".
CID_PATTERN = re.compile(r"\(cid:\d+\)")

# --- Mr/Mme + lone initial (e.g. "Mme Z", "Mr R") ---
RE_CIVILITE_INITIALE = re.compile(
    r"\b((?:Mme|MME|Madame|Monsieur|Mr\.?|M\.)\s+)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])(?=[\s,.\-\)\]:;!?]|$)"
)

# --- Imaging exam / patient number (radiology) ---
RE_NUM_EXAMEN_PATIENT = re.compile(
    r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient|accession|passage)\s*[:\-]?\s*([A-Za-z]{0,4}\d{5,12})",
    re.IGNORECASE,
)
|
||
|
||
# --- Lieu-dit / named-house / housing-estate addresses ---
RE_ADRESSE_LIEU_DIT = re.compile(
    r"\b(?:MAISON|LOT|LOTISSEMENT|RESIDENCE|RÉSIDENCE|MAS|LIEU[\s\-]DIT|DOMAINE|HAMEAU|QUARTIER)\s+"
    r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']{2,}"
    r"(?:\s+\d{1,4})?",
    re.IGNORECASE,
)
# Common French lieu-dit names (on a line of their own = an address)
RE_LIEU_DIT_SEUL = re.compile(
    r"^[ \t]*(Le\s+Bourg|Le\s+Village|Le\s+Hameau|Le\s+Château|Le\s+Moulin|La\s+Place|Le\s+Clos)[ \t]*$",
    re.IGNORECASE | re.MULTILINE,
)

# --- Dates, street addresses, ages, file numbers ---
_MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)"
# Birth date introduced by "né(e) le", "date de naissance" or "DDN".
RE_DATE_NAISSANCE = re.compile(
    r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
    r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})",
    re.IGNORECASE,
)
# Generic date: dd/mm/yyyy (separators / . -) or "dd <month> yyyy".
RE_DATE = re.compile(
    r"\b(\d{1,2})\s*[/.\-]\s*(\d{1,2})\s*[/.\-]\s*(\d{4})\b"
    r"|"
    r"\b(\d{1,2})\s+" + _MOIS_FR + r"\s+(\d{4})\b",
    re.IGNORECASE,
)
# Street address: house number + way type + name.
RE_ADRESSE = re.compile(
    r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*"
    r"(?:rue|avenue|av\.?|boulevard|bd\.?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence"
    r"|lotissement|lot\.?|cit[ée]|hameau|quartier|voie|parvis|esplanade|promenade|côte)"
    r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}",
    re.IGNORECASE,
)
RE_CODE_POSTAL = re.compile(
    r"(?:(?:[Cc]ode\s*[Pp]ostal|CP)\s*[:\-]?\s*(\d{5}))"
    r"|"
    # 5 digits + town name (Title Case or ALL CAPS), not preceded by a digit (avoids RPPS)
    # Medical units (UI, mg, ml, ...) are excluded via a negative lookahead
    r"(?:(?<!\d)(\d{5})[ \t]+(?!UI\b|mg\b|ml\b|µg\b)[A-ZÉÈÀÙ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû]+"
    r"(?:[\s\-][A-ZÉÈÀÙ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû]+)*"
    r"(?:\s+CEDEX)?)",
)
# Post-office box ("BP 123"), optionally preceded by a locality word.
RE_BP = re.compile(
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû\.\-]+\s+)?BP\s+\d+",
    re.IGNORECASE,
)
# Age: "âgé de 82 ans", "patiente de 82 ans", ", 82 ans", "(82 A".
RE_AGE = re.compile(
    r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+(?:de\s+)?|(?:,\s*|\(\s*)"
    r")(\d{1,3})\s*(?:ans|A)\b",
    re.IGNORECASE,
)
|
||
# Healthcare facilities: long acronyms may stand alone; short ones (CH/CHS)
# require a following proper name.
_ETAB_NAME = (r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
              r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)")
RE_ETABLISSEMENT = re.compile(
    r"\b("
    # Long acronyms: accepted alone or followed by a name
    r"(?:EHPAD|SSR/USLD|SSR|USLD|HAD|CSAPA|CMPP|CMP|UGA|CHRU|CHU|HIA|CLCC|GHT|GCS)"
    + _ETAB_NAME + r"*"
    r"|"
    # Short acronyms (CH, CHS): a name is mandatory to avoid false positives
    r"(?:CHS|CH)" + _ETAB_NAME + r"+"
    r")",
)
# "Hôpital/Clinique/… de <Ville>"; the lookbehind rejects "Examen clinique".
RE_HOPITAL_VILLE = re.compile(
    r"(?<![Ee]xamen )"
    r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier"
    r"|[Cc]entre\s+[Mm][ée]dical|[Cc]entre\s+[Dd]e\s+[Ss]oins|[Mm]aison\s+[Dd]e\s+[Ss]anté"
    r"|[Mm]aison\s+[Dd]e\s+[Rr]etraite|[Rr]ésidence|[Ff]oyer|[Pp]harmacie)"
    r"\s+(?:de\s+|d['']\s*|du\s+|des\s+)?(?:la\s+|le\s+|l['']\s*|les\s+)?"
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
    r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?(?:la\s+|le\s+|l['']\s*|les\s+)?"
    r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
)
# Hospital unit: "Service/Unité/Pôle/Département de <Nom>".
RE_SERVICE = re.compile(
    r"\b((?:[Ss]ervice|[Uu]nité|[Pp]ôle|[Dd]épartement)\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
    r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
    r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
)
# File number: "dossier : X", "n° dossier X", "NDA", or "référence"/"réf.".
RE_NUMERO_DOSSIER = re.compile(
    r"(?:\bdossier|\bn°\s*dossier|\bNDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})"
    r"|"
    r"(?:\bréférence|\bréf\.)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})",
    re.IGNORECASE,
)
# Care-episode number, both "N° épisode : X" and "Épisode No. : 12345" forms.
RE_EPISODE = re.compile(
    r"N°\s*[ÉéEe]pisode\s*[:\-]?\s*([A-Za-z0-9\-]{4,})"
    r"|"
    r"[ÉéEe]pisode\s*N[o°.]?\s*\.?\s*:?\s*(\d{5,})",
    re.IGNORECASE,
)
# "N° venue" (bacteriology), "N° séjour" — hospital-stay identifiers.
RE_VENUE_SEJOUR = re.compile(
    r"(?:N[°o]?\s*venue|N[°o]?\s*séjour|N[°o]?\s*de\s+séjour"
    r"|num[ée]ro\s+de\s+(?:venue|séjour))\s*[:\-]?\s*(\d{4,})",
    re.IGNORECASE,
)
|
||
|
||
@dataclass
class PiiHit:
    """One PII detection event, recorded for the audit trail."""

    page: int  # 0-based page index where the hit occurred
    kind: str  # detector tag, e.g. "EMAIL", "TEL", "FINESS", "NIR"
    original: str  # matched original text
    placeholder: str  # replacement token written into the output
    # Optional (x0, y0, x1, y1) hint for locating the hit during PDF redaction.
    bbox_hint: Optional[Tuple[float, float, float, float]] = None
|
||
|
||
@dataclass
class AnonResult:
    """Result of anonymising one document."""

    text_out: str  # anonymised narrative text
    tables_block: str  # anonymised [TABLES] section
    audit: List[PiiHit] = field(default_factory=list)  # every masking event
    is_trackare: bool = False  # whether the document was detected as trackare-style
|
||
|
||
# ----------------- Config loader -----------------
|
||
|
||
def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
    """Load the anonymisation configuration.

    Starts from a shallow copy of ``DEFAULTS_CFG`` and overlays any top-level
    keys found in the optional YAML file at *config_path*. The YAML overlay is
    best-effort: if PyYAML is unavailable, the file is missing, or it cannot
    be parsed, the defaults are returned (a parse failure is logged instead of
    being silently swallowed).

    Parameters
    ----------
    config_path : Optional[Path] — user YAML config file, or None.

    Returns
    -------
    Dict[str, Any] — merged configuration (defaults + user overrides).
    """
    cfg = DEFAULTS_CFG.copy()
    if config_path and config_path.exists() and yaml is not None:
        try:
            user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
            cfg.update(user)  # top-level keys only, same as the original loop
        except Exception:
            # Best effort: keep defaults, but leave a trace for debugging.
            log.warning("Config YAML illisible (%s), valeurs par défaut conservées", config_path)
    return cfg
|
||
|
||
# ----------------- Extraction -----------------
|
||
|
||
# Process-wide cache for the (expensive to build) docTR predictor.
_doctr_model_cache = None


def _get_doctr_model():
    """Return the shared docTR OCR predictor, instantiating it on first use."""
    global _doctr_model_cache
    if _doctr_model_cache is not None:
        return _doctr_model_cache
    _doctr_model_cache = _doctr_ocr_predictor(
        det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True
    )
    return _doctr_model_cache
|
||
|
||
def _extract_page_layout_aware(page) -> str:
    """Extract a PyMuPDF page's text while handling multi-column layouts.

    Detects whether the page has a sidebar/left column running parallel to a
    right-hand body (typical of French hospital discharge/operative reports).
    If so, each column is read separately to avoid interleaving the text.

    Parameters
    ----------
    page — a PyMuPDF page object (must support ``get_text("blocks")`` and
        ``rect``).

    Returns
    -------
    str — plain text with blocks joined by newlines ("" if the page has no
    text blocks).
    """
    # Keep only non-empty text blocks as (x0, y0, x1, y1, text).
    blocks = page.get_text("blocks")
    text_blocks = []
    for b in blocks:
        x0, y0, x1, y1, text, _block_no, block_type = b
        if block_type == 0 and text.strip():  # block_type 0 = text block
            text_blocks.append((x0, y0, x1, y1, text.strip()))
    if not text_blocks:
        return ""

    page_w = page.rect.width
    page_h = page.rect.height

    # --- Column detection ---
    # Scan candidate vertical split lines split_x (15%–45% of page width,
    # step 3pt); keep the one that best separates the blocks into two tall,
    # vertically-overlapping groups.
    best_split = None
    best_score = -1
    for split_x in range(int(page_w * 0.15), int(page_w * 0.45), 3):
        left = [b for b in text_blocks if b[2] <= split_x + 5]
        right = [b for b in text_blocks if b[0] >= split_x - 5]
        crossing = [b for b in text_blocks if b[0] < split_x - 5 and b[2] > split_x + 5]
        if len(left) < 3 or len(right) < 3:
            continue  # too few blocks on one side: not a real column
        left_span = max(b[3] for b in left) - min(b[1] for b in left)
        right_span = max(b[3] for b in right) - min(b[1] for b in right)
        if left_span < page_h * 0.25 or right_span < page_h * 0.25:
            continue  # each column must span a significant part of the page
        overlap_min = max(min(b[1] for b in left), min(b[1] for b in right))
        overlap_max = min(max(b[3] for b in left), max(b[3] for b in right))
        if overlap_max - overlap_min < page_h * 0.15:
            continue  # the two groups must actually run side by side
        # Blocks straddling the split line are strong evidence against it
        # (hence the x5 penalty).
        score = len(left) + len(right) - 5 * len(crossing)
        if score > best_score:
            best_score = score
            best_split = split_x

    if best_split is not None:
        # Two-column layout: emit full-width headers, then the left column,
        # then the right column, then full-width footers — each group sorted
        # top-to-bottom.
        left_blocks = sorted(
            [b for b in text_blocks if b[2] <= best_split + 5], key=lambda b: b[1]
        )
        right_blocks = sorted(
            [b for b in text_blocks if b[0] >= best_split - 5], key=lambda b: b[1]
        )
        full_width = sorted(
            [b for b in text_blocks if b[0] < best_split - 5 and b[2] > best_split + 5],
            key=lambda b: b[1],
        )
        # Full-width blocks above the first column block are headers; the
        # rest are footers.
        col_start_y = min(
            min((b[1] for b in left_blocks), default=page_h),
            min((b[1] for b in right_blocks), default=page_h),
        )
        headers = [b for b in full_width if b[1] < col_start_y + 5]
        footers = [b for b in full_width if b[1] >= col_start_y + 5]
        parts = []
        for b in headers:
            parts.append(b[4])
        for b in left_blocks:
            parts.append(b[4])
        for b in right_blocks:
            parts.append(b[4])
        for b in footers:
            parts.append(b[4])
        return "\n".join(parts)
    else:
        # Single-column page: plain top-to-bottom, left-to-right reading order.
        sorted_blocks = sorted(text_blocks, key=lambda b: (b[1], b[0]))
        return "\n".join(b[4] for b in sorted_blocks)
|
||
|
||
|
||
def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List[str]], bool, OcrWordMap]:
    """Multi-pass text extraction with an OCR (docTR) fallback.

    Returns ``(pages_text, tables_lines, ocr_used, ocr_word_map)``.

    Pass 1 : layout-aware PyMuPDF (blocks with column detection)
    Pass 1b: pdfplumber when PyMuPDF fails or yields little text
    Pass 2 : pdfminer when CID garbage or poor text is detected
    Pass 3 : docTR OCR for scanned PDFs (pages with very little text)
    Tables : always extracted via pdfplumber (independently of the text).
    """
    pages_text: List[str] = []
    tables_lines: List[List[str]] = []
    ocr_used = False

    # --- Tables: always via pdfplumber ---
    with pdfplumber.open(pdf_path) as pdf:
        for p in pdf.pages:
            rows: List[str] = []
            try:
                tables = p.extract_tables()
                for tbl in tables or []:
                    for row in tbl:
                        # None cells become empty strings before joining.
                        clean = [c if c is not None else "" for c in row]
                        rows.append("\t".join(clean).strip())
            except Exception:
                pass  # best effort: a page with broken tables yields no rows
            tables_lines.append(rows)

    # --- Pass 1: layout-aware PyMuPDF (multi-column detection) ---
    if fitz is not None:
        try:
            doc = fitz.open(str(pdf_path))
            pages_text = [_extract_page_layout_aware(doc[i]) for i in range(len(doc))]
            doc.close()
        except Exception:
            pass  # fall through to the next pass

    # --- Pass 1b: pdfplumber when PyMuPDF produced little or nothing ---
    total_chars = sum(len(x or "") for x in pages_text)
    if total_chars < 500:
        try:
            with pdfplumber.open(pdf_path) as pdf:
                pp_pages = [p.extract_text(x_tolerance=2.5, y_tolerance=4.0) or "" for p in pdf.pages]
                # Keep the pdfplumber result only if it is strictly richer.
                if sum(len(x) for x in pp_pages) > total_chars:
                    pages_text = pp_pages
        except Exception:
            pass

    # --- Pass 2: pdfminer when CID garbage or poor text ---
    total_chars = sum(len(x or "") for x in pages_text)
    need_fallback = total_chars < 500
    if not need_fallback:
        # "(cid:NN)" artefacts indicate a broken font map: retry with pdfminer.
        need_fallback = any(CID_PATTERN.search(x or "") for x in pages_text)
    if need_fallback:
        try:
            text_all = pdfminer_extract_text(
                str(pdf_path),
                laparams=LAParams(char_margin=2.0, word_margin=0.1, line_margin=0.8, boxes_flow=0.5),
            )
            # pdfminer separates pages with form-feed characters.
            split = [x for x in text_all.split("\f") if x]
            if split and sum(len(x) for x in split) > total_chars:
                pages_text = split
        except Exception:
            pass

    # --- Pass 3: docTR OCR on text-poor pages ---
    # No global threshold: only the individual pages with little text
    # (< 150 chars) are OCRed, then the better result is kept per page.
    # Pages that are already rich in text are never touched.
    _OCR_PAGE_THRESHOLD = 150  # minimum chars for a page to count as "text OK"
    total_chars = sum(len(x or "") for x in pages_text)
    ocr_word_map: OcrWordMap = {}
    sparse_pages = [i for i, p in enumerate(pages_text) if len(p or "") < _OCR_PAGE_THRESHOLD]
    if sparse_pages and _DOCTR_AVAILABLE and fitz is not None:
        try:
            model = _get_doctr_model()
            doc = fitz.open(str(pdf_path))
            import numpy as np
            ocr_replaced = 0
            for i in sparse_pages:
                if i >= len(doc):
                    continue
                # Rasterise the page at 300 dpi for OCR.
                pix = doc[i].get_pixmap(dpi=300)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                result = model([np.array(img)])
                page_text = ""
                # docTR word geometries are normalised 0→1 (see OcrWordMap).
                page_words: List[Tuple[str, float, float, float, float]] = []
                for block in result.pages[0].blocks:
                    for line in block.lines:
                        for w in line.words:
                            (x0, y0), (x1, y1) = w.geometry
                            page_words.append((w.value, x0, y0, x1, y1))
                        page_text += " ".join(w.value for w in line.words) + "\n"
                # Replace only if OCR yields more text than extraction did.
                if len(page_text) > len(pages_text[i] or ""):
                    pages_text[i] = page_text
                    ocr_word_map[i] = page_words
                    ocr_replaced += 1
            doc.close()
            if ocr_replaced > 0:
                ocr_used = True
                log.info("OCR docTR : %d/%d pages remplacées", ocr_replaced, len(sparse_pages))
        except Exception as e:
            log.warning("OCR docTR échoué : %s", e)
            ocr_word_map = {}
    return pages_text, tables_lines, ocr_used, ocr_word_map
|
||
|
||
|
||
# Backward-compatibility alias: older callers expect only the two-tuple.
def extract_text_three_passes(pdf_path: Path):
    """Legacy wrapper around :func:`extract_text_with_fallback_ocr`.

    Returns only ``(pages_text, tables_lines)``, discarding the OCR flag and
    the OCR word map.
    """
    pages, tables, _ocr_used, _word_map = extract_text_with_fallback_ocr(pdf_path)
    return pages, tables
|
||
|
||
# ----------------- Helpers -----------------
|
||
|
||
def _compile_user_regex(pattern: str, flags_list: List[str]):
|
||
flags = 0
|
||
for f in flags_list or []:
|
||
u = f.upper()
|
||
if u == "IGNORECASE": flags |= re.IGNORECASE
|
||
if u == "MULTILINE": flags |= re.MULTILINE
|
||
if u == "DOTALL": flags |= re.DOTALL
|
||
return re.compile(pattern, flags)
|
||
|
||
|
||
def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Apply user-configured masking rules from *cfg* to a single line.

    Three rule families, in order:
      1. ``regex_overrides`` — user regexes with a custom placeholder;
      2. ``blacklist.force_mask_terms`` — literal words, case-insensitive,
         whole-word, masked with the generic placeholder;
      3. ``blacklist.force_mask_regex`` — user regexes, case-insensitive,
         masked with the generic placeholder.
    Every replacement is appended to *audit*. Invalid user patterns are
    skipped silently (best effort on user-provided config).

    Returns the (possibly rewritten) line.
    """
    for ov in cfg.get("regex_overrides", []) or []:
        pattern = ov.get("pattern"); placeholder = ov.get("placeholder", PLACEHOLDERS["MASK"]) ; name = ov.get("name", "override")
        flags_list = ov.get("flags", [])
        try:
            rx = _compile_user_regex(pattern, flags_list)
        except Exception:
            continue  # skip an invalid user regex
        # Safe closure: rx.sub() runs within this iteration, so _rep sees the
        # current placeholder/name (no late-binding issue here).
        def _rep(m: re.Match):
            audit.append(PiiHit(page_idx, name, m.group(0), placeholder))
            return placeholder
        line = rx.sub(_rep, line)
    # force-mask literals
    for term in (cfg.get("blacklist", {}).get("force_mask_terms", []) or []):
        if not term: continue
        word_rx = re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)
        if word_rx.search(line):
            # One audit entry per term per line, even if it occurs repeatedly.
            audit.append(PiiHit(page_idx, "force_term", term, PLACEHOLDERS["MASK"]))
            line = word_rx.sub(PLACEHOLDERS["MASK"], line)
    # force-mask regex
    for pat in (cfg.get("blacklist", {}).get("force_mask_regex", []) or []):
        try:
            rx = re.compile(pat, re.IGNORECASE)
        except Exception:
            continue  # skip an invalid user regex
        def _repl_force_regex(m: re.Match, _pat=pat):
            audit.append(PiiHit(page_idx, "force_regex", m.group(0), PLACEHOLDERS["MASK"]))
            return PLACEHOLDERS["MASK"]
        line = rx.sub(_repl_force_regex, line)
    return line
|
||
|
||
|
||
# Bare 9-digit number (candidate FINESS for the gazetteer check below).
RE_BARE_9DIGITS = re.compile(r"\b(\d{9})\b")

def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask administrative identifiers (FINESS, OGC, IPP, consultation number,
    RPPS) in *line*, recording each hit in *audit*.

    The first matching identifier family wins: the function returns as soon as
    one rule fires, so at most one family is masked per call.
    """
    m = RE_FINESS.search(line)
    if m:
        val = m.group(1); audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"]))
        return RE_FINESS.sub(lambda _: f"FINESS : {PLACEHOLDERS['FINESS']}", line)

    # Gazetteer-based FINESS detection: any bare 9-digit number that matches
    # a real FINESS number from the reference list.
    if _FINESS_NUMBERS:
        for m9 in RE_BARE_9DIGITS.finditer(line):
            if m9.group(1) in _FINESS_NUMBERS:
                val = m9.group(1)
                audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"]))
                # NOTE(review): like the labelled branches above, this returns
                # on the first hit — only one gazetteer FINESS is masked per
                # line; confirm multiple FINESS on one line cannot occur.
                line = line.replace(val, PLACEHOLDERS["FINESS"], 1)
                return line

    m = RE_OGC.search(line)
    if m:
        val = m.group(1); audit.append(PiiHit(page_idx, "OGC", val, PLACEHOLDERS["OGC"]))
        return RE_OGC.sub(lambda _: f"N° OGC : {PLACEHOLDERS['OGC']}", line)
    m = RE_IPP.search(line)
    if m:
        val = m.group(1); audit.append(PiiHit(page_idx, "IPP", val, PLACEHOLDERS["IPP"]))
        return RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line)
    m = RE_CSULT.search(line)
    if m:
        val = m.group(1); audit.append(PiiHit(page_idx, "DOSSIER", val, PLACEHOLDERS["DOSSIER"]))
        return RE_CSULT.sub(lambda _: f"N° : {PLACEHOLDERS['DOSSIER']}", line)
    m = RE_RPPS.search(line)
    if m:
        val = m.group(1); audit.append(PiiHit(page_idx, "RPPS", val, PLACEHOLDERS["RPPS"]))
        return RE_RPPS.sub(lambda _: f"RPPS : {PLACEHOLDERS['RPPS']}", line)
    return line
|
||
|
||
|
||
def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Apply the full regex PII-masking pass to one line of narrative text.

    Order matters: user overrides first, then the critical PII (email, URL,
    phone, IBAN, NIR), then dates of birth, addresses, identifiers, and
    finally establishment/city/person-name heuristics. Each replacement is
    recorded as a :class:`PiiHit` in ``audit``.

    Args:
        line: the text line to mask (modified copy is returned).
        audit: accumulator for every masked hit (mutated in place).
        page_idx: 0-based page index recorded in the hits.
        cfg: YAML config dict (whitelists, overrides).

    Returns:
        The line with every detected PII span replaced by its placeholder.
    """
    # User overrides & force-masks first.
    line = _apply_overrides(line, audit, page_idx, cfg)

    # EMAIL
    def _repl_email(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
        return PLACEHOLDERS["EMAIL"]
    line = RE_EMAIL.sub(_repl_email, line)

    # URLs (all of them — they can identify establishments, people, services)
    def _repl_url(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "URL", m.group(0), PLACEHOLDERS["MASK"]))
        return PLACEHOLDERS["MASK"]
    line = RE_URL.sub(_repl_url, line)
    # Protocol-less URL (www.xxx.fr)
    _re_url_www = re.compile(r"(?<!\S)www\.[a-z0-9\-]+\.(?:fr|com|org|net|eu)(?:/[^\s]*)?", re.IGNORECASE)
    line = _re_url_www.sub(_repl_url, line)

    # TEL
    def _repl_tel(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
        return PLACEHOLDERS["TEL"]
    line = RE_TEL_SLASH.sub(_repl_tel, line)  # slash variant first (more specific)
    line = RE_TEL.sub(_repl_tel, line)
    line = RE_TEL_COMPACT.sub(_repl_tel, line)

    # IBAN
    def _repl_iban(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "IBAN", m.group(0), PLACEHOLDERS["IBAN"]))
        return PLACEHOLDERS["IBAN"]
    line = RE_IBAN.sub(_repl_iban, line)

    # NIR (with modulo-97 key validation)
    def _repl_nir(m: re.Match) -> str:
        raw = m.group(0)
        if not validate_nir(raw):
            return raw  # false positive, leave unmasked
        audit.append(PiiHit(page_idx, "NIR", raw, PLACEHOLDERS["NIR"]))
        return PLACEHOLDERS["NIR"]
    line = RE_NIR.sub(_repl_nir, line)

    # DATE_NAISSANCE (more specific, before any generic DATE)
    def _repl_date_naissance(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "DATE_NAISSANCE", m.group(0), PLACEHOLDERS["DATE_NAISSANCE"]))
        return PLACEHOLDERS["DATE_NAISSANCE"]
    line = RE_DATE_NAISSANCE.sub(_repl_date_naissance, line)

    # Generic DATE — disabled: only dates of birth are masked
    # def _repl_date(m: re.Match) -> str:
    #     audit.append(PiiHit(page_idx, "DATE", m.group(0), PLACEHOLDERS["DATE"]))
    #     return PLACEHOLDERS["DATE"]
    # line = RE_DATE.sub(_repl_date, line)

    # ADRESSE
    def _repl_adresse(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "ADRESSE", m.group(0), PLACEHOLDERS["ADRESSE"]))
        return PLACEHOLDERS["ADRESSE"]
    line = RE_ADRESSE.sub(_repl_adresse, line)

    # PO box (BP)
    def _repl_bp(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "ADRESSE", m.group(0), PLACEHOLDERS["ADRESSE"]))
        return PLACEHOLDERS["ADRESSE"]
    line = RE_BP.sub(_repl_bp, line)

    # CODE_POSTAL
    def _repl_code_postal(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "CODE_POSTAL", m.group(0), PLACEHOLDERS["CODE_POSTAL"]))
        return PLACEHOLDERS["CODE_POSTAL"]
    line = RE_CODE_POSTAL.sub(_repl_code_postal, line)

    # AGE
    def _repl_age(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "AGE", m.group(0), PLACEHOLDERS["AGE"]))
        return PLACEHOLDERS["AGE"]
    line = RE_AGE.sub(_repl_age, line)

    # Medical record number / NDA
    def _repl_dossier(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "DOSSIER", m.group(0), PLACEHOLDERS["DOSSIER"]))
        return PLACEHOLDERS["DOSSIER"]
    line = RE_NUMERO_DOSSIER.sub(_repl_dossier, line)

    # Exam no. / imaging patient no. (radiology)
    def _repl_num_examen(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
        return m.group(0).replace(m.group(1), PLACEHOLDERS["DOSSIER"])
    line = RE_NUM_EXAMEN_PATIENT.sub(_repl_num_examen, line)

    # Locality / house / housing-estate addresses (e.g. "MAISON ARGAINA 94")
    def _repl_lieu_dit(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "ADRESSE", m.group(0), PLACEHOLDERS["ADRESSE"]))
        return PLACEHOLDERS["ADRESSE"]
    line = RE_ADRESSE_LIEU_DIT.sub(_repl_lieu_dit, line)

    # Common locality names alone on a line (e.g. "Le BOURG", "Le Village")
    line = RE_LIEU_DIT_SEUL.sub(
        lambda m: (audit.append(PiiHit(page_idx, "ADRESSE", m.group(1), PLACEHOLDERS["ADRESSE"])) or PLACEHOLDERS["ADRESSE"]),
        line,
    )

    # Episode no. / "Episode N." (Trackare footers)
    def _repl_episode(m: re.Match) -> str:
        val = m.group(1) or m.group(2) or m.group(0)
        audit.append(PiiHit(page_idx, "EPISODE", val, PLACEHOLDERS["EPISODE"]))
        # Rebuild the replacement keeping the prefix and masking the value
        full = m.group(0)
        return full[:full.find(val)] + PLACEHOLDERS["EPISODE"]
    line = RE_EPISODE.sub(_repl_episode, line)

    # Visit no. / stay no. (BACTERIO, Trackare)
    def _repl_venue(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "NDA", m.group(0), PLACEHOLDERS["NDA"]))
        full = m.group(0)
        val = m.group(1)
        return full[:full.find(val)] + PLACEHOLDERS["NDA"]
    line = RE_VENUE_SEJOUR.sub(_repl_venue, line)

    # Health-care establishments (EHPAD Bayonne, SSR La Concha, Hôpital de Bayonne, etc.)
    def _repl_etab(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "ETAB", m.group(0), PLACEHOLDERS["ETAB"]))
        return PLACEHOLDERS["ETAB"]
    line = RE_ETABLISSEMENT.sub(_repl_etab, line)
    line = RE_HOPITAL_VILLE.sub(_repl_etab, line)

    # Establishments via the FINESS Aho-Corasick gazetteer (116K distinctive names)
    # Note: _mask_finess_establishments() builds the automaton lazily on first call
    line, finess_matched = _mask_finess_establishments(line, return_matched_names=True)
    for matched_name in finess_matched:
        audit.append(PiiHit(page_idx, "ETAB_FINESS", matched_name, PLACEHOLDERS["ETAB"]))

    # Addresses via the FINESS Aho-Corasick gazetteer (28K street names)
    line, addr_matched = _mask_finess_addresses(line, return_matched_names=True)
    for matched_addr in addr_matched:
        audit.append(PiiHit(page_idx, "ADDR_FINESS", matched_addr, PLACEHOLDERS["ADRESSE"]))

    # Spaced-out header text: "C E N T R E H O S P I T A L I E R D E ..."
    # Uppercase letters separated by spaces defeat every normal detector.
    # Strategy: if one segment contains an establishment keyword, mask the WHOLE
    # spaced line (every contiguous segment) so "D E L A C ÔT E B A S Q U E"
    # is not left behind.
    _RE_SPACED_TEXT = re.compile(
        r'(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\s){4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]'
    )
    # NOTE: a broader `_RE_SPACED_FULL_LINE` pattern previously compiled here was
    # never used anywhere in this function and has been removed (dead code).
    _SPACED_ETAB_KEYWORDS = {
        "HOSPITALIER", "HOSPITALIERE", "HOSPITALIERES", "HOSPITALIERS",
        "CLINIQUE", "HOPITAL", "HÔPITAL", "POLYCLINIQUE",
        "CENTRE", "ETABLISSEMENT", "MAISON", "RESIDENCE",
        "EHPAD", "SSR", "USLD", "CHU", "CHRU",
    }
    spaced_matches = list(_RE_SPACED_TEXT.finditer(line))
    if spaced_matches:
        # Check whether at least one segment contains an establishment keyword
        has_etab_keyword = False
        for m_sp in spaced_matches:
            collapsed_upper = m_sp.group(0).replace(" ", "").upper()
            if any(kw in collapsed_upper for kw in _SPACED_ETAB_KEYWORDS):
                has_etab_keyword = True
                break
        if has_etab_keyword:
            # Mask the whole spaced span (start of first match → end of last match)
            full_start = spaced_matches[0].start()
            full_end = spaced_matches[-1].end()
            full_span = line[full_start:full_end]
            audit.append(PiiHit(page_idx, "ETAB_SPACED", full_span, PLACEHOLDERS["ETAB"]))
            line = line[:full_start] + PLACEHOLDERS["ETAB"] + line[full_end:]

    # Cities via the Aho-Corasick gazetteer (INSEE + FINESS)
    if _VILLE_AC is None:
        _build_ville_ac()
    if _VILLE_AC is not None:
        line, ville_originals = _mask_ville_gazetteers(line)
        for vo in ville_originals:
            audit.append(PiiHit(page_idx, "VILLE_GAZ", vo, PLACEHOLDERS["VILLE"]))

    # Hospital departments (service de Cardiologie, unité de soins palliatifs, etc.)
    def _repl_service(m: re.Match) -> str:
        full_match = m.group(0)
        # Preserve structural medical terms
        if full_match.lower() in _MEDICAL_STRUCTURAL_TERMS:
            return full_match
        # Check the preceding context (Chef de, Praticien, etc.)
        start_pos = m.start()
        context_before = line[max(0, start_pos-25):start_pos].lower()
        # Contexts to preserve
        preserve_patterns = ['chef de', 'praticien', 'ancien', 'assistant', 'médecin', 'interne']
        if any(pattern in context_before for pattern in preserve_patterns):
            return full_match
        audit.append(PiiHit(page_idx, "ETAB", full_match, PLACEHOLDERS["MASK"]))
        return PLACEHOLDERS["MASK"]
    line = RE_SERVICE.sub(_repl_service, line)

    # City in a letter header: "Bayonne, le 12/03/2024" → mask the city.
    # The "Word, le [date]" context is reliable (comma required).
    # Allows lowercase linking words (de, du, la, sur, en, lès).
    _re_ville_date = re.compile(
        r"^(\s*)"
        r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç\-]+"
        r"(?:\s+(?:de|du|la|sur|en|lès|les|l['']\s*)?"
        r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)*)"
        r"(\s*,\s+le\s+\d{1,2})",
        re.MULTILINE,
    )
    def _repl_ville_date(m: re.Match) -> str:
        ville = m.group(2).strip()
        audit.append(PiiHit(page_idx, "VILLE", ville, PLACEHOLDERS["VILLE"]))
        return m.group(1) + PLACEHOLDERS["VILLE"] + m.group(3)
    line = _re_ville_date.sub(_repl_ville_date, line)

    # Structured fields: birthplace, city of residence (direct masking, no stop-word filter)
    _re_lieu = re.compile(r"(Lieu\s+de\s+naissance\s*:\s*)(\S.+)")
    def _repl_lieu(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "VILLE", m.group(2).strip(), PLACEHOLDERS["VILLE"]))
        return m.group(1) + PLACEHOLDERS["VILLE"]
    line = _re_lieu.sub(_repl_lieu, line)

    _re_ville_res = re.compile(r"(Ville\s+de\s+r[ée]sidence\s*:\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+)")
    def _repl_ville_res(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "VILLE", m.group(2).strip(), PLACEHOLDERS["VILLE"]))
        return m.group(1) + PLACEHOLDERS["VILLE"]
    line = _re_ville_res.sub(_repl_ville_res, line)

    # Uppercase PERSON with context; whitelist / short acronyms are preserved
    wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
    wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])

    _stop_rx = re.compile(_MEDICAL_STOP_WORDS, re.IGNORECASE)

    def _clean_name_span(span: str) -> str:
        """Truncate the span at the first medical/stop word."""
        tokens = span.split()
        clean = []
        for t in tokens:
            if _stop_rx.fullmatch(t):
                break
            clean.append(t)
        return " ".join(clean).strip(" .-'")

    def _repl_person_ctx(m: re.Match) -> str:
        span = m.group(1).strip(); raw = m.group(0)
        if span in wl_sections or raw in wl_phrases: return raw
        # Truncate before medical words
        cleaned = _clean_name_span(span)
        if not cleaned:
            return raw
        tokens = [t for t in cleaned.split() if t]
        if len(tokens) == 1 and len(tokens[0]) <= 4: return raw
        audit.append(PiiHit(page_idx, "NOM", cleaned, PLACEHOLDERS["NOM"]))
        return raw.replace(cleaned, PLACEHOLDERS["NOM"])

    line = RE_PERSON_CONTEXT.sub(_repl_person_ctx, line)

    # Mr/Mme + isolated initial: "Mme Z", "Mr R" → mask the letter
    def _repl_civilite_init(m: re.Match) -> str:
        prefix = m.group(1)
        lettre = m.group(2)
        audit.append(PiiHit(page_idx, "NOM", lettre, PLACEHOLDERS["NOM"]))
        return prefix + PLACEHOLDERS["NOM"]
    line = RE_CIVILITE_INITIALE.sub(_repl_civilite_init, line)

    # Extra pass: names in comma-separated lists after "Dr"
    # e.g. "le Dr DUVAL, MACHELART, LAZARO" → mask each name
    for m in RE_DR_COMMA_LIST.finditer(line):
        fragment = m.group(0)
        # Extract comma-separated segments (the first one includes "Dr")
        parts = [p.strip() for p in fragment.split(",")]
        for part in parts:
            # Extract name tokens from each segment
            for tok in _NAME_TOKEN_RE.findall(part):
                if tok in wl_sections or len(tok) <= 3:
                    continue
                if _stop_rx.fullmatch(tok):
                    continue
                if tok not in line:
                    continue
                # Make sure it is not already masked
                if f"[{tok}]" in line or tok in {v for v in PLACEHOLDERS.values()}:
                    continue
                audit.append(PiiHit(page_idx, "NOM", tok, PLACEHOLDERS["NOM"]))
                line = re.sub(rf"\b{re.escape(tok)}\b", PLACEHOLDERS["NOM"], line)

    return line
|
||
|
||
|
||
def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
    """Mask TEL, EMAIL, ADRESSE and CODE_POSTAL even inside the *key* part
    of a key:value line.

    Needed because lines such as "13 avenue ... BAYONNE - Tel : 0559" are
    split on ':' and the PII ends up on the key side.
    """
    def _masked(kind: str, pattern, text: str) -> str:
        # Substitute every match of `pattern`, auditing each hit under `kind`.
        placeholder = PLACEHOLDERS[kind]

        def _repl(m: re.Match) -> str:
            audit.append(PiiHit(page_idx, kind, m.group(0), placeholder))
            return placeholder

        return pattern.sub(_repl, text)

    # Order matters: the slash phone variant is more specific and runs first.
    passes = (
        ("TEL", RE_TEL_SLASH),
        ("TEL", RE_TEL),
        ("TEL", RE_TEL_COMPACT),
        ("EMAIL", RE_EMAIL),
        ("ADRESSE", RE_ADRESSE),
        ("CODE_POSTAL", RE_CODE_POSTAL),  # also covers the city after the code
    )
    for kind, pattern in passes:
        key = _masked(kind, pattern, key)

    # FINESS street-name gazetteer (Aho-Corasick)
    key, matched_addrs = _mask_finess_addresses(key, return_matched_names=True)
    for addr in matched_addrs:
        audit.append(PiiHit(page_idx, "ADDR_FINESS", addr, PLACEHOLDERS["ADRESSE"]))
    return key
|
||
|
||
|
||
def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    """Mask a "key : value" line, applying the full regex pass to the value side.

    The key side still receives the critical-PII pass (phones, emails,
    addresses), since the ':'-split can leave PII on that side. Lines with
    no separator are treated entirely as narrative text.
    """
    line = _mask_admin_label(line, audit, page_idx)
    pieces = SPLITTER.split(line, maxsplit=1)
    if len(pieces) != 2:
        # No key/value separator: full masking on the whole line.
        return _mask_line_by_regex(line, audit, page_idx, cfg)
    raw_key, raw_value = pieces
    safe_key = _mask_critical_in_key(raw_key, audit, page_idx).strip()
    safe_value = _mask_line_by_regex(raw_value, audit, page_idx, cfg).strip()
    return f"{safe_key} : {safe_value}"
|
||
|
||
# ----------------- Extraction globale de noms -----------------
|
||
|
||
def _is_trackare_document(text: str) -> bool:
|
||
"""Détecte si le document est un export Trackare/TrakCare (DPI structuré)."""
|
||
markers = ["Détails des patients", "Nom de naissance", "Dossier Patient"]
|
||
t = text[:3000].lower()
|
||
return sum(1 for m in markers if m.lower() in t) >= 2
|
||
|
||
|
||
def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set]:
    """Parse the structured fields of a Trackare document to extract PII.

    Returns ``(name_tokens, pii_hits, force_names)``: the name tokens to
    mask, pre-located hits (recorded with page index -1, i.e. document-wide),
    and the subset of names coming from highly reliable structured contexts
    (DR., Signé, ...) that bypass the stop-word filter.
    """
    names: set = set()
    hits: List[PiiHit] = []

    force_names: set = set()  # names from structured contexts (DR., Signé, etc.) → bypass stop words

    def _add_name(s: str) -> None:
        # Add every capitalized token of length >= 4, plus the full compound.
        s = s.strip()
        parts = s.split()
        for tok in parts:
            tok = tok.strip(" .-'(),")
            if len(tok) >= 4 and tok[0].isupper():
                names.add(tok)
        # Also keep the full compound name (DI LULLO, LE MOIGNE, etc.)
        if len(parts) >= 2:
            compound = " ".join(t.strip(" .-'(),") for t in parts if len(t.strip(" .-'(),")) >= 2)
            if len(compound) >= 5:
                names.add(compound)

    # Frequent non-name terms seen in Signé / DR. / progress-note contexts
    _FORCE_EXCLUDE = _MEDICATION_WHITELIST | {
        "elimination", "élimination", "forte", "intraveineuse", "lavage",
        "sonde", "normal", "réalisé", "realise", "germes", "bbm", "arw",
        "orale", "sachet", "injectable", "comprime", "comprimé", "gelule",
        "gélule", "seringue", "poche", "flacon", "ampoule", "preremplie",
        "préremplie",
    }

    def _add_name_force(tok: str) -> None:
        """Add a name found in a reliable structured context (DR., direct Signé,
        progress note). Bypasses the general stop words but still filters out
        medications and common care-related terms."""
        tok = tok.strip(" .-'(),")
        if len(tok) < 4 or not tok[0].isupper():
            return
        if tok.lower() in _FORCE_EXCLUDE:
            return
        # Extra filter: never force-add known medical words
        if tok.lower() in _MEDICAL_STOP_WORDS_SET:
            return
        names.add(tok)
        force_names.add(tok)

    # --- Patient identity ---
    # "Nom de naissance: DIEGO" (may appear twice: header + tabular recap)
    for m in re.finditer(r"Nom\s+de\s+naissance\s*:\s*(.+?)(?:\s+IPP\b|\s*$)", full_text, re.MULTILINE):
        _add_name(m.group(1).strip())

    # "Nom et Prénom: DIEGO PATRICIA"
    for m in re.finditer(r"Nom\s+et\s+Pr[ée]nom\s*:\s*(.+?)(?:\s+Date\s+de\s+naissance|\s*$)", full_text, re.MULTILINE):
        _add_name(m.group(1).strip())

    # "Prénom de naissance" / "Prénom utilisé": REGINA
    for m in re.finditer(r"Pr[ée]nom\s+(?:de\s+naissance|utilis[ée])\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\s\-']+?)(?:\s*$)", full_text, re.MULTILINE):
        _add_name(m.group(1).strip())

    # "Lieu de naissance": BAYONNE, biarritz, 64102, 99999 → mask as VILLE
    for m in re.finditer(r"Lieu\s+de\s+naissance\s*:\s*(\S[^\n]*?)(?:\s*$)", full_text, re.MULTILINE):
        val = m.group(1).strip()
        if val:
            hits.append(PiiHit(-1, "VILLE", val, PLACEHOLDERS["VILLE"]))
            # Add to the names set only if alphabetic (skip numeric INSEE codes)
            if re.match(r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç]", val):
                names.add(val)

    # "Ville de résidence": TARNOS → mask as VILLE
    for m in re.finditer(r"Ville\s+de\s+r[ée]sidence\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû\s\-']+?)(?:\s*$)", full_text, re.MULTILINE):
        val = m.group(1).strip()
        hits.append(PiiHit(-1, "VILLE", val, PLACEHOLDERS["VILLE"]))
        names.add(val)

    # Postal code (all occurrences)
    for m in re.finditer(r"[Cc]ode\s*[Pp]ostal\s*:\s*(\d{5})", full_text):
        hits.append(PiiHit(-1, "CODE_POSTAL", m.group(1), PLACEHOLDERS["CODE_POSTAL"]))

    # Episode number (= NDA, stay identifier)
    for m in re.finditer(r"Episode\s*N[o°.]?\s*\.?\s*:\s*(\d{5,})", full_text):
        hits.append(PiiHit(-1, "EPISODE", m.group(1), PLACEHOLDERS.get("NDA", "[NDA]")))

    # Standalone RPPS numbers (11 digits starting with 1 or 2, alone on a line)
    for m in re.finditer(r"^\s*([12]\d{10})\s*$", full_text, re.MULTILINE):
        hits.append(PiiHit(-1, "RPPS", m.group(1), PLACEHOLDERS["RPPS"]))

    # Patient address (all occurrences)
    for m in re.finditer(r"Adresse\s*:\s*(.+?)(?:\s+Ville\s+de\s+r[ée]sidence|\s*$)", full_text, re.MULTILINE):
        val = m.group(1).strip()
        if len(val) > 3:
            hits.append(PiiHit(-1, "ADRESSE", val, PLACEHOLDERS["ADRESSE"]))

    # --- Footer: "Patient : NOM PRENOM - Date de naissance..." ---
    for m in re.finditer(r"Patient\s*:\s*(.+?)\s*-\s*Date\s+de\s+naissance", full_text):
        _add_name(m.group(1).strip())

    # --- Current physician (all occurrences) ---
    for m in re.finditer(r"Médecin\s+courant\s*:\s*(?:DR\.?\s*)?(.+?)(?:\s*$)", full_text, re.MULTILINE):
        _add_name(m.group(1).strip())

    # --- Attending physician (line after "Nom Adresse Téléphone") ---
    for m in re.finditer(r"Médecin\s+traitant\s*\n.*?Nom\s+Adresse\s+Téléphone\s*\n\s*(?:DR\.?\s*)?(.+?)(?:\d{5}|\s*$)", full_text, re.MULTILINE):
        _add_name(m.group(1).strip())

    # --- Structured contacts ---
    # Pattern: Relation NOM PRENOM [ADDRESS] [TEL]
    # Accepts lowercase (Trackare sometimes writes "Conjoint vandestock michele")
    # Captures up to 3 tokens for compound names (le moigne christophe)
    # Includes "Personne à prévenir" + relations + Ami/Voisin/Autre
    for m in re.finditer(
        r"(?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur|Ami|Amie|Voisin|Voisine|Autre)\s+"
        r"([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+)"
        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?"
        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?",
        full_text,
    ):
        contact_parts = [g.strip(" .-'(),") for g in (m.group(1), m.group(2), m.group(3)) if g]
        # Add each token >= 4 chars (skip short articles like "le", "di", and 3-letter acronyms)
        for tok in contact_parts:
            if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
                names.add(tok)
                if tok[0].islower():
                    names.add(tok.capitalize())
        # Also add the full compound (for "le moigne", "di lullo")
        if len(contact_parts) >= 2:
            compound = " ".join(contact_parts)
            if len(compound) >= 5:
                names.add(compound)
                # Capitalized version for propagation
                names.add(" ".join(t.capitalize() for t in compound.split()))

    # --- "Personne à prévenir": structured multi-line block ---
    # Trackare format: "Personne à prévenir\nRelation\nNOM\nPrenom" or
    # "Personne à prévenir\nRelation NOM Prenom\nAdresse..."
    for m in re.finditer(
        r"[Pp]ersonne\s+[àa]\s+pr[ée]venir\s*[:\-]?\s*\n"
        r"(?:[^\n]{0,30}\n){0,2}"  # 0-2 intermediate lines (relation, etc.)
        r"\s*([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+)"
        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?"
        r"(?:\s*\n\s*([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?",
        full_text,
    ):
        for g in (m.group(1), m.group(2), m.group(3)):
            if g:
                tok = g.strip(" .-'(),")
                if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
                    names.add(tok)
                    if tok[0].islower():
                        names.add(tok.capitalize())

    # --- Prescribers / performers (Trackare) ---
    for m in re.finditer(
        r"(?:Prescripteur|Prescrit\s+par|Exécut[ée]\s+par|Réalisé\s+par)\s*:?\s*"
        r"(?:(?:Dr|Pr)\.?\s+)?"
        r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+))?",
        full_text,
    ):
        _add_name(m.group(1))
        if m.group(2):
            _add_name(m.group(2))

    # --- ER physicians (IAO, care, medical decision) ---
    for m in re.finditer(r"IAO\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)", full_text):
        _add_name(m.group(1))
    for m in re.finditer(
        r"Médecin\s+de\s+la\s+(?:prise\s+en\s+charge|décision)\s+médicale\s+"
        r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)"
        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+))?",
        full_text,
    ):
        _add_name(m.group(1))
        if m.group(2):
            _add_name(m.group(2))

    # --- Caregiver names in progress / IDE / medical notes ---
    # Pattern: "Note IDE\nPrenom NOM" or "Note d'évolution\nPrenom NOM"
    for m in re.finditer(
        r"Note\s+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])\s*\n\s*"
        r"([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][a-zéèàùâêîôûäëïöüç]+)\s+"
        r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)",
        full_text
    ):
        prenom, nom = m.group(1), m.group(2)
        if prenom.lower() not in _MEDICAL_STOP_WORDS_SET:
            _add_name(prenom)
        if nom.lower() not in _MEDICAL_STOP_WORDS_SET:
            _add_name(nom)

    # --- Multi-line caregiver names: "Prénom\nNOM" in prescription/care tables ---
    for m in re.finditer(
        r'\b([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,})\s*\n\s*([A-ZÉÈÀÙÂÊÎÔÛ]{4,})\b',
        full_text
    ):
        prenom, nom = m.group(1), m.group(2)
        if prenom.lower() not in _MEDICAL_STOP_WORDS_SET and nom.lower() not in _MEDICAL_STOP_WORDS_SET:
            _add_name(prenom)
            _add_name(nom)

    # --- Caregiver names on the same line as "Note d'évolution" (e.g. "Note d'évolution LACLAU-") ---
    for m in re.finditer(
        r"Note[ \t]+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])[ \t]+"
        r"(?:DR\.?[ \t]+)?"
        r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
        r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
        full_text
    ):
        for g in (m.group(1), m.group(2)):
            if g:
                tok = g.rstrip('-')
                if len(tok) >= 4:
                    _add_name_force(tok)

    # --- "Signé" directly followed by a caregiver name (e.g. "Signé LARRIEU-") ---
    # IMPORTANT: [ \t]+ (not \s+) to avoid capturing medications on the next line
    for m in re.finditer(
        r"Signé[ \t]+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
        r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
        full_text
    ):
        for g in (m.group(1), m.group(2)):
            if g:
                tok = g.rstrip('-')
                if len(tok) >= 4:
                    _add_name_force(tok)

    # --- "Signé —" + medication + caregiver name (e.g. "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") ---
    for m in re.finditer(
        r"Signé[ \t]+—[ \t]+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)[ \t]+[-]?[ \t]*"
        r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{3,})"
        r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,}))?",
        full_text
    ):
        for g in (m.group(1), m.group(2)):
            if g:
                tok = g.rstrip('-')
                if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
                    _add_name(tok)

    # --- Caregiver names after a drug-packaging word (e.g. "Flacon(s) LACROUTS") ---
    for m in re.finditer(
        r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?[ \t]+"
        r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{3,})",
        full_text
    ):
        tok = m.group(1).rstrip('-')
        if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
            _add_name(tok)

    # --- "DR." / "DR" followed by a lone first name (e.g. "DR. Ute", "DR. Tam") in prescriptions ---
    for m in re.finditer(
        r"DR\.?[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,})"
        r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
        full_text
    ):
        for g in (m.group(1), m.group(2)):
            if g:
                tok = g.strip()
                if len(tok) >= 4:
                    _add_name_force(tok)

    # --- Caregiver names after timestamps in care activities (e.g. "07:00 ETCHEBARNE") ---
    # Trackare format: care actions followed by "HH:MM NOM" or "HH : MM NOM"
    # Restrictive pattern: ALL-CAPS name of 4+ letters, stop words filtered (noisy pattern)
    for m in re.finditer(
        r"\d{1,2}[ \t]*:[ \t]*\d{2}[ \t]+"
        r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})"
        r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,}))?",
        full_text
    ):
        for g in (m.group(1), m.group(2)):
            if g:
                tok = g.rstrip('-')
                if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
                    _add_name(tok)

    # Filter out too-short tokens and stop words.
    # Exceptions: force_names (structured contexts) and city_tokens (extracted cities)
    city_tokens = {h.original for h in hits if h.kind == "VILLE"}
    filtered = set()
    for tok in names:
        if tok in city_tokens or tok in force_names:
            filtered.add(tok)
            continue
        if len(tok) < 4:
            continue
        if tok.lower() in _MEDICAL_STOP_WORDS_SET:
            continue
        filtered.add(tok)

    return filtered, hits, force_names
|
||
|
||
|
||
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, set]:
    """Pre-scan the raw document text for person names in structured fields
    (Patient, Rédigé par, etc.).

    Returns ``(names, force_names)``: the set of tokens to mask, and the
    subset that bypasses the stop-word filter.
    """
    wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
    wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])
    names: set = set()
    force_names: set = set()

    def _add_compound(match_str: str) -> None:
        """Add the full compound name in addition to individual tokens (DI LULLO, LE MOIGNE)."""
        parts = [t.strip(" .-'") for t in match_str.split() if len(t.strip(" .-'")) >= 2]
        if len(parts) >= 2:
            compound = " ".join(parts)
            if len(compound) >= 5:
                names.add(compound)

    def _add_tokens(match_str: str) -> None:
        # Standard path: whitelist + stop-word filtering applies.
        _add_compound(match_str)
        for token in match_str.split():
            token = token.strip(" .-'")
            if len(token) < 4:
                continue
            if token.upper() in wl_sections or token in wl_phrases:
                continue
            if token.lower() in _MEDICAL_STOP_WORDS_SET:
                continue
            names.add(token)

    def _add_tokens_force_all(match_str: str) -> None:
        """Bypass stop words for ALL tokens (very reliable 'Patient:' context)."""
        _add_compound(match_str)
        for token in match_str.split():
            token = token.strip(" .-'")
            if len(token) < 4:
                continue
            if token.upper() in wl_sections or token in wl_phrases:
                continue
            names.add(token)
            force_names.add(token)

    def _add_tokens_force_first(match_str) -> None:
        """Like _add_tokens but forces ALL tokens (strong Dr/Mme context).

        After Dr/Mme, every token is a name — even homonyms of medical
        terms (e.g. Dr Laurence MASSE).
        """
        _add_compound(match_str)
        tokens = match_str.split()
        for token in tokens:
            token = token.strip(" .-'")
            if len(token) < 4:
                continue
            if token.upper() in wl_sections or token in wl_phrases:
                continue
            names.add(token)
            force_names.add(token)

    for m in RE_EXTRACT_PATIENT.finditer(full_text):
        _add_tokens_force_all(m.group(1))
    for m in RE_EXTRACT_REDIGE.finditer(full_text):
        _add_tokens(m.group(1))
    for m in RE_EXTRACT_MME_MR.finditer(full_text):
        _add_tokens_force_first(m.group(1))
    for m in RE_EXTRACT_DR_DEST.finditer(full_text):
        _add_tokens_force_first(m.group(1))
    # Structured identity fields (Trackare / DPI)
    for m in RE_EXTRACT_NOM_NAISSANCE.finditer(full_text):
        _add_tokens_force_all(m.group(1))
    for m in RE_EXTRACT_NOM_UTILISE.finditer(full_text):
        _add_tokens_force_all(m.group(1))
    for m in RE_EXTRACT_NOM_PRENOM.finditer(full_text):
        _add_tokens_force_all(m.group(1))
    for m in RE_EXTRACT_PRENOM.finditer(full_text):
        _add_tokens_force_all(m.group(1))
    for m in RE_EXTRACT_LIEU_NAISSANCE.finditer(full_text):
        _add_tokens(m.group(1))
    for m in RE_EXTRACT_VILLE_RESIDENCE.finditer(full_text):
        _add_tokens(m.group(1))
    # Structured contacts (spouse, partner, etc.)
    for m in RE_EXTRACT_CONTACT.finditer(full_text):
        _add_tokens(m.group(1))
        if m.group(2):
            _add_tokens(m.group(2))
    # Medical staff with a role (Aide, Cadre Infirmier, Prescripteur, etc.)
    for m in RE_EXTRACT_STAFF_ROLE.finditer(full_text):
        _add_tokens(m.group(1))
    # Pr / Professeur + name(s)
    for m in RE_EXTRACT_PR.finditer(full_text):
        _add_tokens_force_first(m.group(1))
    # Operator / anesthetist / surgeon + name(s)
    for m in RE_EXTRACT_OPERATEUR.finditer(full_text):
        _add_tokens_force_first(m.group(1))
    # Practice name (e.g. "CABINET ETXEBARNONDOA")
    for m in RE_EXTRACT_CABINET.finditer(full_text):
        _add_tokens(m.group(1))
    # "Courrier Epi - NOM, PRENOM" header (discharge letters)
    for m in RE_EXTRACT_COURRIER.finditer(full_text):
        # "NOM, PRENOM" format: each part is a name token
        for part in m.group(1).split(","):
            part = part.strip()
            if part:
                _add_tokens_force_all(part)

    # Names in comma-separated lists after Dr/Docteur or Mmes/Mme
    # e.g. "le Dr DUVAL, MACHELART, CHARLANNE, LAZARO, il a été proposé"
    # e.g. "Mmes Anorga, Goyenaga, Martinez et Murcy"
    for m_comma in RE_DR_COMMA_LIST.finditer(full_text):
        fragment = m_comma.group(0)
        parts = [p.strip() for p in fragment.split(",")]
        for part in parts:
            for tok in _NAME_TOKEN_RE.findall(part):
                tok = tok.strip(" .-'")
                if len(tok) < 4:
                    continue
                if tok.upper() in wl_sections or tok in wl_phrases:
                    continue
                if tok.lower() in _MEDICAL_STOP_WORDS_SET:
                    continue
                names.add(tok)
    _CIVILITE_PREFIX_RE = re.compile(
        r"^\s*(?:Mmes|Mme|Mesdames|Mrs|Mr\.?|Messieurs|Monsieur|Madame|Dr\.?|Docteur)\s+",
        re.IGNORECASE,
    )
    for m_comma in RE_CIVILITE_COMMA_LIST.finditer(full_text):
        fragment = m_comma.group(0)
        # Strip the civility prefix (Mmes, Dr, etc.) from the first part
        fragment = _CIVILITE_PREFIX_RE.sub("", fragment)
        parts = [p.strip() for p in fragment.split(",")]
        for part in parts:
            # Drop a leading "et " from each token
            part = re.sub(r"^\s*et\s+", "", part)
            for tok in _NAME_TOKEN_RE.findall(part):
                tok = tok.strip(" .-'")
                if len(tok) < 4:
                    continue
                if tok.upper() in wl_sections or tok in wl_phrases:
                    continue
                if tok.lower() in _MEDICAL_STOP_WORDS_SET:
                    continue
                names.add(tok)

    # Dr X.NOM / Pr X.NOM: initial glued to the surname (e.g. "Dr E.ELLIE", "Pr J.DUPONT")
    _RE_DR_INITIAL_DOT_NAME = re.compile(
        r"\b(?:Dr\.?|Docteur|Pr\.?|Professeur)[ \t]+"
        r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])\.([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]{2,})"
    )
    for m in _RE_DR_INITIAL_DOT_NAME.finditer(full_text):
        names.add(m.group(2))  # the surname (ELLIE)
        force_names.add(m.group(2))
        # Also add the full "X.NOM" glued token for the raster pass
        names.add(f"{m.group(1)}.{m.group(2)}")
        force_names.add(f"{m.group(1)}.{m.group(2)}")

    # Email headers: "De : Prénom NOM <email>", "À : Prénom NOM <email>"
    # Handles \xa0 (non-breaking space) and a missing space after ":"
    _RE_EMAIL_HEADER = re.compile(
        r"(?:De|From|À|A|To|Cc|Cci|Bcc)[\s\xa0]*:[\s\xa0]*"
        r"([A-ZÀ-ÖØ-Ý][a-zà-öø-ÿ]+(?:[\s\xa0]+[A-ZÀ-ÖØ-Ý][A-ZÀ-ÖØ-Ýa-zà-öø-ÿ\-]+)+)"
        r"[\s\xa0]*<[^>]+>",
        re.MULTILINE
    )
    for m in _RE_EMAIL_HEADER.finditer(full_text):
        _add_tokens_force_all(m.group(1))

    # For hyphenated compound names (e.g. "LACLAU-LACROUTS"), also add the
    # individual parts to catch standalone occurrences.
    # _apply_extracted_names handles the compound first (longer), then the parts.
    # The parts are forced (bypass stop words) because the compound itself is a
    # confirmed name — e.g. "BILLON-GRAND" → "GRAND" must be masked even though
    # "grand" is a common word, since it is a component of a detected person name.
    compound_names = {n for n in names if "-" in n}
    for compound in compound_names:
        for part in compound.split("-"):
            part = part.strip()
            if len(part) >= 3:
                names.add(part)
                force_names.add(part)

    return names, force_names
|
||
|
||
|
||
def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_names: Optional[set] = None) -> str:
    """Globally replace each extracted name in *text* with the [NOM] placeholder.

    Names shorter than 4 characters or present in the medical stop-word set
    are skipped, unless listed in ``force_names`` (which bypasses the
    stop-word filter — e.g. parts of a confirmed compound surname).
    One global audit hit (page=-1, kind "NOM_GLOBAL") is recorded per kept
    name so the raster PDF redaction can look the token up on every page.
    Returns the masked text; ``audit`` is extended in place.
    """
    placeholder = PLACEHOLDERS["NOM"]
    _force = force_names or set()
    safe_names = {n for n in names if len(n) >= 4 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)}
    # Add one global hit (page=-1) per name for the raster PDF redaction
    # (a single hit is enough — redact_pdf_raster searches the token on every page)
    for token in sorted(safe_names, key=len, reverse=True):
        audit.append(PiiHit(-1, "NOM_GLOBAL", token, placeholder))
    # Longest names first so compounds are replaced before their parts
    for token in sorted(safe_names, key=len, reverse=True):
        pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
        new_text = []
        last_end = 0
        for m in pattern.finditer(text):
            # Do not replace if the match is already inside a placeholder
            ctx_start = max(0, m.start() - 1)
            ctx_end = min(len(text), m.end() + 1)
            if "[" in text[ctx_start:m.start()] or "]" in text[m.end():ctx_end]:
                continue
            # Do not replace if the token is part of a hyphenated compound (hyphen + letter)
            # Ex: "NOCENT-EJNAINI" → do not replace NOCENT alone
            # But "LACLAU-" (truncation hyphen) → replace
            # Also handles the cross-line case: "BILLON-\nGRAND" (intact name)
            # but not "[NOM]-\nGRAND" (already partially masked → replace)
            if m.start() > 0 and text[m.start() - 1] == "-":
                if m.start() >= 2 and text[m.start() - 2].isalpha():
                    continue
            # Cross-line: "\n" right before the match, hyphen before the "\n",
            # letter before the hyphen
            if m.start() > 1 and text[m.start() - 1] == "\n" and text[m.start() - 2] == "-":
                pre_pos = m.start() - 3
                if pre_pos >= 0 and text[pre_pos].isalpha():
                    continue  # Intact compound (BILLON-\nGRAND) → skip
                # If the hyphen follows a placeholder ([NOM]-\nGRAND) → replace
            if m.end() < len(text) and text[m.end()] == "-":
                if m.end() + 1 < len(text) and text[m.end() + 1].isalpha():
                    continue
            # DISABLED: NOM_EXTRACTED generated 3,846 FP (77.7% of the total) with 0 TP
            # This per-occurrence audit entry was too aggressive and created massive false positives
            # audit.append(PiiHit(-1, "NOM_EXTRACTED", m.group(0), placeholder))
            new_text.append(text[last_end:m.start()])
            new_text.append(placeholder)
            last_end = m.end()
        new_text.append(text[last_end:])
        text = "".join(new_text)
    return text
|
||
|
||
|
||
def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
|
||
"""Applique les PiiHit non-NOM dans le texte (NDA footers, EPISODE, RPPS, FINESS, etc.).
|
||
Ces hits sont détectés par _extract_trackare_identity ou la phase 0c
|
||
mais n'étaient appliqués qu'au PDF raster, pas au fichier .pseudonymise.txt."""
|
||
_APPLY_KINDS = {"EPISODE", "RPPS", "FINESS"}
|
||
# Collecter les valeurs à remplacer, groupées par placeholder
|
||
replacements: Dict[str, str] = {} # original → placeholder
|
||
for h in audit:
|
||
if h.kind in _APPLY_KINDS and h.original and len(h.original.strip()) >= 4:
|
||
replacements[h.original.strip()] = h.placeholder
|
||
# Remplacer les plus longs d'abord (éviter les remplacements partiels)
|
||
for original in sorted(replacements, key=len, reverse=True):
|
||
placeholder = replacements[original]
|
||
escaped = re.escape(original)
|
||
# Word boundary pour ne pas casser les mots (ex: ONDANSETRON)
|
||
text = re.sub(rf"\b{escaped}\b", placeholder, text)
|
||
# Aussi gérer les formats avec astérisques (*640000162*)
|
||
text = re.sub(rf"\*{escaped}\*", placeholder, text)
|
||
return text
|
||
|
||
|
||
# ----------------- Anonymisation (regex) -----------------
|
||
|
||
def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any]) -> AnonResult:
    """Rule-based (regex) anonymisation pass over extracted pages and tables.

    Phases:
      0   — global name extraction from structured fields
      0b  — Trackare-specific reinforced identity extraction (when detected)
      0c..0g — multiline patterns applied on the whole raw text (URL split
              across lines, FINESS, birth date, IPP, request number, visit
              number, including the reversed label/value layout)
      1   — line-by-line key:value masking (classic regexes)
      2/2b — global application of extracted names, then of structured hits

    Returns an AnonResult (masked text, tables block, audit trail, trackare flag).
    """
    audit: List[PiiHit] = []

    # Phase 0: global extraction of names from structured fields
    full_raw = "\n".join(pages_text) + "\n" + "\n".join(
        "\n".join(rows) for rows in tables_lines
    )
    extracted_names, doc_force_names = _extract_document_names(full_raw, cfg)

    # Phase 0b: for Trackare documents, reinforced extraction of structured PII
    is_trackare = _is_trackare_document(full_raw)
    trackare_force_names: set = set()
    if is_trackare:
        trackare_names, trackare_hits, trackare_force_names = _extract_trackare_identity(full_raw)
        extracted_names.update(trackare_names)
        audit.extend(trackare_hits)
    # Merge force-names coming from both sources
    all_force_names = doc_force_names | trackare_force_names

    # Phase 0c-url: detect and mask URLs (including ones split across a line break)
    # Ex: "https://courrier\n.avenir-numerique.fr/owa/#path=/mail/inbox"
    _RE_URL_MULTILINE = re.compile(
        r"(https?://\S+)\n(\.[a-zA-Z0-9\-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?)",
        re.MULTILINE
    )
    for m in _RE_URL_MULTILINE.finditer(full_raw):
        full_url = m.group(1) + m.group(2)
        audit.append(PiiHit(-1, "URL", full_url, PLACEHOLDERS["MASK"]))
        # Also mask both halves separately (for line-by-line matching)
        audit.append(PiiHit(-1, "URL", m.group(1), PLACEHOLDERS["MASK"]))
        audit.append(PiiHit(-1, "URL", m.group(2).lstrip("."), PLACEHOLDERS["MASK"]))

    # Phase 0c: multiline FINESS detection (label and number on separate lines,
    # with possibly 0-2 masked or empty lines in between)
    _RE_FINESS_MULTILINE = re.compile(
        r"(?:N°\s*)?[Ff]iness?\s*\n(?:[^\n]*\n){0,2}\s*\*?(\d{9})\*?", re.MULTILINE
    )
    for m in _RE_FINESS_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "FINESS", m.group(1), PLACEHOLDERS["FINESS"]))

    # Phase 0d: multiline birth date (label and date on separate lines)
    # Ex: "Né(e) le :\n07/04/1943" or "Date de naissance\n01/02/1950"
    # Broad variant: tolerates 0-3 intermediate lines (BACTERIO tables)
    _RE_DATE_NAISSANCE_MULTILINE = re.compile(
        r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n"
        r"(?:[^\n]*\n){0,3}\s*"
        r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})",
        re.IGNORECASE,
    )
    for m in _RE_DATE_NAISSANCE_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "DATE_NAISSANCE", m.group(1), PLACEHOLDERS["DATE_NAISSANCE"]))

    # Phase 0e: multiline IPP (N°Ipp :\n20023294 or I.P.P. :\nS1032021)
    _RE_IPP_MULTILINE = re.compile(
        r"(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*\n\s*([A-Za-z0-9]{6,})\b",
        re.IGNORECASE,
    )
    for m in _RE_IPP_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))

    # Phase 0f: multiline DEMANDE N° (DEMANDE N°\n2300261164)
    _RE_DEMANDE_MULTILINE = re.compile(
        r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
        re.IGNORECASE,
    )
    for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))

    # Phase 0g: multiline "N° venue" (BACTERIO tables: label and value split apart)
    _RE_VENUE_MULTILINE = re.compile(
        r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})",
        re.IGNORECASE,
    )
    for m in _RE_VENUE_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))
    # Phase 0g-bis: reversed "N° venue" (layout-aware reordering: number BEFORE label)
    _RE_VENUE_REVERSE = re.compile(
        r"(?<!\d)(\d{7,10})(?!\d)\s*\n(?:[^\n]*\n){0,4}N[°o]?\s*venue\s*[:\-]?\s*$",
        re.IGNORECASE | re.MULTILINE,
    )
    for m in _RE_VENUE_REVERSE.finditer(full_raw):
        audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))

    # Phase 1: line-by-line masking (classic key:value regexes)
    out_pages: List[str] = []
    for i, page_txt in enumerate(pages_text):
        lines = [ln for ln in (page_txt or "").splitlines()]
        masked = [_kv_value_only_mask(ln, audit, i, cfg) for ln in lines]
        out_pages.append("\n".join(masked))
    table_blocks: List[str] = []
    for i, rows in enumerate(tables_lines):
        mbuf: List[str] = []
        for r in rows:
            masked = _kv_value_only_mask(r, audit, i, cfg)
            mbuf.append(masked)
        if mbuf:
            table_blocks.append("\n".join(mbuf))
    tables_block = "\n\n".join(table_blocks)
    text_out = "\f".join(out_pages)  # page separator
    # NOTE: the [TABLES] block is deliberately NOT appended to text_out.
    # pdfplumber often extracts the main content as a "table", creating a full
    # duplicate of the text. That duplicate escaped both NER and the rescan
    # (protected by the [TABLES] markers), and the EDS-pseudo NER corrupted the
    # markers by changing the text length → massive PII leak (birth dates,
    # addresses, names). PII found inside tables is still recorded in the
    # audit (Phase 1 regex).

    # Phase 2: global application of the extracted names (catch-up pass)
    if extracted_names:
        text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=all_force_names)

    # Phase 2b: global application of structured PiiHits (EPISODE, RPPS, FINESS)
    text_out = _apply_trackare_hits_to_text(text_out, audit)

    return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit, is_trackare=is_trackare)
|
||
|
||
# ----------------- NER ONNX sur narratif -----------------
|
||
|
||
def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
    """Mask HF-NER entities in *text* via whole-word regex substitution.

    PER/PERSON → [NOM]; ORG → establishment placeholder; LOC → city
    placeholder. ORG and LOC are kept untouched when the ``org_gpe_keep``
    whitelist flag is set. DATE and any unknown group are left alone
    (dates are assumed handled by the rule engine). Each replacement is
    recorded as a page=-1 hit in ``audit``.
    """
    whitelist_cfg = cfg.get("whitelist", {}) or {}
    keep_org_gpe = bool(whitelist_cfg.get("org_gpe_keep", False))

    out = text
    for ent in ents:
        word = ent.get("word") or ""
        group = (ent.get("entity_group") or ent.get("entity") or "").upper()
        # Skip empties, placeholder fragments and very short tokens
        if not word or "[" in word or "]" in word or len(word) <= 2:
            continue
        if group in {"PER", "PERSON"}:
            kind, placeholder = "NER_PER", PLACEHOLDERS["NOM"]
        elif group == "ORG":
            if keep_org_gpe:
                continue
            kind, placeholder = "NER_ORG", PLACEHOLDERS["ETAB"]
        elif group == "LOC":
            if keep_org_gpe:
                continue
            kind, placeholder = "NER_LOC", PLACEHOLDERS["VILLE"]
        else:
            # DATE (already covered by the rules) and any other group: no-op
            continue
        audit.append(PiiHit(-1, kind, word, placeholder))
        out = re.sub(rf"\b{re.escape(word)}\b", placeholder, out)
    return out
|
||
|
||
|
||
def apply_hf_ner_on_narrative(text_out: str, cfg: Dict[str, Any], manager: Optional[NerModelManager], thresholds: Optional[NerThresholds]) -> Tuple[str, List[PiiHit]]:
    """Run the optional HF/ONNX NER pass on the narrative part of *text_out*.

    [TABLES]...[/TABLES] spans are replaced by NUL padding of identical
    length before inference, then spliced back at the recorded offsets
    afterwards. NOTE(review): that splice is only exact while nothing before
    a table span changes length — masking that shortens/lengthens earlier
    text shifts the recorded offsets.

    Returns (masked_text, ner_hits); returns the input unchanged with an
    empty hit list when no NER model is loaded.

    Fixes vs previous revision: removed two dead locals (`tables`, which was
    filled nowhere, and `idx`, which was never read).
    """
    if manager is None or not manager.is_loaded():
        return text_out, []
    # Isolate [TABLES] spans: record (start, end, payload), NUL-pad in place
    pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
    keep = []
    last = 0
    cleaned = ""
    for m in pattern.finditer(text_out):
        cleaned += text_out[last:m.start()]
        keep.append((len(cleaned), len(cleaned) + len(m.group(0)), m.group(0)))
        cleaned += "\x00" * len(m.group(0))
        last = m.end()
    cleaned += text_out[last:]

    # Per page (separated by \f) → per paragraph
    pages = cleaned.split("\f")
    hits: List[PiiHit] = []
    rebuilt_pages: List[str] = []
    for pg in pages:
        paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
        ents_per_para = manager.infer_paragraphs(paras, thresholds=thresholds)
        # Mask detected entities paragraph by paragraph
        buf = []
        for para, ents in zip(paras, ents_per_para):
            masked = _mask_with_hf(para, ents, cfg, hits)
            buf.append(masked)
        rebuilt_pages.append("\n\n".join(buf))
    rebuilt = "\f".join(rebuilt_pages)

    # Splice the [TABLES] payloads back at their recorded offsets
    rebuilt_list = list(rebuilt)
    for start, end, payload in keep:
        rebuilt_list[start:end] = list(payload)
    final = "".join(rebuilt_list)
    return final, hits
|
||
|
||
# ----------------- NER EDS-Pseudo sur narratif -----------------
|
||
|
||
def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
    """Mask entities detected by EDS-Pseudo, using the eds_mapped_key mapping.

    Before masking NOM/PRENOM/HOPITAL/VILLE entities, several false-positive
    filters are applied: medical stop words, dosage-like tokens, the known
    medication whitelist, NER-confidence combined with GLiNER / CamemBERT-bio
    cross votes (INSEE first-name gazetteer as a confidence booster), and
    Philter-style same-line contextual safe patterns.
    Each masked entity is appended to ``audit`` as an "EDS_<label>" hit.
    """
    def repl_once(s: str, old: str, new: str) -> str:
        # NOTE(review): despite the name, this replaces EVERY whole-word occurrence.
        return re.sub(rf"\b{re.escape(old)}\b", new, s)
    out = text
    for e in ents:
        w = e.get("word") or ""
        mapped_key = e.get("eds_mapped_key", "")
        if not w or "[" in w or "]" in w:
            continue
        if len(w) <= 2:
            continue
        # Filter NOM/PRENOM false positives (medications, medical acronyms)
        label = e.get("entity_group", "EDS")
        if label in ("NOM", "PRENOM", "HOPITAL", "VILLE"):
            if w.lower() in _MEDICAL_STOP_WORDS_SET:
                continue
            # Also filter multi-word tokens where any component is a stop word
            if " " in w and any(part.lower() in _MEDICAL_STOP_WORDS_SET for part in w.split()):
                continue
            # Filter dosages detected as names (e.g. "10MG", "300UI", "1 000")
            if re.match(r"^\d[\d\s]*(?:mg|MG|ml|ML|UI|µg|mcg|g|kg|%)?$", w.strip()):
                continue
            # CORRECTION 1.2: filter medications detected as NOM/PRENOM
            if label in ("NOM", "PRENOM"):
                # Check whether it is a known medication
                if w.lower() in _MEDICATION_WHITELIST:
                    continue
            # Chantiers 3+4+5: NER confidence + GLiNER / CamemBERT-bio cross
            # votes + INSEE gazetteers. Safety first: high NER confidence →
            # always mask; GLiNER/CamemBERT may reject ONLY at low confidence.
            gliner_vote = e.get("gliner_confirmed")  # True=PII, False=medical, None=neutral
            camembert_vote = e.get("camembert_confirmed")  # True=confirmed PII, False=not detected, None=neutral
            if label in ("NOM", "PRENOM"):
                score = e.get("score", 1.0)
                # INSEE gazetteer: known first name = confidence boost (never filter it)
                is_known_prenom = w.lower() in _INSEE_PRENOMS
                if isinstance(score, float) and score < 0.70 and not is_known_prenom:
                    # Low NER confidence + not a known first name
                    if gliner_vote is False and camembert_vote is not True:
                        continue  # GLiNER says "medical" + CamemBERT does not confirm → skip
                    if score < 0.30 and camembert_vote is not True:
                        continue  # Very low confidence + CamemBERT does not confirm → skip
                # Chantier 2: contextual safe patterns (Philter-style).
                # A token next to a dosage or pharma form is never a person name.
                # NOTE(review): text.find(w) only inspects the FIRST occurrence
                # of w in this paragraph — later occurrences share its verdict.
                pos = text.find(w)
                if pos >= 0:
                    # SAME-LINE context only ([ \t], never across \n)
                    line_start = text.rfind('\n', 0, pos)
                    line_start = 0 if line_start < 0 else line_start + 1
                    line_end = text.find('\n', pos + len(w))
                    line_end = len(text) if line_end < 0 else line_end
                    ctx_before = text[max(line_start, pos - 30):pos]
                    ctx_after = text[pos + len(w):min(line_end, pos + len(w) + 30)]
                    # Safe pattern: preceded or followed by a dosage (mg, mL, UI, etc.)
                    _RE_DOSAGE = r"\d+[ \t]*(?:mg|ml|ui|µg|mcg|g|kg|cp|cpr|gel|amp|fl|dos|inh)\b"
                    if re.search(_RE_DOSAGE, ctx_before, re.IGNORECASE):
                        continue
                    if re.search(_RE_DOSAGE, ctx_after, re.IGNORECASE):
                        continue
                    # Safe pattern: followed by a pharmaceutical form
                    _RE_PHARMA_FORM = r"^\s*(?:comprim[ée]s?|g[ée]lules?|sachets?|ampoules?|flacons?|solutions?|injectable|suppo(?:sitoire)?s?|sirop|pommade|cr[eè]me|gouttes?|patch|inhal)"
                    if re.search(_RE_PHARMA_FORM, ctx_after, re.IGNORECASE):
                        continue
                    # Safe pattern: preceded by "taux de", "score de", "dosage de", ...
                    if re.search(r"(?:taux|score|dosage|indice|index|grade|stade|type)\s+(?:de\s+)?$", ctx_before, re.IGNORECASE):
                        continue
            elif label == "HOPITAL":
                # Structural org words and very short tokens are not hospital names
                _STRUCTURAL_WORDS = {"SERVICE", "POLE", "PÔLE", "UNITE", "UNITÉ", "SECTEUR"}
                if len(w) < 5:
                    continue
                if w.upper() in _STRUCTURAL_WORDS:
                    continue
        placeholder = PLACEHOLDERS.get(mapped_key, PLACEHOLDERS["MASK"])
        audit.append(PiiHit(-1, f"EDS_{label}", w, placeholder))
        out = repl_once(out, w, placeholder)
    return out
|
||
|
||
|
||
def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager",
                                  gliner_mgr: Any = None,
                                  camembert_mgr: Any = None) -> Tuple[str, List[PiiHit]]:
    """Apply EDS-Pseudo on the narrative, with optional cross-validation.

    [TABLES]...[/TABLES] spans are NUL-padded (same length) before inference
    and spliced back at the recorded offsets afterwards. NOTE(review): the
    splice is only exact while masking does not change the length of text
    preceding a table span. When provided and loaded, ``gliner_mgr`` and
    ``camembert_mgr`` re-score each paragraph's entities before masking.
    Returns (masked_text, hits); the text is returned unchanged with an empty
    hit list when the EDS-Pseudo model is not loaded.
    """
    if manager is None or not manager.is_loaded():
        return text_out, []
    # Isolate [TABLES] spans (record offsets, replace by NUL padding)
    pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
    keep = []
    last = 0
    cleaned = ""
    for m in pattern.finditer(text_out):
        cleaned += text_out[last:m.start()]
        keep.append((len(cleaned), len(cleaned) + len(m.group(0)), m.group(0)))
        cleaned += "\x00" * len(m.group(0))
        last = m.end()
    cleaned += text_out[last:]

    # Per page → per paragraph
    pages = cleaned.split("\f")
    hits: List[PiiHit] = []
    rebuilt_pages: List[str] = []
    for pg in pages:
        paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
        ents_per_para = manager.infer_paragraphs(paras)
        # Chantier 4: GLiNER cross-validation (majority vote)
        if gliner_mgr is not None and hasattr(gliner_mgr, 'validate_entities') and gliner_mgr.is_loaded():
            for i, (para, ents) in enumerate(zip(paras, ents_per_para)):
                ents_per_para[i] = gliner_mgr.validate_entities(para, ents, threshold=0.4)
        # Chantier 5: CamemBERT-bio cross-validation (fine-tuned NER vote)
        if camembert_mgr is not None and hasattr(camembert_mgr, 'validate_eds_entities') and camembert_mgr.is_loaded():
            for i, (para, ents) in enumerate(zip(paras, ents_per_para)):
                ents_per_para[i] = camembert_mgr.validate_eds_entities(para, ents, threshold=0.3)
        buf = []
        for para, ents in zip(paras, ents_per_para):
            masked = _mask_with_eds_pseudo(para, ents, cfg, hits)
            buf.append(masked)
        rebuilt_pages.append("\n\n".join(buf))
    rebuilt = "\f".join(rebuilt_pages)

    # Splice the [TABLES] payloads back at their recorded offsets
    rebuilt_list = list(rebuilt)
    for start, end, payload in keep:
        rebuilt_list[start:end] = list(payload)
    final = "".join(rebuilt_list)
    return final, hits
|
||
|
||
# ----------------- FINESS Aho-Corasick establishment matching -----------------
|
||
|
||
def _build_finess_ac():
    """Build the FINESS establishment Aho-Corasick automaton (lazy, first use).

    Loads distinctive establishment names from
    data/finess/etablissements_distinctifs.txt, filters out generic words,
    generic phrases and likely person names, then stores the automaton in the
    module-global _FINESS_AC. Silently no-ops when pyahocorasick or the data
    file is unavailable; build errors are logged, never raised.
    """
    global _FINESS_AC
    if not _AHO_AVAILABLE:
        return
    data_dir = Path(__file__).parent / "data" / "finess"
    dist_path = data_dir / "etablissements_distinctifs.txt"
    if not dist_path.exists():
        return

    # Generic words that must never be matched on their own
    # (duplicate literals in this set are harmless — set semantics)
    _ac_generic_blacklist = {
        # Establishment types
        "clinique", "pharmacie", "hopital", "centre", "foyer",
        "residence", "maison", "cabinet", "service", "laboratoire",
        "institut", "association", "fondation", "mutuelle", "polyclinique",
        "dispensaire", "hospice", "annexe", "antenne", "site",
        # Common French words that are also establishment names
        "collegiale", "collegial", "cathedral", "cathedrale",
        "providence", "esperance", "renaissance", "liberation",
        "republique", "fraternite", "solidarite", "independance",
        "beauregard", "bellevue", "belvedere",
        "promenade", "esplanade", "corniche", "prefecture",
        "croissant", "confluence", "bienvenue",
        "chartreuse", "commanderie", "chapelle", "basilique",
        "departement", "departementale", "communautaire",
        # Medical specialities / common clinical terms
        "chirurgicale", "radiologie", "addictologie", "prevention",
        "psychotherapique", "ambulatoire", "hospitalisation",
        "consultation", "surveillance", "therapeutique",
        "readaptation", "reeducation", "reanimation",
        "specialisee", "conventionnelle", "professionnelle",
        "informatique", "administrative", "regionale",
        # Common words
        "generation", "revolution", "assomption", "visitation",
        "consolation", "atlantique", "manutention", "prefiguration",
        "intervalle", "pharmaciens", "pharmacien", "transfert",
        "comprimee", "comprimees", "injectable", "injectables",
        "maintenant", "actuellement", "auparavant", "prochainement",
        "rapidement", "correctement", "directement", "simplement",
        "internationale", "international", "intercommunal", "intercommunale",
        # Medical terms homonymous with FINESS establishments (reviewer feedback 2026-03-17)
        "resistance", "radiotherapie", "chimiotherapie", "curietherapie",
        "hormonotherapie", "immunotherapie", "kinesitherapie",
        "ergotherapie", "orthophonie", "psychomotricite",
        "reeducation", "readaptation", "convalescence",
        "dependance", "autonomie", "gerontologie",
    }
    # Multi-word expressions that are too generic to match
    _ac_generic_phrases = {
        "a domicile", "au domicile", "menage a domicile",
        "du nord", "du sud", "de l est", "de l ouest",
        "la maison", "la residence", "les jardins",
        "le village", "le parc", "la colline",
        "au soleil", "en france",
        # Medical expressions homonymous with FINESS establishments (reviewer FP 2026-03-16)
        "long cours", "au long cours",
        "le bourg", "le val", "le clos", "le mas",
        "les pins", "les chenes", "les oliviers",
    }
    try:
        ac = _ahocorasick.Automaton()
        count = 0
        for line in dist_path.read_text(encoding="utf-8").splitlines():
            name = line.strip()
            if not name:
                continue
            # Exclude generic words on their own
            if name in _ac_generic_blacklist:
                continue
            # Exclude generic expressions
            if name in _ac_generic_phrases:
                continue
            words = name.split()
            # Exclude 2-word names whose 1st word is generic AND 2nd word < 5 chars
            if len(words) == 2 and words[0] in _ac_generic_blacklist and len(words[1]) < 5:
                continue
            # Exclude person names (first name + short surname, 2 words)
            # Compound first names "jean ...", "marie ..." are not distinctive establishments
            _PRENOM_PREFIXES = {"jean", "marie", "louis", "pierre", "saint", "sainte"}
            if len(words) == 2 and words[0] in _PRENOM_PREFIXES and len(words[1]) < 10:
                continue
            # Filter: >= 8 chars and >= 2 words, OR >= 10 chars for a single word
            # (short names are handled by RE_HOPITAL_VILLE)
            if len(words) >= 2 and len(name) >= 8:
                # Exclude multi-word names whose EVERY word is a medical stop word
                if all(w in _MEDICAL_STOP_WORDS_SET or len(w) <= 2 for w in words):
                    continue
                ac.add_word(name, name)
                count += 1
            elif (len(words) == 1 and len(name) >= 10
                  and name not in _ac_generic_blacklist
                  and name not in _MEDICAL_STOP_WORDS_SET
                  and _normalize_for_matching(name) not in _MEDICAL_STOP_WORDS_SET):
                ac.add_word(name, name)
                count += 1
        ac.make_automaton()
        _FINESS_AC = ac
        log.info(f"Gazetteer FINESS Aho-Corasick: {count} patterns chargés")
    except Exception as e:
        log.warning(f"Erreur construction FINESS Aho-Corasick: {e}")
|
||
|
||
|
||
def _normalize_positional(text: str) -> str:
|
||
"""Normalise en préservant la longueur : lowercase + accents → base char.
|
||
|
||
Chaque caractère accentué est remplacé par sa version sans accent.
|
||
Les caractères non-alphanumériques restent tels quels (même position).
|
||
Longueur de sortie == longueur d'entrée.
|
||
"""
|
||
import unicodedata
|
||
out = []
|
||
for ch in text:
|
||
# Lowercase
|
||
ch = ch.lower()
|
||
# Décomposer et retirer les accents
|
||
decomposed = unicodedata.normalize("NFD", ch)
|
||
base = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
|
||
out.append(base if base else ch)
|
||
return "".join(out)
|
||
|
||
|
||
def _mask_finess_establishments(text: str, return_matched_names: bool = False):
    """Mask FINESS establishment names detected by Aho-Corasick.

    Scans the normalised text (position-preserving: same length as the
    original) and replaces the occurrences found in the original text with
    [ETABLISSEMENT]. Only matches on word boundaries are accepted, matches
    inside existing placeholders or in an anatomical/surgical context are
    dropped, and on overlap the longest match wins.

    If return_matched_names=True, returns a tuple
    (masked_text, [original_names]); otherwise returns just the masked text
    (backward compatibility).
    """
    global _FINESS_AC
    if _FINESS_AC is None:
        _build_finess_ac()
    if _FINESS_AC is None:
        # pyahocorasick missing or data file absent → no-op
        return (text, []) if return_matched_names else text

    normalized = _normalize_positional(text)
    placeholder = PLACEHOLDERS["ETAB"]

    # Collect Aho-Corasick matches (end position, pattern name)
    matches = []
    for end_idx, name in _FINESS_AC.iter(normalized):
        start_idx = end_idx - len(name) + 1
        # Enforce word boundaries (never match inside a larger word)
        if start_idx > 0 and normalized[start_idx - 1].isalnum():
            continue
        if end_idx + 1 < len(normalized) and normalized[end_idx + 1].isalnum():
            continue
        # Skip matches already sitting inside a placeholder
        ctx_before = text[max(0, start_idx - 1):start_idx]
        ctx_after = text[end_idx + 1:min(len(text), end_idx + 2)]
        if "[" in ctx_before or "]" in ctx_after:
            continue
        matches.append((start_idx, end_idx + 1, name))

    # Contextual filter: drop FINESS matches inside an anatomical/surgical context
    # ("voie biliaire principale" → "principale" is a FINESS establishment, but not here)
    _FINESS_ANATOMICAL_CTX = re.compile(
        r"(?:biliaire|m[ée]sent[ée]rique|abdominale?|chirurgicale?|h[ée]patique|"
        r"pulmonaire|voie|art[èe]re|veine|fistule|fracture|l[ée]sion|muqueuse|paroi|"
        r"tissus?|muscle|ligament|membrane|thoracique|pelvien|pancr[ée]atique|"
        r"bronchique|intestinale?|c[ée]r[ée]brale?|vasculaire|digestive?)",
        re.IGNORECASE,
    )
    filtered_matches = []
    for start, end, name in matches:
        # Inspect up to 50 chars on each side of the match
        ctx_around = text[max(0, start - 50):start] + text[end:min(len(text), end + 50)]
        if _FINESS_ANATOMICAL_CTX.search(ctx_around):
            continue  # Anatomical context → not an establishment name
        filtered_matches.append((start, end, name))
    matches = filtered_matches

    if not matches:
        return (text, []) if return_matched_names else text

    # Sort by position, dedupe (keep the longest match on overlap)
    matches.sort(key=lambda x: (x[0], -(x[1] - x[0])))
    deduped = []
    last_end = 0
    for start, end, name in matches:
        if start >= last_end:
            deduped.append((start, end, name))
            last_end = end

    # Rebuild the text with the replacements (positions map 1:1 onto the original)
    result = []
    matched_names = []
    last_pos = 0
    for start, end, name in deduped:
        if start > len(text) or end > len(text):
            continue
        # Capture the original slice (accents, original casing) for the audit
        original_text = text[start:end]
        matched_names.append(original_text)
        result.append(text[last_pos:start])
        result.append(placeholder)
        last_pos = end
    result.append(text[last_pos:])

    masked = "".join(result)
    return (masked, matched_names) if return_matched_names else masked
|
||
|
||
|
||
# ----------------- FINESS Address Aho-Corasick matching -----------------
|
||
|
||
|
||
def _normalize_addr_with_posmap(text: str):
|
||
"""Normalise pour matching adresses : lowercase, accents, ponctuation → espace, collapse.
|
||
|
||
Retourne (normalized, posmap) où posmap[i] = position dans le texte original du char i.
|
||
Compatible avec les patterns de adresses_finess.txt (même normalisation).
|
||
"""
|
||
import unicodedata as _ud
|
||
# Étape 1 : lowercase + strip accents, en gardant la longueur
|
||
buf = []
|
||
for ch in text:
|
||
ch_low = ch.lower()
|
||
decomposed = _ud.normalize("NFD", ch_low)
|
||
base = "".join(c for c in decomposed if _ud.category(c) != "Mn")
|
||
buf.append(base if base else ch_low)
|
||
# Étape 2 : non-alnum → espace, avec position tracking
|
||
step2 = []
|
||
for i, ch in enumerate(buf):
|
||
if ch.isalnum() or ch == ' ':
|
||
step2.append((ch, i))
|
||
else:
|
||
step2.append((' ', i))
|
||
# Étape 3 : collapse espaces multiples
|
||
result_chars = []
|
||
posmap = []
|
||
prev_space = False
|
||
for ch, orig_pos in step2:
|
||
if ch == ' ':
|
||
if not prev_space:
|
||
result_chars.append(' ')
|
||
posmap.append(orig_pos)
|
||
prev_space = True
|
||
else:
|
||
result_chars.append(ch)
|
||
posmap.append(orig_pos)
|
||
prev_space = False
|
||
# Strip leading/trailing
|
||
normalized = "".join(result_chars)
|
||
leading = len(normalized) - len(normalized.lstrip())
|
||
normalized = normalized.strip()
|
||
posmap = posmap[leading:leading + len(normalized)]
|
||
return normalized, posmap
|
||
|
||
|
||
def _build_finess_addr_ac():
    """Build the Aho-Corasick automaton for FINESS addresses (lazy, first use).

    Reads data/finess/adresses_finess.txt, filters out generic / risky
    patterns, and stores the automaton in the module-global _FINESS_ADDR_AC.
    Silently no-ops when pyahocorasick or the data file is unavailable;
    build errors are logged, never raised.
    """
    global _FINESS_ADDR_AC
    if not _AHO_AVAILABLE:
        return
    data_dir = Path(__file__).parent / "data" / "finess"
    addr_path = data_dir / "adresses_finess.txt"
    if not addr_path.exists():
        return
    try:
        ac = _ahocorasick.Automaton()
        count = 0
        # Recognised street types (patterns with such a prefix are more reliable)
        _voie_types = {"rue", "avenue", "boulevard", "route", "chemin", "place",
                       "impasse", "allee", "square", "passage", "quai", "cours",
                       "sentier", "rond-point", "traverse", "esplanade",
                       "promenade", "montee", "voie", "carrefour", "faubourg"}
        # Non-address patterns to exclude
        _addr_blacklist = {"cabinet medical", "cabinet dentaire", "cabinet infirmier",
                           "cabinet paramedical", "cabinet sage-femme"}
        for line in addr_path.read_text(encoding="utf-8").splitlines():
            name = line.strip()
            if not name or len(name) < 10:
                continue
            if name in _addr_blacklist:
                continue
            words = name.split()
            # Drop patterns made only of medical stop words / tiny tokens
            if all(w in _MEDICAL_STOP_WORDS_SET or len(w) <= 2 for w in words):
                continue
            # Patterns without a street-type prefix: require >= 20 chars (avoids person names)
            has_voie_prefix = words[0] in _voie_types
            if not has_voie_prefix and len(name) < 20:
                continue
            ac.add_word(name, name)
            count += 1
        ac.make_automaton()
        _FINESS_ADDR_AC = ac
        log.info(f"Gazetteer FINESS adresses: {count} patterns chargés")
    except Exception as e:
        log.warning(f"Erreur construction FINESS adresses Aho-Corasick: {e}")
|
||
|
||
|
||
def _mask_finess_addresses(text: str, return_matched_names: bool = False):
    """Mask FINESS street addresses detected via Aho-Corasick.

    Uses a position-map-preserving normalization so that apostrophes, dots
    and other non-alphanumeric characters common in addresses still map
    back to exact positions in the original string.

    Args:
        text: Narrative text to scan.
        return_matched_names: When True, also return the list of original
            spans that were replaced.

    Returns:
        The masked text, or ``(masked_text, matched_names)`` when
        ``return_matched_names`` is True.
    """
    global _FINESS_ADDR_AC
    if _FINESS_ADDR_AC is None:
        _build_finess_addr_ac()
    if _FINESS_ADDR_AC is None:
        # Automaton unavailable (missing dependency or data file): no-op.
        return (text, []) if return_matched_names else text

    normalized, posmap = _normalize_addr_with_posmap(text)
    placeholder = PLACEHOLDERS.get("ADRESSE", "[ADRESSE]")

    matches = []
    for end_idx, name in _FINESS_ADDR_AC.iter(normalized):
        start_idx = end_idx - len(name) + 1
        # Enforce word boundaries in the normalized text.
        if start_idx > 0 and normalized[start_idx - 1].isalnum():
            continue
        if end_idx + 1 < len(normalized) and normalized[end_idx + 1].isalnum():
            continue
        # Map back to original positions.
        if start_idx >= len(posmap) or end_idx >= len(posmap):
            continue
        orig_start = posmap[start_idx]
        orig_end = posmap[end_idx] + 1
        # Skip spans that already sit inside a placeholder.
        ctx_before = text[max(0, orig_start - 1):orig_start]
        ctx_after = text[orig_end:min(len(text), orig_end + 1)]
        if "[" in ctx_before or "]" in ctx_after:
            continue
        matches.append((orig_start, orig_end, name))

    if not matches:
        return (text, []) if return_matched_names else text

    # Keep the longest match when spans overlap.
    matches.sort(key=lambda x: (x[0], -(x[1] - x[0])))
    deduped = []
    last_end = 0
    for start, end, name in matches:
        if start >= last_end:
            deduped.append((start, end, name))
            last_end = end

    result = []
    matched_names = []
    last_pos = 0
    for start, end, name in deduped:
        if start > len(text) or end > len(text):
            continue
        original_text = text[start:end]
        matched_names.append(original_text)
        # Extend left to capture the street number (e.g. "13, ").
        ext_start = start
        prefix = text[max(0, start - 15):start]
        num_match = re.search(r'(\d+\s*[,.]?\s*)$', prefix)
        if num_match:
            ext_start = start - (len(prefix) - num_match.start())
        # Extend right to capture BP/CS box + postal code + city.
        ext_end = end
        suffix = text[end:min(len(text), end + 60)]
        # BP/CS + number + optional postal code + optional city (CEDEX).
        bp_match = re.match(
            r'(\s*(?:BP|CS)\s*\d+\s*[,.]?\s*(?:\d{5}\s*)?(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\s\-]+(?:CEDEX)?)?)',
            suffix, re.IGNORECASE)
        if bp_match:
            ext_end = end + len(bp_match.group(1).rstrip())
        result.append(text[last_pos:ext_start])
        result.append(placeholder)
        last_pos = ext_end
    result.append(text[last_pos:])

    masked = "".join(result)
    return (masked, matched_names) if return_matched_names else masked
|
||
|
||
|
||
# ----------------- Ville Aho-Corasick gazetteer matching -----------------
|
||
|
||
def _build_ville_ac():
    """Build the Aho-Corasick automaton for city names (INSEE + FINESS).

    Called lazily on first use. Names are stored accent-free, lowercase
    (position-preserving matching via _normalize_positional), together with
    hyphenated and SAINT↔ST spelling variants.

    NOTE: we deliberately do NOT filter against _MEDICAL_STOP_WORDS_SET:
    those cities were added there to prevent detection as PERSON NAMES, not
    to prevent detection as cities. False-positive filtering relies on
    _VILLE_BLACKLIST and the minimum-length threshold instead.
    """
    global _VILLE_AC
    if not _AHO_AVAILABLE:
        return

    # Merge both city sources.
    all_villes: set = set()
    if _INSEE_COMMUNES:
        all_villes.update(_INSEE_COMMUNES)
    if _FINESS_VILLES:
        all_villes.update(v.upper() for v in _FINESS_VILLES)

    if not all_villes:
        log.warning("Aucune ville disponible pour l'automate Aho-Corasick VILLE")
        return

    # Notable 3-letter cities exempt from the minimum-length filter
    # (single-word names < 4 chars are otherwise too ambiguous).
    # Hoisted out of the loop: this is a constant set and was previously
    # rebuilt on every iteration.
    _VILLE_3CHAR_ALLOW = {"DAX", "PAU", "GAP", "APT", "GEX", "LUZ"}

    try:
        ac = _ahocorasick.Automaton()
        count = 0
        added_normalized: set = set()  # avoid duplicates after normalization
        for ville in all_villes:
            ville = ville.strip()
            if not ville:
                continue
            # Blacklist of ambiguous commune names.
            if ville.upper() in _VILLE_BLACKLIST:
                continue
            # Compound names in the gazetteers use spaces ("MONT DE MARSAN")
            # but usually appear hyphenated in texts ("Mont-de-Marsan"),
            # so both variants are added to the automaton.
            words = ville.split()
            # Minimum-length filter (single word < 4 chars → too ambiguous).
            if len(words) == 1 and len(ville) < 4 and ville.upper() not in _VILLE_3CHAR_ALLOW:
                continue
            # Normalize accent-free, lowercase (for positional matching).
            normalized_ville = _normalize_positional(ville)
            if normalized_ville not in added_normalized:
                ac.add_word(normalized_ville, (normalized_ville, ville))
                added_normalized.add(normalized_ville)
                count += 1

            # Closure over the current `ville`: registers a spelling variant
            # once, mapping it back to the canonical name.
            def _add_variant(variant_norm: str) -> None:
                nonlocal count
                if variant_norm and variant_norm not in added_normalized:
                    ac.add_word(variant_norm, (variant_norm, ville))
                    added_normalized.add(variant_norm)
                    count += 1

            # Hyphenated variant for compound names ("mont de marsan" → "mont-de-marsan").
            if len(words) >= 2:
                _add_variant(_normalize_positional("-".join(words)))
            # SAINT ↔ ST variants (INSEE gazetteers use "ST", texts "Saint").
            for prefix_src, prefix_dst in [("ST ", "SAINT "), ("ST ", "SAINT-"),
                                           ("SAINT ", "ST "), ("SAINT ", "ST-"),
                                           ("STE ", "SAINTE "), ("STE ", "SAINTE-"),
                                           ("SAINTE ", "STE "), ("SAINTE ", "STE-")]:
                if ville.startswith(prefix_src):
                    alt = prefix_dst + ville[len(prefix_src):]
                    _add_variant(_normalize_positional(alt))
                    _add_variant(_normalize_positional("-".join(alt.split())))
        ac.make_automaton()
        _VILLE_AC = ac
        log.info(f"Gazetteer VILLE Aho-Corasick: {count} patterns chargés "
                 f"(INSEE: {len(_INSEE_COMMUNES)}, FINESS: {len(_FINESS_VILLES)})")
    except Exception as e:
        log.warning(f"Erreur construction VILLE Aho-Corasick: {e}")
|
||
|
||
|
||
def _mask_ville_gazetteers(text: str) -> tuple:
    """Mask cities detected via Aho-Corasick in narrative text.

    Contextual strategy: to avoid massive false positives (CHARGE, SIGNES,
    TALON — communes that are homonyms of common words), a city is only
    masked when:
      - it is a hyphenated compound city (Saint-Palais), OR
      - it appears in an explicit geographic context (à, de, vers, habite, ...).

    Returns:
        (masked_text, list_of_original_masked_values)

    Fix: the early-exit path used to return the bare string, breaking the
    documented tuple contract; it now consistently returns ``(text, [])``.
    """
    global _VILLE_AC
    if _VILLE_AC is None:
        _build_ville_ac()
    if _VILLE_AC is None:
        # Automaton unavailable: keep the documented (text, masked) contract.
        return text, []

    normalized = _normalize_positional(text)
    placeholder = PLACEHOLDERS["VILLE"]

    # Geographic contexts that may precede a city name.
    # NOTE: bare "de" is too ambiguous ("prise de selles", "nombre de jumeaux");
    # "de" is only accepted after a geographic verb/noun or compound preposition.
    _RE_GEO_BEFORE = re.compile(
        r"(?:"
        # Preposition "à" (highly geographic)
        r"[àÀ]\s+|"
        # "de" only in a geographic context (vient de, originaire de, ...)
        r"(?:vient|venant|arrivant|provenant|originaire|issu(?:e)?)\s+(?:de\s+|d['']\s*)|"
        # "urgences de", "hôpital de", "clinique de", "UHCD de", etc.
        r"(?:urgences?|h[oô]pital|clinique|CHU?|CH\b|UHCD|SSR|USLD|HAD|EHPAD|CSAPA|CMPP|CMP|GHT|HIA)\s+(?:de\s+|d['']\s*)|"
        # Location verbs directly followed by the city
        r"(?:habit|résid|viv|domicilié(?:e)?|transféré(?:e)?|"
        r"adressé(?:e)?|hospitalisé(?:e)?|opéré(?:e)?|"
        r"Fait)\s+(?:à\s+|de\s+|d['']\s*)?|"
        # "vers" (direct geo preposition) — NOTE: "sur" excluded, too ambiguous ("sur le plan")
        r"vers\s+|"
        # After a postal code or an opening parenthesis (address)
        r"\[CODE_POSTAL\]\s*|"
        r"\(\s*|"
        # Medical referral contexts
        r"(?:urg(?:ences?)?\s+)|"
        # After medical title + masked name + dash/comma: "Dr [NOM] - VILLE"
        r"(?:Dr\.?|Pr\.?|Docteur|Professeur)\s+\[NOM\]\s*[\-–,]\s*|"
        r"\[NOM\]\s*[\-–,(]\s*|"
        # After a medical specialty: "cardiologue Anglet", "neurologue, DAX"
        r"(?:cardiologue|neurologue|radiologue|chirurgien|pneumologue|"
        r"gastro-?ent[ée]rologue|oncologue|n[ée]phrologue|urologue|"
        r"g[ée]riatre|dermatologue|rhumatologue|ophtalmologue|psychiatre|"
        r"anesth[ée]siste|gyn[ée]cologue|p[ée]diatre|m[ée]decin|g[ée]n[ée]raliste|"
        r"kin[ée]|ORL|MT)\s*[,]?\s*"
        r")\s*$",
        re.I,
    )

    # Collect Aho-Corasick matches (positions are valid in `text` too, since
    # _normalize_positional is position-preserving).
    matches = []
    for end_idx, (norm_name, orig_name) in _VILLE_AC.iter(normalized):
        start_idx = end_idx - len(norm_name) + 1
        # Enforce word boundaries (not in the middle of a word).
        if start_idx > 0 and normalized[start_idx - 1].isalnum():
            continue
        if end_idx + 1 < len(normalized) and normalized[end_idx + 1].isalnum():
            continue
        # Not already inside a placeholder [...].
        ctx_before = text[max(0, start_idx - 1):start_idx]
        ctx_after = text[end_idx + 1:min(len(text), end_idx + 2)]
        if "[" in ctx_before or "]" in ctx_after:
            continue
        # Not right after "[ETABLISSEMENT] de ..." and friends.
        wide_before = text[max(0, start_idx - 25):start_idx]
        if re.search(r"\[(VILLE|ADRESSE|ETABLISSEMENT)\]\s*(?:de\s+|du\s+|d['']\s*|à\s+)?$", wide_before):
            continue
        # Original text at this position.
        original_span = text[start_idx:end_idx + 1]
        word_count = len(orig_name.split())
        # Contextual strategy against false positives: ALWAYS require a
        # geographic context, except for hyphenated compound cities
        # (Saint-Palais, Mont-de-Marsan) which are very unambiguous.
        is_compound_hyphen = ("-" in original_span and word_count >= 2)
        if not is_compound_hyphen:
            before_ctx = text[max(0, start_idx - 40):start_idx]
            if not _RE_GEO_BEFORE.search(before_ctx):
                continue
        matches.append((start_idx, end_idx + 1, original_span))

    if not matches:
        return text, []

    # Deduplicate: prefer the longest match on overlap.
    # Sort by decreasing length, then greedily keep non-overlapping spans.
    matches.sort(key=lambda x: -(x[1] - x[0]))
    deduped = []
    for start, end, orig in matches:
        # Skip if this interval overlaps one already retained.
        if any(s < end and start < e for s, e, _ in deduped):
            continue
        deduped.append((start, end, orig))
    # Re-sort by position for reconstruction.
    deduped.sort(key=lambda x: x[0])

    # Rebuild the text with replacements applied.
    result = []
    masked_originals = []
    last_pos = 0
    for start, end, orig in deduped:
        if start > len(text) or end > len(text):
            continue
        result.append(text[last_pos:start])
        result.append(placeholder)
        masked_originals.append(orig)
        last_pos = end
    result.append(text[last_pos:])

    return "".join(result), masked_originals
|
||
|
||
|
||
# ----------------- Selective safety rescan -----------------
|
||
|
||
def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
    """Safety rescan: re-detect critical PII that slipped through the first pass.

    Re-applies the critical regex rules (identifiers, EMAIL/TEL/IBAN/NIR,
    addresses, postal codes), the FINESS/city gazetteers and the contextual
    person-name rules, while keeping [TABLES]...[/TABLES] sections out of
    scope and restoring them verbatim at the end.

    Args:
        text: Already partially masked text.
        cfg: Optional config dict; ``whitelist.sections_titres`` and
            ``whitelist.noms_maj_excepts`` are honored for person matching.

    Returns:
        The rescanned text with additional placeholders applied.
    """
    # Remove [TABLES] sections from the scan scope (restored at the end).
    def strip_tables(s: str):
        kept = []
        out = []
        i = 0
        pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
        for m in pattern.finditer(s):
            out.append(s[i:m.start()])
            # NOTE(review): the recorded start is where the opening "[TABLES]"
            # tag begins, while the restored payload excludes the tags, so the
            # NUL padding covering the tag characters survives in the final
            # string — confirm downstream strips "\x00".
            kept.append((len("".join(out)), len("".join(out)) + len(m.group(1)), m.group(1)))
            out.append("\x00" * (m.end() - m.start()))
            i = m.end()
        out.append(s[i:])
        return "".join(out), kept
    protected, kept = strip_tables(text)
    # Critical PII (same as the first pass)
    # IPP and exam numbers BEFORE TEL to avoid label collisions
    protected = RE_IPP.sub(lambda m: f"IPP : {PLACEHOLDERS['IPP']}", protected)
    protected = RE_NUM_EXAMEN_PATIENT.sub(
        lambda m: m.group(0).replace(m.group(1), PLACEHOLDERS["DOSSIER"]), protected
    )
    protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
    protected = RE_URL.sub(PLACEHOLDERS["MASK"], protected)
    protected = RE_TEL_SLASH.sub(PLACEHOLDERS["TEL"], protected)
    protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
    protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
    protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
    # NIR masked only when the checksum validates (avoids masking random digits)
    def _rescan_nir(m: re.Match) -> str:
        return PLACEHOLDERS["NIR"] if validate_nir(m.group(0)) else m.group(0)
    protected = RE_NIR.sub(_rescan_nir, protected)
    # Newer rules: birth dates, dates, addresses, postal codes
    protected = RE_DATE_NAISSANCE.sub(PLACEHOLDERS["DATE_NAISSANCE"], protected)
    # protected = RE_DATE.sub(PLACEHOLDERS["DATE"], protected)  # disabled
    protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected)
    protected = RE_ADRESSE_LIEU_DIT.sub(PLACEHOLDERS["ADRESSE"], protected)
    protected = RE_BP.sub(PLACEHOLDERS["ADRESSE"], protected)
    protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected)
    # Episode number
    protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected)
    # Visit / stay number
    protected = RE_VENUE_SEJOUR.sub(PLACEHOLDERS["NDA"], protected)
    # RPPS number
    protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected)
    # FINESS via gazetteer (9-digit numbers matching a real FINESS id)
    if _FINESS_NUMBERS:
        def _rescan_finess(m: re.Match) -> str:
            return PLACEHOLDERS["FINESS"] if m.group(1) in _FINESS_NUMBERS else m.group(0)
        protected = RE_BARE_9DIGITS.sub(_rescan_finess, protected)
    # Establishments (regex)
    protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
    protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
    # Establishments (FINESS Aho-Corasick gazetteer — 116K distinctive names)
    protected = _mask_finess_establishments(protected)
    # Addresses (FINESS Aho-Corasick gazetteer — 28K street names)
    protected = _mask_finess_addresses(protected)
    # Spaced-out header text: "C E N T R E H O S P I T A L I E R" → [ETABLISSEMENT]
    _re_spaced = re.compile(r'(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\s){4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]')
    _spaced_kw = {"HOSPITALIER", "HOSPITALIERE", "HOSPITALIERES", "HOSPITALIERS",
                  "CLINIQUE", "HOPITAL", "HÔPITAL", "POLYCLINIQUE",
                  "CENTRE", "ETABLISSEMENT", "MAISON", "RESIDENCE",
                  "EHPAD", "SSR", "USLD", "CHU", "CHRU"}
    for m_sp in _re_spaced.finditer(protected):
        collapsed = m_sp.group(0).replace(" ", "").upper()
        if any(kw in collapsed for kw in _spaced_kw):
            protected = protected.replace(m_sp.group(0), PLACEHOLDERS["ETAB"], 1)
    # Cities (Aho-Corasick gazetteer — INSEE + FINESS)
    if _VILLE_AC is None:
        _build_ville_ac()
    if _VILLE_AC is not None:
        protected, _ = _mask_ville_gazetteers(protected)
    # Hospital departments
    protected = RE_SERVICE.sub(PLACEHOLDERS["MASK"], protected)
    # Birth place / city of residence (accepts anything: cities, INSEE codes, lowercase)
    _re_lieu_rescan = re.compile(r"(Lieu\s+de\s+naissance\s*:\s*)(\S.+)")
    protected = _re_lieu_rescan.sub(lambda m: m.group(1) + PLACEHOLDERS["VILLE"], protected)
    _re_ville_rescan = re.compile(r"(Ville\s+de\s+r[ée]sidence\s*:\s*)(\S.+)")
    protected = _re_ville_rescan.sub(lambda m: m.group(1) + PLACEHOLDERS["VILLE"], protected)
    # Contextual person names (with whitelist)
    wl_sections = set()
    wl_phrases = set()
    if cfg:
        wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
        wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])
    def _rescan_person(m: re.Match) -> str:
        span = m.group(1).strip(); raw = m.group(0)
        if span in wl_sections or raw in wl_phrases:
            return raw
        tokens = [t for t in span.split() if t]
        if len(tokens) == 1 and len(tokens[0]) <= 4:
            return raw
        # Filter out medical terms (stop words)
        clean = [t for t in tokens if t.lower() not in _MEDICAL_STOP_WORDS_SET]
        if not clean:
            return raw
        return raw.replace(span, PLACEHOLDERS["NOM"])
    protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected)
    # Mr/Mme + isolated initial: "Mme Z", "Mr R" → mask
    protected = RE_CIVILITE_INITIALE.sub(
        lambda m: m.group(1) + PLACEHOLDERS["NOM"], protected
    )
    # Identifying initials before [NOM]: "Dr T. [NOM]" → "Dr [NOM] [NOM]"
    _re_init_nom = re.compile(r'\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])\.[\s\-]*(\[NOM\])')
    protected = _re_init_nom.sub(r'[NOM] \2', protected)
    # Reference initials: "Ref : JF/VA" → "Ref : [NOM]/[NOM]"
    _re_ref_init = re.compile(r'(?:Ref\s*:\s*|Réf\s*:\s*)([A-Z]{1,3})\s*/\s*([A-Z]{1,3})\b')
    protected = _re_ref_init.sub(
        lambda m: m.group(0)[:m.group(0).index(m.group(1))] + PLACEHOLDERS["NOM"] + "/" + PLACEHOLDERS["NOM"],
        protected,
    )
    # Restore the protected [TABLES] payloads in place.
    res = list(protected)
    for start, end, payload in kept:
        res[start:end] = list(payload)
    return "".join(res)
|
||
|
||
# ----------------- PDF Redaction -----------------
|
||
|
||
def _search_ocr_words_fuzzy_digits(ocr_words: List[Tuple[str, float, float, float, float]],
                                   token: str, page_rect, min_ratio: float = 0.7) -> list:
    """Fuzzy matching for handwritten numeric identifiers.

    Compares the digit sequences of the VLM token against each OCR word and
    accepts a match when >= *min_ratio* of the digits agree (exact,
    containment, or Hamming-style position-wise comparison).

    Args:
        ocr_words: (text, x0, y0, x1, y1) tuples with 0→1-normalized coords.
        token: Candidate identifier (non-digits are stripped).
        page_rect: Page rectangle used to denormalize coordinates.
        min_ratio: Minimum fraction of matching digits.

    Returns:
        fitz.Rect rectangles (PDF points) for every matching OCR word.
    """
    token_digits = re.sub(r"[^0-9]", "", token)
    if len(token_digits) < 4:
        return []
    rects = []

    def _to_rect(x0n: float, y0n: float, x1n: float, y1n: float):
        # Denormalize docTR 0→1 coordinates into PDF points.
        # Factored out: this conversion was duplicated in all three branches.
        return fitz.Rect(
            x0n * page_rect.width, y0n * page_rect.height,
            x1n * page_rect.width, y1n * page_rect.height,
        )

    for (word, x0n, y0n, x1n, y1n) in ocr_words:
        word_digits = re.sub(r"[^0-9]", "", word)
        if len(word_digits) < 3:
            continue
        # Exact digit match (after cleanup).
        if word_digits == token_digits:
            rects.append(_to_rect(x0n, y0n, x1n, y1n))
            continue
        # Partial match: token contained in the OCR word or vice-versa.
        if token_digits in word_digits or word_digits in token_digits:
            if min(len(token_digits), len(word_digits)) / max(len(token_digits), len(word_digits)) >= min_ratio:
                rects.append(_to_rect(x0n, y0n, x1n, y1n))
                continue
        # Distance match: position-wise digit comparison (Hamming-like).
        if abs(len(word_digits) - len(token_digits)) <= 2:
            shorter, longer = (word_digits, token_digits) if len(word_digits) <= len(token_digits) else (token_digits, word_digits)
            matches = sum(1 for a, b in zip(shorter, longer) if a == b)
            if matches / len(longer) >= min_ratio:
                rects.append(_to_rect(x0n, y0n, x1n, y1n))
    return rects
|
||
|
||
def _search_ocr_words(ocr_words: List[Tuple[str, float, float, float, float]], token: str, page_rect) -> list:
    """Look up *token* among a page's OCR words.

    Multi-word tokens are searched word by word; comparison is
    case-insensitive and tolerant of trailing punctuation on OCR words.
    Returns fitz.Rect rectangles expressed in PDF points.
    """
    found = []
    parts = token.split() if " " in token else [token]
    for part in parts:
        needle = part.lower().strip()
        if not needle:
            continue
        for word, x0n, y0n, x1n, y1n in ocr_words:
            if word.lower().strip(".,;:!?()") != needle:
                continue
            # Denormalize 0→1 coordinates into PDF points.
            found.append(fitz.Rect(
                x0n * page_rect.width,
                y0n * page_rect.height,
                x1n * page_rect.width,
                y1n * page_rect.height,
            ))
    return found
|
||
|
||
def _search_whole_word(page, token: str) -> list:
    """Search *token* as a whole word (not a substring) via get_text('words').

    Avoids the false positives of page.search_for(), which matches
    substrings. Also handles hyphenated compound names (JEAN-PIERRE) that
    the PDF layer may have split into separate words."""
    punct = ".,;:!?()[]{}\"'«»-–—/\\"
    needle = token.lower().strip()
    # w = (x0, y0, x1, y1, word, block_no, line_no, word_no)
    words = page.get_text("words")
    hits = [fitz.Rect(w[0], w[1], w[2], w[3])
            for w in words
            if w[4].strip(punct).lower() == needle]
    if hits or "-" not in token:
        return hits
    # Fallback: try each hyphen-separated part as its own whole word.
    parts = [p for p in token.split("-") if p]
    if len(parts) >= 2:
        for part in parts:
            sub_needle = part.lower().strip()
            if len(sub_needle) < 2:
                continue
            hits.extend(fitz.Rect(w[0], w[1], w[2], w[3])
                        for w in words
                        if w[4].strip(punct).lower() == sub_needle)
    return hits
|
||
|
||
def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, ocr_word_map: OcrWordMap = None) -> None:
    """Apply black redaction annotations to a PDF from an audit of PII hits.

    For each hit, locates the original token on its page (whole-word search
    for name-like kinds, plain search otherwise, with OCR-word fallback) and
    burns a black redaction rectangle over it, then saves to *out_pdf*.

    Args:
        original_pdf: Source PDF path.
        audit: PII hits; a hit with page == -1 is searched on every page.
        out_pdf: Destination path for the redacted PDF.
        ocr_word_map: Optional per-page OCR words (normalized coords) used as
            a fallback when the PDF text layer does not contain the token.

    Raises:
        RuntimeError: When PyMuPDF is not installed.
    """
    if fitz is None:
        raise RuntimeError("PyMuPDF non disponible – installez pymupdf.")
    doc = fitz.open(str(original_pdf))
    # Index hits by page; page == -1 → search on all pages.
    by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        by_page.setdefault(h.page, []).append(h)
    # Kinds never searched in the PDF (dates are masked only in the text;
    # blacking them out in the PDF makes tables unreadable).
    _VECTOR_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"}
    # Kinds prone to substring false positives: use _search_whole_word.
    _VECTOR_WHOLEWORD_KINDS = {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
                               "EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL",
                               "ETAB_FINESS", "ADDR_FINESS", "NER_PER", "NER_ORG", "NER_LOC"}
    for pno in range(len(doc)):
        page = doc[pno]
        hits = by_page.get(pno, []) + by_page.get(-1, [])
        if not hits:
            continue
        # Deduplicate tokens: each token is searched once per page.
        seen_tokens: set = set()
        all_rects = []
        for h in hits:
            token = h.original.strip()
            if not token:
                continue
            if h.kind in _VECTOR_SKIP_KINDS:
                continue
            # Dedup key is the token itself.
            dedup_key = token
            if dedup_key in seen_tokens:
                continue
            seen_tokens.add(dedup_key)
            # --- Name/entity kinds: whole-word search to avoid substring
            # matching (e.g. "TATIN" inside "ATORVASTATINE") ---
            if h.kind in _VECTOR_WHOLEWORD_KINDS:
                if token.lower() in _MEDICAL_STOP_WORDS_SET:
                    continue
                if " " not in token:
                    rects = _search_whole_word(page, token)
                    if not rects and ocr_word_map and pno in ocr_word_map:
                        rects = _search_ocr_words(ocr_word_map[pno], token, page.rect)
                    all_rects.extend(rects)
                else:
                    # Multi-word token: verify word boundaries (same as raster).
                    rects = page.search_for(token)
                    if rects:
                        page_text = page.get_text()
                        import re as _re
                        if not _re.search(r"(?<![A-Za-zÀ-ÿ])" + _re.escape(token) + r"(?![A-Za-zÀ-ÿ])",
                                          page_text, _re.IGNORECASE):
                            rects = []
                    if not rects:
                        # Fallback: search each long, non-stop-word word alone.
                        for word in token.split():
                            word = word.strip(" .-'")
                            if len(word) < 4 or word.lower() in _MEDICAL_STOP_WORDS_SET:
                                continue
                            rects.extend(_search_whole_word(page, word))
                            if not rects and ocr_word_map and pno in ocr_word_map:
                                rects.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
                    all_rects.extend(rects)
                continue
            # Other kinds: plain substring search.
            rects = page.search_for(token)
            if not rects and h.kind in {"NIR", "IBAN", "TEL"}:
                # Retry with whitespace removed (e.g. "06 12 34" vs "061234").
                compact = re.sub(r"\s+", "", token)
                if compact != token:
                    rects = page.search_for(compact)
            if not rects and ocr_word_map and pno in ocr_word_map:
                rects = _search_ocr_words(ocr_word_map[pno], token, page.rect)
            all_rects.extend(rects)
        # Add all annotations in one batch (keeps search_for fast).
        for r in all_rects:
            page.add_redact_annot(r, fill=(0, 0, 0))
        try:
            page.apply_redactions()
        except Exception:
            # Best effort: a page that cannot be redacted is left untouched.
            pass
    doc.save(str(out_pdf), deflate=True, garbage=4, clean=True, incremental=False)
    doc.close()
|
||
|
||
|
||
def _rasterize_page(args):
    """Parallel worker: rasterize one page and draw the black redaction boxes.

    Args (packed in a single tuple for ProcessPoolExecutor):
        pdf_path_str: Path of the source PDF.
        pno: 0-based page index.
        rects_tuples: PII rectangles (x0, y0, x1, y1) in PDF points.
        dpi: Target raster resolution.
        ogc_label: Optional label stamped in the top-right corner.
        jpeg_quality: JPEG quality; falsy or <= 0 falls back to PNG.
        image_rects_tuples: Embedded-image rectangles to blacken entirely.

    Returns:
        (pno, encoded_image_bytes, page_width_pts, page_height_pts)
    """
    pdf_path_str, pno, rects_tuples, dpi, ogc_label, jpeg_quality, image_rects_tuples = args
    doc = fitz.open(pdf_path_str)
    src = doc[pno]
    rect_w, rect_h = src.rect.width, src.rect.height
    zoom = dpi / 72.0  # PDF points are 1/72 inch
    pix = src.get_pixmap(matrix=fitz.Matrix(zoom, zoom), annots=False)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    draw = ImageDraw.Draw(img)
    # Shrink PII boxes horizontally by 1.5 px so neighbours are not clipped.
    shrink = 1.5
    for (x0, y0, x1, y1) in rects_tuples:
        rx0 = x0 * zoom + shrink
        ry0 = y0 * zoom
        rx1 = x1 * zoom - shrink
        ry1 = y1 * zoom
        if rx1 > rx0:
            draw.rectangle([rx0, ry0, rx1, ry1], fill=(0, 0, 0))
    # Blacken embedded images (logos, signatures, screenshots).
    for (x0, y0, x1, y1) in image_rects_tuples:
        rx0 = x0 * zoom
        ry0 = y0 * zoom
        rx1 = x1 * zoom
        ry1 = y1 * zoom
        draw.rectangle([rx0, ry0, rx1, ry1], fill=(0, 0, 0))
    # Detect and blacken barcodes and QR codes.
    try:
        from pyzbar.pyzbar import decode as _pyzbar_decode
        for symbol in _pyzbar_decode(img):
            r = symbol.rect
            margin = int(5 * zoom)
            draw.rectangle([r.left - margin, r.top - margin,
                            r.left + r.width + margin, r.top + r.height + margin],
                           fill=(0, 0, 0))
    except Exception:
        pass  # pyzbar is optional
    if ogc_label:
        from PIL import ImageFont
        font_size = int(14 * zoom)
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
        except Exception:
            # Font file missing on this host: degrade to PIL's built-in font.
            font = ImageFont.load_default()
        text = ogc_label if ogc_label.upper().startswith("OGC") else f"OGC: {ogc_label}"
        bbox = draw.textbbox((0, 0), text, font=font)
        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
        margin = int(10 * zoom)
        x = img.width - tw - margin
        y = margin
        # White backing box keeps the label readable over any content.
        draw.rectangle([x - 4, y - 2, x + tw + 4, y + th + 2], fill=(255, 255, 255))
        draw.text((x, y), text, fill=(0, 0, 0), font=font)
    buf = io.BytesIO()
    if jpeg_quality and jpeg_quality > 0:
        img.save(buf, format="JPEG", quality=jpeg_quality, optimize=True)
    else:
        img.save(buf, format="PNG")
    doc.close()
    return pno, buf.getvalue(), rect_w, rect_h
|
||
|
||
|
||
def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 120, ogc_label: Optional[str] = None, ocr_word_map: OcrWordMap = None, jpeg_quality: int = 80) -> None:
|
||
if fitz is None:
|
||
raise RuntimeError("PyMuPDF non disponible – installez pymupdf.")
|
||
doc = fitz.open(str(original_pdf))
|
||
all_rects: Dict[int, List["fitz.Rect"]] = {}
|
||
_RASTER_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"}
|
||
# Kinds sensibles au substring matching : utiliser _search_whole_word
|
||
_RASTER_WHOLEWORD_KINDS = {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
|
||
"EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL",
|
||
"ETAB_FINESS", "ADDR_FINESS", "NER_PER", "NER_ORG", "NER_LOC"}
|
||
_VLM_NUMERIC_KINDS = {"VLM_NUM_PATIENT", "VLM_NUM_LOT", "VLM_NUM_ORD", "VLM_NDA",
|
||
"VLM_NIR", "VLM_IPP", "VLM_RPPS"}
|
||
by_page: Dict[int, List[PiiHit]] = {}
|
||
for h in audit:
|
||
by_page.setdefault(h.page, []).append(h)
|
||
for pno in range(len(doc)):
|
||
page = doc[pno]
|
||
rects = []
|
||
seen_tokens: set = set()
|
||
hits = by_page.get(pno, []) + by_page.get(-1, [])
|
||
# Masquage total si FULL_PAGE_MASK détecté (page manuscrite non déchiffrable)
|
||
if any(h.kind == "FULL_PAGE_MASK" and h.page == pno for h in hits):
|
||
margin = 5 # points — liseré fin autour du masque
|
||
rects.append(fitz.Rect(margin, margin, page.rect.width - margin, page.rect.height - margin))
|
||
all_rects[pno] = rects
|
||
continue
|
||
for h in hits:
|
||
token = h.original.strip()
|
||
if not token or h.kind in _RASTER_SKIP_KINDS:
|
||
continue
|
||
# Ignorer les tokens trop courts (initiales isolées, "N", "S", "de")
|
||
# qui génèrent des FP en matchant dans les labels DPI
|
||
if len(token) < 3 and h.kind in _RASTER_WHOLEWORD_KINDS:
|
||
continue
|
||
if token in seen_tokens:
|
||
continue
|
||
seen_tokens.add(token)
|
||
# --- Kinds de type nom/entité : whole-word search pour éviter le
|
||
# substring matching (ex: "TATIN" dans "ATORVASTATINE") ---
|
||
if h.kind in _RASTER_WHOLEWORD_KINDS:
|
||
if token.lower() in _MEDICAL_STOP_WORDS_SET:
|
||
continue
|
||
if " " not in token:
|
||
# Token mono-mot : chercher comme mot entier
|
||
found_ww = _search_whole_word(page, token)
|
||
if not found_ww and ocr_word_map and pno in ocr_word_map:
|
||
found_ww = _search_ocr_words(ocr_word_map[pno], token, page.rect)
|
||
rects.extend(found_ww)
|
||
else:
|
||
# Token multi-mots : chercher la chaîne complète puis vérifier
|
||
# les frontières de mots pour éviter le substring matching
|
||
# (ex: "TATINE EG" trouvé dans "ATORVASTATINE EG")
|
||
found_multi = page.search_for(token)
|
||
if found_multi:
|
||
# Vérifier que le match est sur des frontières de mots
|
||
page_text = page.get_text()
|
||
verified = []
|
||
for rect in found_multi:
|
||
# Extraire le texte autour du match pour vérifier les limites
|
||
# Chercher le token dans le texte brut avec \b
|
||
import re as _re
|
||
if _re.search(r"(?<![A-Za-zÀ-ÿ])" + _re.escape(token) + r"(?![A-Za-zÀ-ÿ])",
|
||
page_text, _re.IGNORECASE):
|
||
verified.append(rect)
|
||
found_multi = verified
|
||
if not found_multi:
|
||
# Fallback : chercher chaque mot comme mot entier
|
||
for word in token.split():
|
||
word = word.strip(" .-'")
|
||
if len(word) < 4 or word.lower() in _MEDICAL_STOP_WORDS_SET:
|
||
continue
|
||
found_multi.extend(_search_whole_word(page, word))
|
||
if not found_multi and ocr_word_map and pno in ocr_word_map:
|
||
found_multi.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
|
||
rects.extend(found_multi)
|
||
continue
|
||
found = page.search_for(token)
|
||
if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}:
|
||
compact = re.sub(r"\s+", "", token)
|
||
found = page.search_for(compact)
|
||
if not found and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM",
|
||
"VLM_NOM", "VLM_ETAB", "VLM_SERVICE"}:
|
||
for word in token.split():
|
||
word = word.strip(" .-'")
|
||
if len(word) < 3 or word.lower() in _MEDICAL_STOP_WORDS_SET:
|
||
continue
|
||
found.extend(page.search_for(word))
|
||
# Fallback OCR pour chaque mot
|
||
if not found and ocr_word_map and pno in ocr_word_map:
|
||
found.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
|
||
if not found and ocr_word_map and pno in ocr_word_map:
|
||
found = _search_ocr_words(ocr_word_map[pno], token, page.rect)
|
||
# Matching flou pour identifiants numériques VLM (manuscrit)
|
||
if not found and h.kind in _VLM_NUMERIC_KINDS and ocr_word_map and pno in ocr_word_map:
|
||
found = _search_ocr_words_fuzzy_digits(ocr_word_map[pno], token, page.rect)
|
||
rects.extend(found)
|
||
all_rects[pno] = rects
|
||
|
||
# Phase 2 : rasterisation parallèle (ProcessPoolExecutor)
|
||
n_pages = len(doc)
|
||
rects_as_tuples = {
|
||
pno: [(r.x0, r.y0, r.x1, r.y1) for r in rects]
|
||
for pno, rects in all_rects.items()
|
||
}
|
||
# Extraire les positions des images embarquées (logos, captures d'écran, etc.)
|
||
# Filtre : images > 30x30 points (ignorer icônes/puces)
|
||
# MAIS exclure les images pleine page (> 70% surface page) = documents scannés
|
||
_MIN_IMG_DIM = 30
|
||
_MAX_PAGE_COVERAGE = 0.70 # au-delà = image du document scanné, ne pas noircir
|
||
image_rects_by_page: Dict[int, list] = {}
|
||
for pno in range(n_pages):
|
||
page = doc[pno]
|
||
page_area = page.rect.width * page.rect.height
|
||
img_rects = []
|
||
for img_item in page.get_images(full=True):
|
||
xref = img_item[0]
|
||
try:
|
||
for r in page.get_image_rects(xref):
|
||
if r.is_empty or r.is_infinite:
|
||
continue
|
||
if r.width >= _MIN_IMG_DIM and r.height >= _MIN_IMG_DIM:
|
||
# Exclure les images pleine page (document scanné)
|
||
img_area = r.width * r.height
|
||
if page_area > 0 and img_area / page_area > _MAX_PAGE_COVERAGE:
|
||
continue
|
||
img_rects.append((r.x0, r.y0, r.x1, r.y1))
|
||
except Exception:
|
||
continue
|
||
image_rects_by_page[pno] = img_rects
|
||
doc.close() # fermer AVANT le fork
|
||
|
||
n_workers = min(n_pages, os.cpu_count() or 4)
|
||
tasks = [
|
||
(str(original_pdf), pno, rects_as_tuples.get(pno, []), dpi, ogc_label, jpeg_quality,
|
||
image_rects_by_page.get(pno, []))
|
||
for pno in range(n_pages)
|
||
]
|
||
|
||
with ProcessPoolExecutor(max_workers=n_workers) as pool:
|
||
results = sorted(pool.map(_rasterize_page, tasks), key=lambda x: x[0])
|
||
|
||
# Assemblage final (séquentiel, rapide)
|
||
out = fitz.open()
|
||
for pno, img_bytes, w, h in results:
|
||
dst = out.new_page(width=w, height=h)
|
||
dst.insert_image(fitz.Rect(0, 0, w, h), stream=img_bytes)
|
||
out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
|
||
out.close()
|
||
|
||
# ----------------- VLM pour PDFs scannés -----------------
|
||
|
||
def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: OcrWordMap, vlm_manager) -> None:
    """Use a VLM (Ollama) to visually detect PII on each page of a scanned PDF.

    Detected entities are appended to ``anon.audit`` and substituted into the
    pseudonymised text (``anon.text_out``) in place.

    Pages with very little OCR text (< 100 words) are treated as handwritten
    scans and get a single FULL_PAGE_MASK hit instead of a VLM analysis
    (the VLM is too slow / hallucination-prone on handwriting).

    Args:
        pdf_path: PDF to render page-by-page (150 dpi) for the VLM.
        anon: accumulated anonymisation result; mutated in place.
        ocr_word_map: per-page OCR words, used only to count words per page.
        vlm_manager: project object exposing ``analyze_page_image(img,
            page_number=..., existing_pii=...)`` returning entity dicts with
            keys "categorie", "texte", "confiance".
    """
    from vlm_manager import VLM_CATEGORY_MAP
    doc = fitz.open(str(pdf_path))
    # Already-detected PII values, passed to the VLM as context (deduplicated).
    existing_pii = list({h.original.strip() for h in anon.audit if h.original.strip()})

    # Categories holding numeric identifiers (also indexed digits-only for fuzzy match).
    _NUMERIC_CATS = {"NUMERO_PATIENT", "NUMERO_LOT", "NUMERO_ORDONNANCE", "NUMERO_SEJOUR",
                     "NDA", "NIR", "IPP", "RPPS"}
    # Categories split into individual words (names, services, facilities)
    # for better word-level OCR matching downstream.
    _SPLIT_CATS = {"NOM", "PRENOM", "ETABLISSEMENT", "SERVICE"}

    for pno in range(len(doc)):
        pix = doc[pno].get_pixmap(dpi=150)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # Handwritten-page heuristic: few OCR words = handwritten / badly oriented scan.
        ocr_count = len(ocr_word_map.get(pno, []))
        is_handwritten_page = ocr_count < 100

        # Handwritten pages: mask the whole page directly (no VLM analysis).
        if is_handwritten_page and ocr_count > 0:
            anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
                                     placeholder=PLACEHOLDERS["MASK"]))
            log.info("VLM page %d : masquage total direct (OCR=%d mots)", pno, ocr_count)
            continue

        # Readable pages: VLM analysis (best-effort; any failure yields no entities).
        best_entities = []
        try:
            best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
                                                          existing_pii=existing_pii[:20])
        except Exception:
            best_entities = []

        for ent in best_entities:
            cat = ent.get("categorie", "").upper()
            texte = ent.get("texte", "").strip()
            conf = ent.get("confiance", 0.0)
            # Drop empty / low-confidence (< 0.3) / unmapped categories.
            if not texte or conf < 0.3:
                continue
            if cat not in VLM_CATEGORY_MAP:
                continue
            kind, placeholder_key = VLM_CATEGORY_MAP[cat]
            placeholder = PLACEHOLDERS.get(placeholder_key, PLACEHOLDERS["MASK"])

            if cat in _SPLIT_CATS:
                # Split into words for better OCR matching; skip very short
                # tokens and medical stop-words.
                for word in texte.split():
                    word = word.strip(" .-'(),")
                    if len(word) < 2 or word.lower() in _MEDICAL_STOP_WORDS_SET:
                        continue
                    anon.audit.append(PiiHit(page=pno, kind=kind, original=word, placeholder=placeholder))
            else:
                anon.audit.append(PiiHit(page=pno, kind=kind, original=texte, placeholder=placeholder))
                # For numeric identifiers, also record the digits-only form
                # so fuzzy digit matching can find it later.
                if cat in _NUMERIC_CATS:
                    digits_only = re.sub(r"[^0-9]", "", texte)
                    if digits_only and digits_only != texte:
                        anon.audit.append(PiiHit(page=pno, kind=kind, original=digits_only, placeholder=placeholder))

            # Substitute in the pseudonymised text (whole-word first, plain
            # replace if the token breaks the regex).
            try:
                anon.text_out = re.sub(rf"\b{re.escape(texte)}\b", placeholder, anon.text_out)
            except re.error:
                anon.text_out = anon.text_out.replace(texte, placeholder)

    doc.close()
|
||
|
||
|
||
# ----------------- Orchestration -----------------
|
||
|
||
def process_pdf(
    pdf_path: Path,
    out_dir: Path,
    make_vector_redaction: bool = True,
    also_make_raster_burn: bool = False,
    config_path: Optional[Path] = None,
    use_hf: bool = False,
    ner_manager=None,
    ner_thresholds=None,
    ogc_label: Optional[str] = None,
    vlm_manager=None,
    gliner_manager=None,
    camembert_manager=None,
) -> Dict[str, str]:
    """Run the full anonymisation pipeline on one PDF and write the outputs.

    Pipeline order (order matters — later passes clean up earlier ones):
    text extraction with OCR fallback -> regex rules -> optional VLM pass
    (scanned PDFs only) -> optional NER on the narrative -> selective rescan
    -> post-mask cleanups (orphan postal codes, fragmented phones, initials)
    -> selective global propagation of critical PII across pages -> hospital
    false-positive filter -> text/audit files and optional redacted PDFs.

    Args:
        pdf_path: input PDF.
        out_dir: output directory, created if missing.
        make_vector_redaction: also emit a vector-redacted PDF (needs PyMuPDF).
        also_make_raster_burn: also emit a rasterised redacted PDF.
        config_path: optional YAML dictionaries/overrides.
        use_hf: enable the NER pass (requires a loaded ``ner_manager``).
        ner_manager: project NER manager (EdsPseudoManager or ONNX manager).
        ner_thresholds: thresholds forwarded to the HF NER pass.
        ogc_label: label forwarded to the raster redaction.
        vlm_manager: optional VLM manager, used on scanned PDFs only.
        gliner_manager: extra manager forwarded to the EDS-Pseudo pass only.
        camembert_manager: extra manager forwarded to the EDS-Pseudo pass only.

    Returns:
        Mapping of output kind to file path: always "text" and "audit",
        plus "pdf_vector" / "pdf_raster" when those PDFs were produced.
    """
    out_dir.mkdir(parents=True, exist_ok=True)
    cfg = load_dictionaries(config_path)
    pages_text, tables_lines, ocr_used, ocr_word_map = extract_text_with_fallback_ocr(pdf_path)

    # 1) Regex rules
    anon = anonymise_document_regex(pages_text, tables_lines, cfg)

    # 1b) VLM (optional) — on scanned PDFs only
    if ocr_used and vlm_manager is not None and VlmManager is not None:
        try:
            if vlm_manager.is_loaded():
                _apply_vlm_on_scanned_pdf(pdf_path, anon, ocr_word_map, vlm_manager)
        except Exception:
            pass  # graceful degradation: VLM is strictly best-effort

    # 2) NER (optional) — on the narrative
    final_text = anon.text_out
    hf_hits: List[PiiHit] = []
    if use_hf and ner_manager is not None and ner_manager.is_loaded():
        # Detect the manager type and dispatch to the matching pass
        if EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager):
            final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager, gliner_mgr=gliner_manager, camembert_mgr=camembert_manager)
        else:
            final_text, hf_hits = apply_hf_ner_on_narrative(final_text, cfg, ner_manager, ner_thresholds)
    anon.audit.extend(hf_hits)

    # 3) Selective rescan (EMAIL/TEL/IBAN/NIR safety net)
    final_text = selective_rescan(final_text, cfg=cfg)

    # 3b) Post-mask cleanup: orphan postal codes (5 digits glued to a
    # placeholder) and phone numbers fragmented across lines
    _re_cp_orphan = re.compile(r"(\[(?:ADRESSE|NOM|VILLE)\])\s*(\d{5})\b")
    def _clean_cp_orphan(m):
        # Record the hit (page=-1 = global) and mask only the digits.
        anon.audit.append(PiiHit(-1, "CODE_POSTAL", m.group(2), PLACEHOLDERS["CODE_POSTAL"]))
        return m.group(1) + PLACEHOLDERS["CODE_POSTAL"]
    final_text = _re_cp_orphan.sub(_clean_cp_orphan, final_text)

    # Fragmented phones: "0X XX XX XX\nXX" split at end of line (next line immediately)
    _re_tel_frag = re.compile(r"((?:\+33\s?|0)\d(?:[ .-]?\d){6,7})\s*\n\s*(\d{2}(?!\d))")
    def _clean_tel_frag(m):
        # Rejoin the two fragments and mask only if they form a full 10-digit number.
        full = m.group(1).replace(" ", "").replace(".", "").replace("-", "") + m.group(2)
        if len(full.replace("+33", "0")) == 10:
            anon.audit.append(PiiHit(-1, "TEL", m.group(0).strip(), PLACEHOLDERS["TEL"]))
            return PLACEHOLDERS["TEL"] + "\n"
        return m.group(0)
    final_text = _re_tel_frag.sub(_clean_tel_frag, final_text)

    # Incomplete phones at end of line (8 or 9 digits in 0X XX XX XX form): mask the visible part
    _re_tel_partial = re.compile(r"(?<!\d)((?:\+33\s?|0)\d(?:[ .-]?\d){5,7})(?!\d)\s*$", re.MULTILINE)
    def _clean_tel_partial(m):
        digits = re.sub(r"[ .\-]", "", m.group(1))
        if 8 <= len(digits) <= 9:
            anon.audit.append(PiiHit(-1, "TEL", m.group(0).strip(), PLACEHOLDERS["TEL"]))
            return PLACEHOLDERS["TEL"]
        return m.group(0)
    final_text = _re_tel_partial.sub(_clean_tel_partial, final_text)

    # 3c) Identifying initials before [NOM]: "Dr T. [NOM]" -> "Dr [NOM] [NOM]"
    _RE_INITIAL_BEFORE_NOM = re.compile(
        r'\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])\.[\s\-]*(\[NOM\])'
    )
    def _clean_initial_before_nom(m):
        anon.audit.append(PiiHit(-1, "NOM_INITIAL", m.group(1) + ".", PLACEHOLDERS["NOM"]))
        return PLACEHOLDERS["NOM"] + " " + m.group(2)
    final_text = _RE_INITIAL_BEFORE_NOM.sub(_clean_initial_before_nom, final_text)

    # 3d) Reference initials: "Ref : JF/VA", "Réf : AD/EP" -> "Ref : [NOM]/[NOM]"
    _RE_REF_INITIALS = re.compile(
        r'(?:Ref\s*:\s*|Réf\s*:\s*)([A-Z]{1,3})\s*/\s*([A-Z]{1,3})\b'
    )
    def _clean_ref_initials(m):
        anon.audit.append(PiiHit(-1, "NOM_INITIAL", m.group(1), PLACEHOLDERS["NOM"]))
        anon.audit.append(PiiHit(-1, "NOM_INITIAL", m.group(2), PLACEHOLDERS["NOM"]))
        # Keep the "Ref :" prefix verbatim (everything before the first group).
        prefix = m.group(0)[:m.group(0).index(m.group(1))]
        return prefix + PLACEHOLDERS["NOM"] + "/" + PLACEHOLDERS["NOM"]
    final_text = _RE_REF_INITIALS.sub(_clean_ref_initials, final_text)

    # 4) Consolidation: propagate detected PII to all pages (page=-1)
    # so PDF redaction searches them everywhere (repeated sidebars, etc.)

    # 4a) Names: extract individual tokens (capitalised, >=3 chars, not stop-words)
    _nom_kinds = {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}
    _global_name_tokens: set = set()
    for h in anon.audit:
        if h.kind not in _nom_kinds:
            continue
        for word in h.original.split():
            word = word.strip(" .-'")
            if len(word) < 3:
                continue
            if word.lower() in _MEDICAL_STOP_WORDS_SET:
                continue
            if not word[0].isupper():
                continue
            _global_name_tokens.add(word)
    # 4a-bis) Companion names: if a known token is followed/preceded by an
    # unknown ALL-CAPS word in the raw text, it is a name too
    # (e.g. "Diego OLIVER" -> OLIVER is a name)
    _COMPANION_BLACKLIST = {
        "ZONE", "PARTI", "PLAN", "MAIN", "FORT", "FORTE", "BILAN",
        "MISE", "NOTE", "AIDE", "BASE", "FACE", "DOSE", "TIGE",
        "VOIE", "ONDE", "SOIN", "DEMI", "MODE", "CURE", "PAGE",
        # Specialties/departments
        "CANCEROLOGIE", "ONCOLOGIE", "REANIMATION", "RADIOLOGIE",
        "CARDIOLOGIE", "NEUROLOGIE", "PNEUMOLOGIE", "UROLOGIE",
        "GERIATRIE", "PEDIATRIE", "NEPHROLOGIE", "HEMATOLOGIE",
        "OPHTALMOLOGIE", "STOMATOLOGIE", "ALLERGOLOGIE",
        "RHUMATOLOGIE", "DERMATOLOGIE", "IMMUNOLOGIE",
        # Medical/common terms, FP source OGC 21
        "ALIMENTATION", "AUGMENTATION", "AMELIORATION",
        "BILIAIRES", "BILIAIRE", "VOIES", "BILI",
        "MEDECINE", "ENTERO", "DOSSIER", "AVIATION",
        "SULFAMIDES", "CLAVULANIQUE", "MECILLINAM",
        "TAZOBACTAM", "TEMOCILLINE", "ECOFLAC", "FURANES",
        "CONTENTION", "ISOLEMENT", "ELIMINATION",
        "PANCREATITE", "INFECTIEUX", "HEMODYNAMIQUE",
        "SENSIBLE", "VARIABLE", "DOSAGE", "CAT",
    }
    raw_full = "\n\n".join(pages_text)
    _companion_tokens: set = set()
    for token in _global_name_tokens:
        # Known token followed by an ALL-CAPS word
        for m in re.finditer(rf"\b{re.escape(token)}\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{4,}})\b", raw_full):
            candidate = m.group(1)
            if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
                    and candidate not in _global_name_tokens
                    and candidate not in _COMPANION_BLACKLIST):
                _companion_tokens.add(candidate)
        # ALL-CAPS word followed by the known token
        for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{4,}})\s+{re.escape(token)}\b", raw_full):
            candidate = m.group(1)
            if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
                    and candidate not in _global_name_tokens
                    and candidate not in _COMPANION_BLACKLIST):
                _companion_tokens.add(candidate)
    _global_name_tokens.update(_companion_tokens)

    # Drop SHORT sub-parts of compound names (JEAN if JEAN-PIERRE exists).
    # Keep long parts (>=5 chars) because the text may split them across lines.
    _compound = {t for t in _global_name_tokens if "-" in t}
    _parts_to_drop = set()
    for comp in _compound:
        for part in comp.split("-"):
            part = part.strip()
            if len(part) >= 2 and len(part) < 5 and part in _global_name_tokens:
                _parts_to_drop.add(part)
    _global_name_tokens -= _parts_to_drop

    # 4a-ter) Final filtering of global tokens: reject words that do not look
    # like proper nouns
    # - common French words (lowercase initials already filtered above)
    # - ALL-CAPS <= 4 chars confirmed by a single source only
    _nom_kind_counts: Dict[str, set] = {}
    for h in anon.audit:
        if h.kind in _nom_kinds:
            for word in h.original.split():
                word = word.strip(" .-'")
                if word:
                    _nom_kind_counts.setdefault(word, set()).add(h.kind)
    _filtered_global: set = set()
    for token in _global_name_tokens:
        # Short ALL-CAPS (<=4) with a single source -> probably an abbreviation
        if token.isupper() and len(token) <= 4 and len(_nom_kind_counts.get(token, set())) < 2:
            continue
        _filtered_global.add(token)
    _global_name_tokens = _filtered_global

    # DISABLED: NOM_GLOBAL produced 670 FP with 0 TP (100% false positives).
    # Global propagation of names is too aggressive.
    # for token in _global_name_tokens:
    #     anon.audit.append(PiiHit(page=-1, kind="NOM_GLOBAL", original=token, placeholder=PLACEHOLDERS["NOM"]))

    # 4b) SELECTIVE global propagation: critical PII only.
    # Critical PII (DATE_NAISSANCE, NIR, IPP, EMAIL, ...) are propagated to
    # every page to avoid leaks in multi-page documents (e.g. CRO).
    _CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS", "DOSSIER", "NDA", "EPISODE"}

    _global_pii: Dict[str, set] = {}
    for h in anon.audit:
        # Collect ALL these types for analysis, but only the critical ones
        # will actually be propagated below.
        if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
                      "VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP", "DOSSIER", "NDA",
                      "force_term", "force_regex", "FINESS"}:
            # Special handling for DATE_NAISSANCE: extract the bare date and
            # generate every separator variation
            if h.kind == "DATE_NAISSANCE":
                # Extract the bare date (DD/MM/YYYY or DD/MM/YY)
                date_match = re.search(r'(\d{1,2})[/.\-\s]+(\d{1,2})[/.\-\s]+(\d{2,4})', h.original)
                if date_match:
                    day, month, year = date_match.groups()
                    # Normalise components (zero-pad if needed)
                    day = day.zfill(2)
                    month = month.zfill(2)
                    # Generate all separator variations
                    date_variations = [
                        f"{day}/{month}/{year}",
                        f"{day}.{month}.{year}",
                        f"{day}-{month}-{year}",
                        f"{day} {month} {year}",
                    ]
                    for var in date_variations:
                        _global_pii.setdefault(h.kind, set()).add(var)
                else:
                    # Fallback: add as-is if no date pattern matched
                    _global_pii.setdefault(h.kind, set()).add(h.original.strip())
            else:
                _global_pii.setdefault(h.kind, set()).add(h.original.strip())

    # Propagate ONLY critical PII (avoids the 951 FP from the other types)
    for kind, values in _global_pii.items():
        if kind not in _CRITICAL_PII_TYPES:
            continue  # Skip non-critical PII (TEL, ADRESSE, etc.)

        placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"])
        for val in values:
            if not val or len(val) < 3:  # Skip values that are too short
                continue
            anon.audit.append(PiiHit(page=-1, kind=f"{kind}_GLOBAL", original=val, placeholder=placeholder))

    log.info("Propagation globale sélective : %d types critiques propagés",
             sum(1 for k in _global_pii.keys() if k in _CRITICAL_PII_TYPES))

    # 4e) Apply global tokens to the pseudonymised text
    _GLOBAL_SKIP_KINDS = {"EDS_DATE_GLOBAL"}
    for h in anon.audit:
        if h.page != -1:
            continue
        if not (h.kind == "NOM_GLOBAL" or h.kind.endswith("_GLOBAL")):
            continue
        if h.kind in _GLOBAL_SKIP_KINDS:
            continue
        token = h.original.strip()
        if not token or len(token) < 4:
            continue
        # Trackare guard: very short NOM_GLOBAL (<=4) risks masking
        # diagnostic codes/acronyms
        if anon.is_trackare and h.kind == "NOM_GLOBAL" and len(token) <= 4:
            continue

        try:
            # Special handling for DATE_NAISSANCE_GLOBAL: tolerate format
            # variations and the "Né(e) le" context
            if h.kind == "DATE_NAISSANCE_GLOBAL":
                # Extract date components (DD/MM/YYYY or variations)
                date_match = re.search(r'(\d{1,2})[/.\-\s]+(\d{1,2})[/.\-\s]+(\d{2,4})', token)
                if date_match:
                    day, month, year = date_match.groups()
                    # Flexible pattern accepting every separator:
                    # [\s/.\-]+ matches space, slash, dot, dash (one or more)
                    date_pattern = rf'{day}[\s/.\-]+{month}[\s/.\-]+{year}'

                    # Multi-pass replacement to cover all cases
                    # Pass 1: with "Né(e) le" context (case-insensitive)
                    final_text = re.sub(
                        rf'Né(?:e)?\s+le\s+{date_pattern}',
                        h.placeholder,
                        final_text,
                        flags=re.IGNORECASE
                    )
                    # Pass 2: without context (bare date)
                    final_text = re.sub(
                        rf'\b{date_pattern}\b',
                        h.placeholder,
                        final_text,
                        flags=re.IGNORECASE
                    )
                continue

            # Special handling for force_term: case-insensitive replacement
            # with word boundaries
            if h.kind == "force_term_GLOBAL":
                # Escape special characters while keeping flexibility
                pat = re.escape(token)
                final_text = re.sub(rf'\b{pat}\b', h.placeholder, final_text, flags=re.IGNORECASE)
                continue

            # Standard handling for the other types
            pat = re.escape(token)
            # Compound names: tolerate line breaks/spaces around the dash
            if "-" in token:
                pat = pat.replace(r"\-", r"\-\s*")
            # Dates: tolerate separator variations
            if "/" in token or "." in token:
                pat = pat.replace(r"\.", r"[\s/.\-]").replace(r"\/", r"[\s/.\-]")

            final_text = re.sub(rf"\b{pat}\b", h.placeholder, final_text, flags=re.IGNORECASE)
        except re.error:
            final_text = final_text.replace(token, h.placeholder)

    # Record OCR usage in the audit trail
    if ocr_used:
        anon.audit.insert(0, PiiHit(page=-1, kind="OCR_USED", original="docTR", placeholder=""))

    # Filter hospital-related false positives
    if _HOSPITAL_FILTER_AVAILABLE:
        try:
            hospital_filter = HospitalFilter()
            original_count = len(anon.audit)

            # Convert PiiHit objects to dicts for the filter
            detections = [
                {
                    'kind': hit.kind,
                    'original': hit.original,
                    'page': hit.page
                }
                for hit in anon.audit
            ]

            # Filter (forwarding the is_trackare flag)
            filtered_detections = hospital_filter.filter_detections(
                detections,
                pdf_path.name,
                is_trackare=anon.is_trackare
            )

            # Rebuild anon.audit from the surviving detections
            filtered_audit = []
            for det in filtered_detections:
                # Find the matching original PiiHit
                for hit in anon.audit:
                    if (hit.kind == det['kind'] and
                            hit.original == det['original'] and
                            hit.page == det['page']):
                        filtered_audit.append(hit)
                        break

            anon.audit = filtered_audit
            filtered_count = original_count - len(anon.audit)

            if filtered_count > 0:
                log.info("Filtre hospitalier : %d faux positifs éliminés", filtered_count)
        except Exception as e:
            log.warning("Erreur lors du filtrage hospitalier : %s", e)

    # Safety: remove any residual [TABLES] block (should no longer happen)
    final_text = re.sub(r"\n*\[TABLES\].*?\[/TABLES\]\n*", "\n", final_text, flags=re.DOTALL)

    # Double-bracket cleanup: [[PLACEHOLDER]] -> [PLACEHOLDER] (artifact when
    # the original PDF already had brackets around the masked value)
    _RE_BRACKET_CLEAN = re.compile(
        r"\[+(\[(?:NOM|TEL|EMAIL|VILLE|ADRESSE|CODE_POSTAL|FINESS|ETABLISSEMENT|MASK|IPP|"
        r"DOSSIER|NDA|EPISODE|RPPS|DATE_NAISSANCE|AGE|NIR|IBAN|OGC)\])\]+"
    )
    final_text = _RE_BRACKET_CLEAN.sub(r"\1", final_text)

    # Output files
    base = pdf_path.stem
    txt_path = out_dir / f"{base}.pseudonymise.txt"
    audit_path = out_dir / f"{base}.audit.jsonl"
    txt_path.write_text(final_text, encoding="utf-8")

    # Drop global-propagation entries (page=-1) before writing the audit file:
    # they drive text replacement but are not real detections
    audit_for_file = [hit for hit in anon.audit if hit.page != -1]

    with audit_path.open("w", encoding="utf-8") as f:
        for hit in audit_for_file:
            f.write(json.dumps(hit.__dict__, ensure_ascii=False) + "\n")
    outputs = {"text": str(txt_path), "audit": str(audit_path)}

    # Redacted PDFs (optional; vector redaction failures are non-fatal)
    if make_vector_redaction and fitz is not None:
        vec_path = out_dir / f"{base}.redacted_vector.pdf"
        try:
            redact_pdf_vector(pdf_path, anon.audit, vec_path, ocr_word_map=ocr_word_map)
            outputs["pdf_vector"] = str(vec_path)
        except Exception:
            pass
    if also_make_raster_burn and fitz is not None:
        ras_path = out_dir / f"{base}.redacted_raster.pdf"
        redact_pdf_raster(pdf_path, anon.audit, ras_path, ogc_label=ogc_label, ocr_word_map=ocr_word_map)
        outputs["pdf_raster"] = str(ras_path)
    return outputs
|
||
|
||
|
||
def process_pdfs_batch(
    pdf_paths: List[Path],
    out_dir: Path,
    max_workers: Optional[int] = None,
    **kwargs,
) -> List[Dict[str, str]]:
    """Process several PDFs in parallel with a ProcessPoolExecutor.

    Only works when ner_manager=None (NER models are not picklable). When
    NER is active, PDFs stay sequential but still benefit from the
    page-level parallelisation in redact_pdf_raster().

    Args:
        pdf_paths: PDFs to process; an empty list short-circuits to [].
        out_dir: output directory, created before dispatching.
        max_workers: pool size; defaults to min(len(pdf_paths), cpu count).
        **kwargs: forwarded verbatim to process_pdf(); must be picklable.

    Returns:
        One process_pdf() output mapping per input, in input order.
    """
    if not pdf_paths:
        return []
    if max_workers is None:
        max_workers = min(len(pdf_paths), os.cpu_count() or 4)
    out_dir.mkdir(parents=True, exist_ok=True)

    # BUG FIX: the previous implementation submitted a locally defined closure
    # to the process pool; local functions cannot be pickled, so pool.map()
    # raised PicklingError on every non-empty batch. functools.partial over
    # the module-level process_pdf IS picklable (provided kwargs are).
    from functools import partial
    worker = partial(process_pdf, out_dir=out_dir, **kwargs)

    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(worker, pdf_paths))
|
||
|
||
|
||
if __name__ == "__main__":
    # Minimal CLI wrapper around process_pdf().
    import argparse

    cli = argparse.ArgumentParser(description="Anonymiser PDF (regex + NER ONNX optionnel)")
    cli.add_argument("pdf", type=str)
    cli.add_argument("--out", type=str, default="out")
    cli.add_argument("--no-vector", action="store_true")
    cli.add_argument("--raster", action="store_true")
    cli.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
    cli.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
    cli.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
    opts = cli.parse_args()

    # Load the optional NER model only when requested and importable.
    ner_mgr = None
    if opts.hf and NerModelManager is not None:
        ner_mgr = NerModelManager(cache_dir=Path("models"))
        ner_mgr.load(opts.model)

    thresholds = NerThresholds() if NerThresholds else None
    result_paths = process_pdf(
        Path(opts.pdf),
        Path(opts.out),
        make_vector_redaction=not opts.no_vector,
        also_make_raster_burn=opts.raster,
        config_path=Path(opts.config),
        use_hf=bool(opts.hf),
        ner_manager=ner_mgr,
        ner_thresholds=thresholds,
    )
    print(json.dumps(result_paths, indent=2, ensure_ascii=False))
|