The establishment-type pattern used [Cc]entre\s+[Hh]ospitalier: only the first letter of each word could be either case, the rest had to be lowercase. "CENTRE HOSPITALIER COTE BASQUE" (all caps) slipped through → compensated by the YAML force_mask_regex "Centre\s+Hospitalier\s+…".

Fix: apply (?i:…) locally (scoped case-insensitivity) to the "establishment type" and "determiner" (de, du, la…) sub-patterns while keeping the proper noun strict (leading uppercase letter required). Avoids FPs while still capturing all-caps spellings.

Validated cases:
- "Centre Hospitalier de Bayonne" → match (unchanged)
- "CENTRE HOSPITALIER COTE BASQUE" → match (new)
- "POLYCLINIQUE CÔTE BASQUE SUD" → match (new)
- "CLINIQUE SAINT-JEAN" → match (new)
- "examen hôpital de Bordeaux" → no match (exclusion preserved)

Stripped-YAML test: CENTRE HOSPITALIER and COTE BASQUE are now masked by ETAB (regex/AC) instead of force_term. After this fix + Fix #4, the "Centre\s+Hospitalier…" and "Polyclinique…" regexes can be removed from the YAML.

Non-regression: 122 hits on trackare-18007562 with the full YAML.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Anonymization core (v2.1) + ONNX NER (optional, narrative text only)
------------------------------------------------------------------------
- Two-pass extraction (pdfplumber -> pdfminer) + third-pass PyMuPDF fallback when the text is poor or contains (cid:xx)
- Regex rules (critical PII) + key:value (mask the value only) + YAML overrides
- **Selective** safety rescan (EMAIL/TEL/IBAN/NIR), never inside [TABLES]
- PDF redaction (vector/raster) via PyMuPDF
- **Optional** ONNX NER (CamemBERT family), applied **after** the rules, on the narrative text

Dependencies: pdfplumber, pdfminer.six, pillow, pymupdf, pyyaml (optional), transformers, optimum, onnxruntime
"""
from __future__ import annotations
import io
import json
import logging
import os
import re
import sys
from concurrent.futures import ProcessPoolExecutor
from dataclasses import dataclass, field
from pathlib import Path
from typing import List, Dict, Tuple, Optional, Any

log = logging.getLogger(__name__)

# {page_idx: [(word_text, x0_norm, y0_norm, x1_norm, y1_norm), ...]}
# Coordinates normalized to 0→1 (docTR's native word.geometry format)
OcrWordMap = Dict[int, List[Tuple[str, float, float, float, float]]]
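# Illustrative example of the structure above (hypothetical values): one word
# "Dupont" detected near the top-left corner of page 0 would be stored as:
#   {0: [("Dupont", 0.12, 0.05, 0.24, 0.08)]}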

import pdfplumber
from pdfminer.high_level import extract_text as pdfminer_extract_text
from pdfminer.layout import LAParams
from PIL import Image, ImageDraw

try:
    import fitz  # PyMuPDF
except Exception:
    fitz = None

try:
    import yaml  # PyYAML for dictionaries
except Exception:
    yaml = None

try:
    from doctr.models import ocr_predictor as _doctr_ocr_predictor
    _DOCTR_AVAILABLE = True
except Exception:
    _doctr_ocr_predictor = None  # type: ignore
    _DOCTR_AVAILABLE = False

try:
    from detectors.hospital_filter import HospitalFilter
    _HOSPITAL_FILTER_AVAILABLE = True
except Exception:
    _HOSPITAL_FILTER_AVAILABLE = False
    HospitalFilter = None  # type: ignore

# NER manager (optional)
try:
    from ner_manager_onnx import NerModelManager, NerThresholds
except Exception:
    NerModelManager = None  # type: ignore
    NerThresholds = None  # type: ignore

# EDS-Pseudo manager (optional)
try:
    from eds_pseudo_manager import EdsPseudoManager
except Exception:
    EdsPseudoManager = None  # type: ignore

# VLM manager (optional)
try:
    from vlm_manager import VlmManager
except Exception:
    VlmManager = None  # type: ignore


def _load_edsnlp_drug_names() -> set:
    """Load single-word medication names from edsnlp/resources/drugs.json.
    Returns a lowercase set. Fails silently (empty set) if edsnlp is absent."""
    try:
        import edsnlp as _edsnlp
        drugs_path = _edsnlp.BASE_DIR / "resources" / "drugs.json"
        if not drugs_path.exists():
            return set()
        import json as _json
        data = _json.loads(drugs_path.read_text(encoding="utf-8"))
        result = set()
        for _code, names in data.items():
            for name in names:
                if " " not in name and len(name) >= 4:
                    result.add(name.lower())
        return result
    except Exception:
        return set()


def _load_bdpm_medication_names() -> set:
    """Load medication names from the BDPM database (data/bdpm/medication_names.txt).
    Returns a lowercase set. ~5700 commercial names and INNs."""
    bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medication_names.txt"
    if not bdpm_path.exists():
        return set()
    try:
        names = set()
        for line in bdpm_path.read_text(encoding="utf-8").splitlines():
            w = line.strip()
            if w and len(w) >= 3:
                names.add(w.lower())
        return names
    except Exception:
        return set()


# ----------------- INSEE gazetteers (first names + communes + family names) -----------------
# First names and family names are used in two forms:
# - _INSEE_PRENOMS (lowercase): fast check "w.lower() in _INSEE_PRENOMS"
# - _INSEE_PRENOMS_SET (uppercase, accents stripped via NFD decomposition): NER cross-validation
# A single file read feeds both sets (previously: 2 disk passes over the same
# prenoms_france.txt file, a leftover of the NER-first refactoring).
_INSEE_PRENOMS: set = set()  # lowercase
_INSEE_PRENOMS_SET: set = set()  # uppercase, accents stripped
_INSEE_COMMUNES: set = set()  # uppercase
_INSEE_NOMS_FAMILLE: set = set()  # uppercase, accents stripped


def _normalize_nfkd_upper(s: str) -> str:
    """Strip accents and uppercase (for INSEE matching)."""
    import unicodedata
    return "".join(
        c for c in unicodedata.normalize("NFD", s)
        if unicodedata.category(c) != "Mn"
    ).upper()
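# Illustrative check (assumed behavior, not a test fixture): NFD decomposition
# splits each accented letter into base letter + combining mark, the mark
# (category Mn) is dropped, and the result is uppercased:
#   _normalize_nfkd_upper("Héloïse-Marie")  -> "HELOISE-MARIE"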


def _load_insee_gazetteers():
    """Load the INSEE gazetteers in a single pass per file.
    Feeds _INSEE_PRENOMS (lowercase) and _INSEE_PRENOMS_SET (uppercase, accents stripped)
    from the same prenoms_france.txt file."""
    global _INSEE_PRENOMS, _INSEE_PRENOMS_SET, _INSEE_COMMUNES, _INSEE_NOMS_FAMILLE
    data_dir = Path(__file__).parent / "data" / "insee"

    # First names: single read, two derived forms
    prenoms_path = data_dir / "prenoms_france.txt"
    if prenoms_path.exists():
        try:
            prenoms_lc = set()
            prenoms_nfkd = set()
            for line in prenoms_path.read_text(encoding="utf-8").splitlines():
                raw = line.strip()
                if raw and len(raw) >= 3:
                    prenoms_lc.add(raw.lower())
                    prenoms_nfkd.add(_normalize_nfkd_upper(raw))
            _INSEE_PRENOMS = prenoms_lc
            _INSEE_PRENOMS_SET = prenoms_nfkd
            log.info(f"Gazetteers INSEE prénoms: {len(_INSEE_PRENOMS)} entrées "
                     f"(lowercase + uppercase-nfkd)")
        except Exception as e:
            log.warning(f"Erreur chargement prénoms INSEE: {e}")

    # Communes (uppercase, >= 3 chars)
    communes_path = data_dir / "communes_france.txt"
    if communes_path.exists():
        try:
            _INSEE_COMMUNES = {
                line.strip().upper() for line in communes_path.read_text(encoding="utf-8").splitlines()
                if line.strip() and len(line.strip()) >= 3
            }
            log.info(f"Gazetteers INSEE communes: {len(_INSEE_COMMUNES)} entrées")
        except Exception as e:
            log.warning(f"Erreur chargement communes INSEE: {e}")

    # Family names (uppercase, accents stripped)
    noms_path = data_dir / "noms_famille_france.txt"
    if noms_path.exists():
        try:
            _INSEE_NOMS_FAMILLE = {
                _normalize_nfkd_upper(line.strip())
                for line in noms_path.read_text(encoding="utf-8").splitlines()
                if line.strip() and len(line.strip()) >= 3
            }
            log.info(f"Gazetteers INSEE noms de famille: {len(_INSEE_NOMS_FAMILLE)} entrées")
        except Exception as e:
            log.warning(f"Erreur chargement noms de famille INSEE: {e}")


_load_insee_gazetteers()


# ----------------- FINESS gazetteer (healthcare establishments) -----------------
_FINESS_NUMBERS: set = set()  # 9-digit FINESS numbers (structure + legal entity)
_FINESS_TELEPHONES: set = set()  # 10-digit phone numbers
_FINESS_VILLES: set = set()  # FINESS cities (uppercase)
_FINESS_AC = None  # Aho-Corasick automaton for distinctive establishment names
_FINESS_ADDR_AC = None  # Aho-Corasick automaton for addresses (street names)
_VILLE_AC = None  # Aho-Corasick automaton for cities (INSEE + FINESS)

# Communes that are too ambiguous (homonyms of common words, too short, etc.)
_VILLE_BLACKLIST = {
    # Directions / generic geographic words
    "SAINT", "NORD", "SUD", "EST", "OUEST",
    "CENTRE", "SERVICE", "BOURG",
    # Communes that are homonyms of common French words
    "ORANGE", "TOURS", "NICE", "SENS", "VITRE",
    "ROMANS", "MENTON", "SALON", "VIENNE",
    "BREST",  # too short and ambiguous
    "HYERES",  # close to medical terms
    "AGEN", "AUCH", "ALBI",
    "BLOIS", "LAON", "LENS",
    "GIEN", "GRAY",
    "AIRE", "LURE", "SETE", "DOLE",
    "VIRE", "LUNEL", "MURET", "MORET",
    "COEUR", "FOIX", "GIVET",
    "EVIAN", "MAURE", "MENDE",
    "JOUE", "MEAUX", "REDON",
    "CREIL", "CERGY",
    # 4-5 letter communes that are homonyms of very common words
    "VERS", "MONT", "MARS", "PORT", "PONT", "FORT",
    "BOIS", "ISLE", "LACS", "MURS", "OUST", "PREY",
    "VAUX", "VERT", "FAUX", "REZE",
    "BILLE", "PLACE", "VILLE", "COURS", "GRAND",
    "ROUGE", "RICHE", "NUITS", "SORE", "SARE",
    "TRANS", "RANS", "MARSA",
    # Common French words (6+ letters) that are also communes
    "CHARGE", "SIGNES", "BARRES", "FOSSES", "GARDES",
    "MARCHE", "LIGNES", "MOULIN", "PIERRE", "CHAISE",
    "SOURCE", "VALLEE", "MAISON", "BEAUNE", "CORPS",
    "PUITS", "CROIX", "LIGNE", "QUATRE", "PRISON",
    # Very common first names (also communes)
    "MARIE", "PIERRE", "JEAN", "PAUL", "ANNE",
    # Ambiguous compound expressions (also INSEE communes)
    "LONG", "RECY", "PLAN", "MARCHE", "SALLE",
    "CONTRE", "MERE", "ONDRES", "VEBRE",
    # Structural / medical words
    "PARIS",  # ubiquitous, a source of false positives
    "FRANCE", "EUROPE",
    # Ambiguous terms (also INSEE communes) - trackare/DPI
    "COURANT",  # "Médecin courant" ≠ city
    # Body parts that are homonyms of communes (FP "prurit invalidant (COU, décolleté)")
    "COU", "DOS", "SEIN", "BRAS",
}
# Enrichment from an external file (editable without touching the code)
_villes_bl_file = Path(__file__).parent / "data" / "villes_blacklist.txt"
if _villes_bl_file.exists():
    try:
        for _line in _villes_bl_file.read_text(encoding="utf-8").splitlines():
            _w = _line.strip()
            if _w and not _w.startswith("#"):
                _VILLE_BLACKLIST.add(_w)
        log.info("Villes blacklist chargées : %d entrées", len(_VILLE_BLACKLIST))
    except Exception as _exc:
        log.error("Villes blacklist : erreur de lecture %s — %s", _villes_bl_file, _exc)
else:
    log.warning("Villes blacklist : fichier introuvable %s — défauts intégrés utilisés", _villes_bl_file)

try:
    import ahocorasick as _ahocorasick
    _AHO_AVAILABLE = True
except ImportError:
    _ahocorasick = None
    _AHO_AVAILABLE = False
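# Minimal sketch (illustrative only) of how an automaton like _FINESS_AC can be
# built and queried with the pyahocorasick API over keys normalized by
# _normalize_for_matching() (defined just below). The helper name and the
# payload tuple are assumptions, not this module's actual deferred loader
# (the real one is built over etablissements_distinctifs.txt, see
# _load_finess_gazetteers):
#
#     def _build_name_automaton(names):
#         ac = _ahocorasick.Automaton()
#         for name in names:
#             key = _normalize_for_matching(name)
#             ac.add_word(key, ("ETAB", key))  # payload: (kind, matched key)
#         ac.make_automaton()
#         return ac
#
#     # ac.iter(text) yields (end_index, payload) for every occurrence;
#     # the match start is end_index - len(key) + 1.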

def _normalize_for_matching(s: str) -> str:
    """Normalize for gazetteer matching: lowercase, accents stripped, whitespace collapsed."""
    import unicodedata
    s = s.lower().strip()
    s = unicodedata.normalize("NFD", s)
    s = "".join(c for c in s if unicodedata.category(c) != "Mn")
    s = re.sub(r"[^a-z0-9\s\-]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    return s
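# Illustrative examples (assumed behavior): punctuation becomes whitespace,
# accents are stripped, space runs collapse; hyphens are preserved:
#   _normalize_for_matching("Côte  Basque !")  -> "cote basque"
#   _normalize_for_matching("Saint-Étienne")   -> "saint-etienne"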


def _load_finess_gazetteers():
    """Load the FINESS gazetteers (numbers, phone numbers, cities, Aho-Corasick)."""
    global _FINESS_NUMBERS, _FINESS_TELEPHONES, _FINESS_VILLES, _FINESS_AC
    data_dir = Path(__file__).parent / "data" / "finess"

    # FINESS numbers
    finess_path = data_dir / "finess_numbers.txt"
    if finess_path.exists():
        try:
            _FINESS_NUMBERS = {
                line.strip() for line in finess_path.read_text(encoding="utf-8").splitlines()
                if line.strip()
            }
            log.info(f"Gazetteer FINESS numéros: {len(_FINESS_NUMBERS)} entrées")
        except Exception as e:
            log.warning(f"Erreur chargement FINESS numéros: {e}")

    # etablissements_noms.txt is deliberately NOT loaded — it is used only for
    # debug/inspection. Name matching goes through the Aho-Corasick automaton
    # built over etablissements_distinctifs.txt (deferred loading).

    # FINESS cities
    villes_path = data_dir / "villes_finess.txt"
    if villes_path.exists():
        try:
            _FINESS_VILLES = {
                line.strip() for line in villes_path.read_text(encoding="utf-8").splitlines()
                if line.strip() and len(line.strip()) >= 3
            }
            log.info(f"Gazetteer FINESS villes: {len(_FINESS_VILLES)} entrées")
        except Exception as e:
            log.warning(f"Erreur chargement FINESS villes: {e}")

    # Phone numbers (for validation)
    tel_path = data_dir / "telephones.txt"
    if tel_path.exists():
        try:
            _FINESS_TELEPHONES = {
                line.strip() for line in tel_path.read_text(encoding="utf-8").splitlines()
                if line.strip()
            }
            log.info(f"Gazetteer FINESS téléphones: {len(_FINESS_TELEPHONES)} entrées")
        except Exception as e:
            log.warning(f"Erreur chargement FINESS téléphones: {e}")

_load_finess_gazetteers()


# ----------------- Medical whitelists -----------------
_MEDICAL_STRUCTURAL_TERMS = set()
_MEDICATION_WHITELIST = set()

def load_medical_whitelists():
    """Load the medical whitelists (structural terms + medications)."""
    global _MEDICAL_STRUCTURAL_TERMS, _MEDICATION_WHITELIST

    # 1. Load the structural medical terms
    config_path = Path("config/medical_terms_whitelist.yml")
    if config_path.exists() and yaml:
        try:
            with open(config_path, 'r', encoding='utf-8') as f:
                data = yaml.safe_load(f)
            terms = data.get('medical_structural_terms', [])
            _MEDICAL_STRUCTURAL_TERMS = {t.lower() for t in terms}
            log.info(f"Whitelist termes médicaux chargée: {len(_MEDICAL_STRUCTURAL_TERMS)} termes")
        except Exception as e:
            log.warning(f"Erreur chargement whitelist médicale: {e}")

    # 2. Load the medication whitelist (edsnlp + BDPM + manual additions)
    _MEDICATION_WHITELIST = _load_edsnlp_drug_names()
    _MEDICATION_WHITELIST.update(_load_bdpm_medication_names())
    # Add missing medications
    additional_meds = {
        "idacio", "salazopyrine", "infliximab", "apranax",
        "ketoprofene", "prevenar", "pneumovax", "bétadine"
    }
    _MEDICATION_WHITELIST.update(additional_meds)
    log.info(f"Whitelist médicaments chargée: {len(_MEDICATION_WHITELIST)} médicaments (edsnlp+BDPM)")

# Load the whitelists at module start-up
load_medical_whitelists()


# ----------------- Defaults & Config -----------------
DEFAULTS_CFG = {
    "version": 1,
    "encoding": "utf-8",
    "normalization": "NFKC",
    "whitelist": {
        "sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
        "noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
        "org_gpe_keep": False,
    },
    "blacklist": {
        "force_mask_terms": [],
        "force_mask_regex": [],
    },
    "kv_labels_preserve": ["FINESS", "IPP", "N° OGC", "Etablissement"],
    "regex_overrides": [
        {
            "name": "OGC_court",
            "pattern": r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,3})\b",
            "placeholder": "[OGC]",
            "flags": ["IGNORECASE"],
        }
    ],
    "flags": {
        "case_insensitive": True,
        "unicode_word_boundaries": True,
        "regex_engine": "python",
    },
}
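# Illustrative YAML override (hypothetical file contents, passed to
# load_dictionaries() below; top-level keys replace the defaults above wholesale):
#
#     blacklist:
#       force_mask_terms: ["COTE BASQUE"]        # hypothetical value
#       force_mask_regex: []
#     additional_stopwords: ["evendol"]          # hypothetical value
#     additional_villes_blacklist: ["USTARITZ"]  # hypothetical value
#     whitelist_phrases: ["Maison de Santé"]     # hypothetical value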

PLACEHOLDERS = {
    "EMAIL": "[EMAIL]",
    "TEL": "[TEL]",
    "IBAN": "[IBAN]",
    "NIR": "[NIR]",
    "IPP": "[IPP]",
    "FINESS": "[FINESS]",
    "OGC": "[OGC]",
    "NOM": "[NOM]",
    "VILLE": "[VILLE]",
    "ETAB": "[ETABLISSEMENT]",
    "MASK": "[MASK]",
    "DATE": "[DATE]",
    "DATE_NAISSANCE": "[DATE_NAISSANCE]",
    "ADRESSE": "[ADRESSE]",
    "CODE_POSTAL": "[CODE_POSTAL]",
    "AGE": "[AGE]",
    "DOSSIER": "[DOSSIER]",
    "NDA": "[NDA]",
    "EPISODE": "[EPISODE]",
    "RPPS": "[RPPS]",
}

CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"}

# Baseline regex
RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
RE_URL = re.compile(r"https?://[A-Za-z0-9._~:/?#\[\]@!$&'()*+,;=\-%]+", re.IGNORECASE)
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?(?:\(0\))?\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)")
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b")
RE_IPP = re.compile(r"\b(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
RE_CSULT = re.compile(r"\b(?:N°\s*Csult|N°\s*Interv)\s*[:\-]?\s*(\d{6,})\b", re.IGNORECASE)
RE_FINESS = re.compile(r"\b(?:N°\s*)?FINESS?\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
RE_RPPS = re.compile(r"\b(?:N°\s*)?RPPS\s*[:\-]?\s*(\d{8,11})\b", re.IGNORECASE)
RE_NIR = re.compile(
    r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2]|2[AB])\s*(\d{2,3})\s*(\d{3})\s*(\d{3})\s*(\d{2})\b",
    re.IGNORECASE,
)


def validate_nir(nir_raw: str) -> bool:
    """Check the modulo-97 key of a NIR (13 digits + 2-digit key). Supports Corsica (2A/2B)."""
    digits_only = re.sub(r"\s+", "", nir_raw)
    if len(digits_only) < 15:
        return False
    body_str = digits_only[:13]
    key_str = digits_only[13:15]
    # Corsica: 2A → 19, 2B → 18 (for the computation)
    body_str_calc = body_str.upper().replace("2A", "19").replace("2B", "18")
    try:
        body_int = int(body_str_calc)
        key_int = int(key_str)
    except ValueError:
        return False
    return key_int == (97 - (body_int % 97))
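# Illustrative doctest-style check with a synthetic (non-attributed) NIR:
# the body 1850578006084 mod 97 is 6, so the expected key is 97 - 6 = 91.
#   validate_nir("1 85 05 78 006 084 91")  -> True
#   validate_nir("1 85 05 78 006 084 90")  -> False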

# Medical/technical/common words that are not person names
_MEDICAL_STOP_WORDS_SET = {
    # Common French words (determiners, prepositions, adverbs, etc.)
    "pas", "mon", "bien", "ancien", "ancienne", "bon", "bonne", "tout", "tous",
    "mais", "donc", "car", "que", "qui", "avec", "dans", "pour", "sur", "par",
    "les", "des", "une", "est", "son", "ses", "nos", "aux", "cette", "ces",
    "cher", "chez", "entre", "sans", "sous", "vers", "selon", "après", "avant",
    "puis", "aussi", "très", "plus", "moins", "peu", "non", "oui", "quelques",
    "mise", "début", "fin", "suite", "fait", "lieu", "cas", "jour", "jours",
    "semaine", "semaines", "mois", "temps", "place", "nouvelle", "nouveau",
    "franche", "légère", "quelque", "depuis", "comme", "encore", "votre",
    "date", "note", "notes", "nom", "heure", "matin", "soir", "midi",
    "signé", "réalisé", "courrier", "cabinet", "rue",
    # Common verbs / participles
    "remontée", "associée", "réalisée", "débuté", "prolongé", "prolongée",
    "prescrit", "prescrite", "présente", "présent", "absente", "absent",
    "reprise", "introduction", "arrêt", "relais",
    # Hospital titles / roles
    "chef", "assistant", "assistante", "praticien", "praticienne",
    "docteur", "professeur", "hospitalier", "hospitalière", "hospitaliers",
    "spécialiste", "contractuel", "contractuelle", "titulaire",
    "confrère", "consoeur", "coordonnateur", "coordonnatrice",
    "médecin", "médical", "infirmier", "infirmière",
    "praticiens", "patient", "patiente",
    # Hospital structure
    "service", "pôle", "clinique", "consultation", "secrétariat",
    "hôpital", "hôpitaux", "centre", "établissement", "polyclinique",
    # Cities / geography (not person names)
    "bordeaux", "bayonne", "paris", "lyon", "lille", "marseille",
    "toulouse", "nantes", "montpellier", "pessac", "biarritz", "soustons",
    "basque", "basques", "sud", "côte",
    # Generic medications and branded products (INNs + commercial names)
    "colchicine", "aspirine", "cortancyl", "bisoprolol", "entresto",
    "methotrexate", "eplerenone", "speciafoldine", "prednisone",
    "corticoïdes", "cortisone",
    "paracetamol", "metformine", "solupred", "novorapid", "abasaglar",
    "lovenox", "methylprednisolone", "potassium", "humalog", "furosemide",
    "insuline", "trulicity", "forxiga", "atorvastatine", "amlodipine",
    "ondansetron", "eliquis", "nebivolol", "gaviscon", "loxen",
    "morphine", "oxycodone", "kardegic", "tercian", "zopiclone",
    "seresta", "tramadol", "alprazolam", "forlax", "levothyrox",
    "bromazepam", "gliclazide", "zymad", "pravastatine", "spiriva",
    "quetiapine", "sertraline", "crestor", "lercanidipine", "amoxicilline",
    "opocalcium", "ferinject", "candesartan", "ceftriaxone", "calcidose",
    "laroxyl", "brintellix", "ketoprofene", "adrenaline", "exacyl",
    "terbutaline", "ipratropium", "actiskenan", "vialebex", "oxynormoro",
    "lansoprazole", "perindopril", "sodium", "velmetia",
    "doliprane", "dafalgan", "efferalgan", "spasfon", "vogalene",
    "augmentin", "inexium", "omeprazole", "pantoprazole", "esomeprazole",
    "ramipril", "lisinopril", "enalapril", "losartan", "valsartan",
    "irbesartan", "olmesartan", "telmisartan", "hydrochlorothiazide",
    "spironolactone", "furosemide", "lasilix", "aldactone",
    "tahor", "crestor", "rosuvastatine", "simvastatine", "fluvastatine",
    "xarelto", "pradaxa", "apixaban", "rivaroxaban", "dabigatran",
    "plavix", "clopidogrel", "ticagrelor", "brilique",
    "ventoline", "seretide", "symbicort", "salmeterol", "fluticasone",
    "salbutamol", "tiotropium", "budesonide", "beclometasone",
    "oxycodone", "oxynorm", "skenan", "actiskenan", "fentanyl",
    "nubain", "nalbuphine", "nefopam", "acupan", "profenid",
    "ibuprofene", "diclofenac", "naproxene", "celecoxib",
    "gabapentine", "pregabaline", "lyrica", "neurontin",
    "amitriptyline", "duloxetine", "venlafaxine", "fluoxetine",
    "paroxetine", "escitalopram", "citalopram", "mirtazapine",
    "olanzapine", "risperidone", "aripiprazole", "haloperidol",
    "loxapine", "cyamemazine", "diazepam", "oxazepam", "lorazepam",
    "clonazepam", "midazolam", "hydroxyzine", "atarax", "melatonine",
    "stilnox", "zolpidem", "imovane",
    "levothyroxine", "metformine", "glimepiride", "sitagliptine",
    "januvia", "jardiance", "empagliflozine", "dapagliflozine",
    "ozempic", "semaglutide", "dulaglutide", "liraglutide", "victoza",
    "heparine", "enoxaparine", "tinzaparine", "innohep",
    "warfarine", "coumadine", "fluindione", "previscan",
    "ciprofloxacine", "levofloxacine", "ofloxacine", "metronidazole",
    "vancomycine", "gentamicine", "tazocilline", "piperacilline",
    "meropenem", "imipenem", "clindamycine", "doxycycline",
    "azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
    "polyionique", "propranolol", "apidra", "solostar",
    # Pharmaceutical lab names and suffixes
    "arw", "myl", "myp", "arg", "teva", "bga", "agt",
    "mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers",
    "accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed",
    "evolugen", "alter", "zydus", "medisol", "substipharm",
    "sdz", "bgr", "egt", "rnb",
    # Galenic forms / administration routes
    "cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
    "flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
    "unidose", "perf", "inh", "seringue", "aerosol", "sach", "pdr",
    "orodisp", "capsule", "patch", "suppositoire", "gouttes",
    # Prescription / pharmacy terms
    "prescription", "prescriptions", "dose", "fréquence", "statut",
    "technique", "capteur", "bandelettes", "glycemiques", "glycemique",
    "lancettes", "aiguilles", "fines", "micro", "pompe", "réserve",
    "glycemie", "capillaire", "hgt",
    # Medical / clinical terms
    "myocardite", "myosite", "corticothérapie", "biopsie", "pathologie",
    "dysimmunitaire", "récidive", "récidivante", "traitement", "diagnostic",
    "antécédents", "examen", "bilan", "résultats", "analyse",
    "interne", "externe", "médecine", "chirurgie", "rhumatologie",
    "dermatologie", "immunologie", "cardiologie", "pneumologie",
    "neurologie", "gynécologie", "radiologie", "sénologie",
    "douleur", "douleurs", "douloureux", "musculaire", "musculaires",
    "thoracique", "thoraciques", "membres", "supérieurs", "inférieurs",
    "normale", "normaux", "habituelle", "habituelles",
    "synthèse", "hospitalisation", "syndrome", "vaccination", "ophtalmo",
    "pelvien", "diabétique", "sommeil", "régime", "diet",
    "desinfection", "environnement", "identification", "bracelet",
    "toilettes", "accompagner", "installer", "transfusion",
    "signes", "vitaux", "alimentaire", "avis", "zone",
    "calcémie",
    # Medical abbreviations
    "irm", "ett", "ecg", "mtx", "fevg", "bdc", "crp", "sfu", "hdj",
    "bnp", "asat", "alat", "cpk", "ctc", "hba", "hba1c",
    "saos", "tsh", "inr", "vgm", "pnn", "plq", "hb",
    "poc", "bax", "act", "bic", "cfx", "acc", "ado", "acf", "vfo",
    "qvl", "cci", "pse", "pca", "chl", "crt", "bbm", "pds", "ren",
    "vit", "zen",
    "scanner", "radio", "écho", "échographie",
    # Medical specialties (avoid NOM false positives)
    "hépato-gastro-entérologue", "gastro-entérologue", "gastro-entérologie",
    "proctologue", "oncologue", "anesthésiste", "pneumologue", "gérontologue",
    "cardiologue", "néphrologue", "urologue", "gériatre",
    "hépatologue", "endocrinologue", "stomatologue",
    # Medical terms / titles frequently detected as NOM by the NER
    "supplémentation", "supplementation", "endocrinologie", "monsieur", "madame",
    "suivi", "sortie", "emog", "ophtalmo",
    # Medications detected as NOM/PRENOM by EDS-Pseudo
    "eliquis", "trulicity", "saos", "wind", "taxotere", "eupantol", "ezetimibe",
    "lansoyl", "xatral", "xenetix", "trimbow", "buspirone", "cetirizine",
    "depakote", "versatis", "durogesic", "montelukast", "metformine", "viatris",
    "rosuvastatine", "gliclazide", "amlodipine", "perindopril", "nebivolol",
    "pravastatine", "bisoprolol", "amoxicilline", "kardegic", "lovenox",
    # Medical / care / procedure terms detected as NOM
    "partielle", "cutanee", "cutané", "cutanée", "osseuse", "diabetique",
    "diabétique", "transdermique", "transderm", "diarrhees", "diarrhées",
    "ionogramme", "scintigraphie", "thoraco", "thorax", "négative", "negative",
    "diététicienne", "pressurise", "pressuriser", "inhalee", "inhalée", "inhal",
    # Common French words detected as NOM in trackare documents
    "toilette", "repas", "poche", "installation", "education", "éducation",
    "refection", "réfection", "complete", "complète", "regime", "régime",
    "normal", "traité", "traite", "arrêté", "arrete", "volume",
    "commentaires", "france", "covid", "framboise", "epoux", "époux",
    # Short medical abbreviations (3-4 chars) detected as NOM
    "ide", "ipp", "pcr", "tap", "gel", "ahl", "ssr", "hds", "tca", "etp",
    "mcg", "sdz", "iao", "ser", "orod", "clav", "disp", "cart", "atcd", "mdrd",
    "amox", "endoc", "microg", "item", "pyélo", "néphro",
    # Column headers / structural trackare words
    "observations", "observation", "commentaires", "commentaire",
    "surveillance", "température", "temperature", "glycémie", "glycemie",
    "diurèse", "diurese", "balance", "pouls", "systolique", "diastolique",
    "saturation", "fréquence", "frequence", "respiratoire", "douleur",
    "alertes", "alerte", "antécédents", "antecedents", "habitus",
    "allergies", "prescriptions", "prescription", "administration",
    "catégorie", "categorie", "expiration", "message",
    "destination", "diagnostique", "diagnostiques",
    "date", "note", "nom", "heure", "type", "code", "etat",
    "comprime", "comprimé", "gelule", "gélule", "solution", "injectable",
    # Additional medications detected in trackare documents
    "depakote", "versatis", "humalog", "forxiga", "durogesic",
    "montelukast", "rosuvastatine",
    # Short pharma abbreviations
    "cpr", "sol", "bic", "agt", "poche", "inhal",
    # Surgical/clinical FP terms
    "cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée",
    "gauche", "droit", "droite", "face", "profil",
    # Additional EDS false positives
    "psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
    "axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
    "10mg", "20mg", "40mg", "100mg", "300ui", "500ml", "innohep", "coaprovel",
    "actiskenan", "simvastatine", "forlax",
    # Temporal / contextual words detected as EDS_HOPITAL
    "semaine", "jour", "matin", "soir", "nuit", "midi",
    # Document-context keywords
    "compétences", "maladies", "inflammatoires", "systémiques", "rares",
    "fret", "fax", "contexte", "résultat", "resultat", "résultats", "resultats",
    "haute", "maison", "aide", "rpps", "poste", "fonct",
    "sante", "santé", "etxe", "ttipi", "gastro", "concha",
    "endoscopie", "endoscopique", "fibroscopie",
    "indication", "conclusion", "technique", "anesthésie",
    "digestif", "digestive", "digestives", "nutritive",
    # Trackare care abbreviations detected as NOM (batch of 20 OGC)
    "soins", "lit", "jeun", "lever", "pose", "surv", "ggt", "vvp",
    # Care-instruction verbs (also INSEE surnames → FP)
    "coucher", "manger", "marcher", "sortir",
    "verif", "crop", "evs", "maco", "pan", "cet", "trou", "nit", "nfs",
    # CRH narrative words captured by the 2-column sidebar merge
    "evolution", "évolution", "explorations", "fermeture", "allergie", "allergies",
    "lotissement", "cholangiographie", "cholecystectomie", "cholécystectomie",
    "paracetamol", "paracétamol", "unité", "unite",
    # Residual FPs from the 10-OGC batch (medical terms / care instructions)
    "glyc", "glycosurie", "vider", "forte",
    # FP audit, 59-OGC batch (common/medical words flagged as NOM)
    "oncologie", "confrères", "confrere", "doubles", "chers", "motif",
    "responsable", "autre", "autres", "autonome", "autonomes",
    "préparations", "preparations", "prévenir", "prevenir",
    "acétylsalicylique", "acetylsalicylique", "angio",
    "desc", "diu", "barreau",
    "haitz", "alde",
    # FP audit OGC 21 — medical/common terms flagged as NOM_GLOBAL
    "alimentation", "augmentation", "amelioration", "amélioration",
    "biliaire", "biliaires", "bili", "voies", "voie",
    "apyrexie", "apyréxie", "apyrétique", "apyretique",
    "clavulanique", "mecillinam", "sulfamides", "sulfamide",
    "tazobactam", "temocilline", "ecoflac", "furanes", "furane",
    "exilar", "lipruzet", "mopral",
    "sensible", "sensibles", "dossier", "dossiers",
    "entero", "entéro", "medecine", "bio",
    "aviation", "contention", "isolement",
    "elimination", "élimination", "infectieux",
    "hémodynamique", "hemodynamique", "pancréatite", "pancreatite",
    "cholecystite", "cholécystite", "cholécystectomie", "cholecystectomie",
    "appendicectomie", "néoplasie", "neoplasie",
    "ovarienne", "prandial", "fébrile", "febrile",
    "eupnéique", "eupneique", "normocarde", "normotendue",
    "variable", "dosage", "posologie",
    # Trackare dietetics/care abbreviations
    "bcy", "po2", "po1", "po3", "bha", "atg", "ras", "cat", "ass",
    # FP audit OGC 17 CRH
    "mode", "retraitee", "retraité", "retraitée", "régression", "regression", "tel",
    "strasbourg", "bordeaux", "toulouse", "paris", "lyon", "marseille", "bayonne", "anglet",
    "saint-palais", "tarnos", "hendaye", "dax", "orthez", "oloron", "pau", "cambo",
    # Specialties/departments recurring as NOM FPs
    "cancérologie", "cancerologie", "réanimation", "reanimation",
    "urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
    "gériatrie", "geriatrie", "pédiatrie", "pediatrie",
    "ophtalmologie", "stomatologie", "allergologie",
    "kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie",
    "orthopédie", "orthopedie", "traumatologie",
    "palliatifs", "palliative", "palliatif",
    "addictologie", "alcoologie", "tabacologie",
    # Trackare caregiver FPs (common words captured by the "Note d'évolution" / "Signé" / "Flacon" patterns)
    "discussion", "echelle", "échelle", "scope", "tdm", "bouteille",
    "evendol", "relais", "repas", "poursuite", "indication",
    # Timestamp-pattern FPs (ALL-CAPS terms captured by "HH:MM NOM")
    "eliminatin", "elimination", "élimination", "preremplie", "pré-remplie",
    "thermie", "alim", "alimentation", "admin",
    # Medications / lab tests captured by the caregiver patterns
    "biprofenid", "bi-profenid", "phosphatase", "phosphatases",
    "ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol",
    "ciprofloxacine", "lavement", "desinfection", "désinfection",
    "avaler", "rachis", "lombaire", "thoraco-lombaire",
    "cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique",
    "thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire",
    # Dosages and pharma labs (frequent FPs in Trackare prescriptions)
    "faible", "fort", "forte",
    "myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg",
    "arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris",
    "abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal",
    "entree", "entrée", "continu", "continue",
    "morphine", "claforan", "skenan", "actiskenan",
    # Medication-name fragments (pdfplumber split)
    "sium", "pegic", "fenid", "profenid",
    # Trackare clinical categories (section headers wrongly masked)
    "respi", "respiratoire", "nephro", "cardio", "neuro", "onco", "pulmo",
    "hemato", "hémato", "infectieux", "thermie", "diurese", "diurèse",
    "transit", "anemie", "anémie", "constantes", "examen",
    "post-op", "postop", "pré-op", "preop", "chimio", "elim",
    "toilette", "sommeil", "hypota", "hypotension", "spo2",
    "urine", "urines", "sng",
    "rénale", "renale", "rénal", "renal", "cardiaque",
    # Structural trackare terms
    "transmissions", "transmission", "releve", "relevé",
    "objectif", "objectifs", "evaluation", "évaluation",
    "planification", "planifié", "planifiee",
    # ── FPs detected automatically by audit_fp_detector.py ──
    # Batch 2: low-confidence tokens (DICT_FR only) that are clearly not names
    "acide", "adulte", "ambulatoire", "avenue", "bandelette", "bassin",
    "bijoux", "bouche", "bouchon", "changes", "court", "demande", "dessert",
    "devenir", "diffusé", "douche", "entrée", "escarre", "espace",
    "explications", "fauteuil", "feuillet", "fixateur", "fois", "gamma",
    "germes", "glace", "habillage", "liste", "maquillage", "matelas",
    "mettre", "obésité", "ongles", "palais", "perlant", "pertes",
    "pièce", "plaie", "risque", "saint", "sang", "signe", "sonde",
    "tenue", "texte", "transaminases", "transit", "transmis", "urinal",
    "vernis", "vessie", "vrac",
    # Batch 2: medical terms (prefixes/suffixes)
    "anatomo-pathologique", "anemie", "anémie", "angioscanner",
    "cétonurie", "cetonurie", "depilation", "dépilation",
    "folique", "gastroentérologue", "gastroenterologue",
    "microgrammes", "nalidixique", "naso-gastrique",
    "angio-irm", "neuro", "neuro-chirurgie", "endoplasmique",
    "cyto", "plaie-colle", "bionolyte",
    # Batch 1 (103 tokens, confidence >= 0.5) ──
    # Anatomy / clinical
    "abdomen", "aortique", "bilirubine", "drain", "gastrique", "histologique",
    "intraveineuse", "lithiasique", "macroscopie", "miction", "molles", "pelvienne",
    "plaquette", "plaquettes", "rectale", "sanguine", "sciatique", "selle", "urinaire",
    # Pathologies / symptoms
    "algie", "angoisse", "antécédent", "douloureuse", "dyslipidémie",
    "hemodialyse", "hemorragique", "hyperthermie", "hématologue",
    # Medications / medical equipment
    "ampoule", "antalgique", "antiseptique", "compresse", "flacon",
    "oxygène", "pansement", "vitamine",
    # Biology / lab exams
    "biochimie", "biologie", "fer",
    # Clinical actions / states
    "ablation", "absence", "admission", "bloc", "changement", "cliniquement",
    "cognitif", "couchage", "dispositif", "dynamique", "entretien", "histoire",
    "intervention", "position", "rappel", "relation", "retour", "réalisation",
    "résistant", "réévaluation", "situation", "temporaire", "urgence", "urgences",
    "urgent", "validation",
    # Common / contextual words
    "angle", "bille", "boisson", "bureau", "cases", "circuit",
    "concubin", "confortable", "demain", "densité", "dernière",
    "distant", "domaine", "elle", "fils", "frère", "grand", "horizon",
    "hui", "identifiant", "minuit", "murent", "neuf", "original", "pages",
    "personne", "premier", "quartier", "retraite", "route", "rés",
    "trouve", "verrouillé", "villa", "étage",
    # Common medical terms falsely detected as NOM (Phase 2 audit, March 2026)
    "ains", "ponction", "hanche", "burkitt", "orl", "gds", "oap", "tvp", "epp",
    "bronchite", "accueil", "cadre", "transfert", "relecture", "examens",
    "traitements", "traitement", "infectiologie", "cancérologie", "cancerologie",
    "maternité", "orale", "sachet", "absence",
    # FP audit of 30 files, Phase 2 (March 2026)
    "bouffee", "bouffée", "discontinue", "respimat", "lyoc",
    "probnp", "pro-bnp", "nt-probnp",
    "bpco", "colle", "gsc", "masse",
    "selle", "selles",
    # Short medical acronyms (3 letters), frequent NOM FPs
    "epo", "irc", "sib", "inr", "iec", "ira", "ait", "avc",
    "imc", "ipp", "ivo", "amp", "ivg", "img", "had", "ssr",
    "hta", "ecg", "irm", "tep", "crp", "nfs", "bhc", "vgm",
    "vni", "aeg", "bas", "snv", "hba", "ide", "dci",
    # Pharmaceutical terms FP as NOM (30-file audit, March 2026)
    "buvable", "buvables", "nominal", "nominaux",
    "acide", "principale", "principal", "principaux",
    "hyaluronique", "valproique", "valproïque", "tranexamique", "tranéxamique",
    "clavulanique", "nalidixique",
    "grancher",  # rehabilitation centre (homonymous establishment name)
    "experf",  # HAD provider (homonymous commercial name)
    # Hospital department names (FPs as [NOM])
    "ortho", "mobile", "polyvalente", "polyvalent",
    "geriatrie", "gériatrie", "ambulatoire", "provisoire",
    "intraveineuse", "intraveineux", "sous-cutanee", "sous-cutané",
    # Hospital department names (also INSEE surnames → recurring FPs)
    "viscerale", "viscérale", "vasculaire", "vasculaires",
    "conventionnelle", "conventionnel",
    "polyvalente", "polyvalent",
    "infectieuse", "infectieuses",
    # Nursing care / activities-of-daily-living terms (trackare FP, doc 216)
    "aide", "partielle", "partiel", "complete", "complète", "complet",
    "contention", "lavabo", "blader", "scan", "post", "lunettes",
    "deshabillage", "déshabillage", "habillage",
    "surveillance", "surv", "refection", "réfection",
    "miction", "toilette", "douche", "changes",
    "installation", "transfert", "mobilisation",
    "alimentation", "hydratation", "collation",
    "stimulation", "prevention", "prévention",
    # Pharmaceutical / medical-equipment FP terms (reviewer feedback 2026-03-16)
    "chlorure",
    # Medical devices (FP "OXYGENE LUNETTES" → [NOM])
    "canule", "canules", "masque", "sonde", "sondes",
    # Surgical terms FP as [NOM] (reviewer feedback 2026-03-17)
    "totale", "total", "partielle", "partiel",
    "prothese", "prothèse", "protheses", "prothèses", "unicompartimentale",
    # Antiseptics / care products (trackare prescription FPs)
    "betascrub", "hibiscrub", "betadine", "biseptine", "chlorhexidine",
    # Enteral nutrition / supplements
    "fresubin", "nutrison", "sondalis", "isosource", "novasource",
    # Medical FP terms in bacteriology / free text
    "nombreuses", "nombreux", "plusieurs", "quelques",
    "internationale", "international",
    "resorbable", "résorbable", "resorbables", "résorbables",
    "alfa", "capsule", "capsules",
}
# Automatic enrichment with edsnlp's ~4000 medication names
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())

# Enrichment from an external file (editable without touching the code)
_stopwords_file = Path(__file__).parent / "data" / "stopwords_manuels.txt"
if _stopwords_file.exists():
    try:
        _sw_count = 0
        for _line in _stopwords_file.read_text(encoding="utf-8").splitlines():
            _w = _line.strip()
            if _w and not _w.startswith("#"):
                _MEDICAL_STOP_WORDS_SET.add(_w)
                _sw_count += 1
        log.info("Stop-words manuels chargés : %d mots depuis %s", _sw_count, _stopwords_file.name)
    except Exception as _exc:
        log.error("Stop-words manuels : erreur de lecture %s — %s", _stopwords_file, _exc)
else:
    log.warning("Stop-words manuels : fichier introuvable %s — qualité dégradée", _stopwords_file)

# BDPM enrichment: ~7300 commercial names + INNs/active substances
_bdpm_path = Path(__file__).parent / "data" / "bdpm" / "medicaments_stopwords.txt"
if _bdpm_path.exists():
    try:
        _bdpm_count = 0
        for _line in _bdpm_path.read_text(encoding="utf-8").splitlines():
            _w = _line.strip()
            if _w and not _w.startswith("#"):
                _MEDICAL_STOP_WORDS_SET.add(_w)
                _bdpm_count += 1
        log.info("BDPM stop-words chargés : %d mots", _bdpm_count)
    except Exception as _exc:
        log.error("BDPM stop-words : erreur de lecture %s — %s", _bdpm_path, _exc)
else:
    log.warning("BDPM stop-words : fichier introuvable %s — qualité dégradée", _bdpm_path)

_MEDICAL_STOP_WORDS = (
    r"(?:" + "|".join(re.escape(w) for w in _MEDICAL_STOP_WORDS_SET) + r")"
)
# A name token: starts with an uppercase letter, then letters/hyphens/apostrophes (NO spaces or periods)
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
RE_PERSON_CONTEXT = re.compile(
    r"(?:(?:\bDr\.?|\bDR\.?|\bDocteur|\bPr\.?|\bProfesseur|\bMme|\bMME|\bMadame|\bM\.|\bMr\.?|\bMonsieur"
    r"|\bNom[ \t]*:[ \t]*"
    r"|\bRédigé[ \t]+par|\bValidé[ \t]+par|\bSigné[ \t]+par|\bSaisi[ \t]+par|\bRéalisé[ \t]+par"
    r")[ \t]+)"
    rf"({_PERSON_TOKEN}(?:[ \t]+{_PERSON_TOKEN}){{0,2}})"  # Max 3 words, no newline
)
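# Illustrative matches (assumed behavior, hypothetical names):
#   "Dr Jean DUPONT"         -> group(1) == "Jean DUPONT"
#   "Rédigé par Anne MARTIN" -> group(1) == "Anne MARTIN"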

# UPPERCASE names in comma-separated lists (e.g. "le Dr X, Y, LAZARO")
RE_DR_COMMA_LIST = re.compile(
    r"(?:Dr\.?|DR\.?|Docteur)\s+"
    r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+"
    r"(?:\s*,\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+)+",
    re.IGNORECASE,
)
# Name token: a word starting with an uppercase letter, at least 3 letters long
_NAME_TOKEN_RE = re.compile(r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']{2,}")
SPLITTER = re.compile(r"\s*[:|;\t]\s*")

# --- Global name extraction from structured fields ---
_UC_NAME_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
RE_EXTRACT_PATIENT = re.compile(
    r"Patient\(?e?\)?\s*:\s*"
    rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)"
    r"(?=\s+Né|\s+né|\s+N°|\s*$)",
    re.MULTILINE,
)
# Structured identity fields (trackare / DPI documents)
RE_EXTRACT_NOM_NAISSANCE = re.compile(
    r"Nom\s+de\s+naissance\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s+IPP|\s*$)",
    re.MULTILINE,
)
RE_EXTRACT_NOM_PRENOM = re.compile(
    r"Nom\s+et\s+Pr[ée]nom\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s+Date|\s+Né|\s*$)",
    re.MULTILINE,
)
RE_EXTRACT_NOM_UTILISE = re.compile(
    r"Nom\s+utilis[ée]\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)",
    re.MULTILINE,
)
RE_EXTRACT_PRENOM = re.compile(
    r"Pr[ée]nom\s+(?:de\s+naissance|utilis[ée])\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)",
    re.MULTILINE,
)
RE_EXTRACT_LIEU_NAISSANCE = re.compile(
    r"Lieu\s+de\s+naissance\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)",
    re.MULTILINE,
)
RE_EXTRACT_VILLE_RESIDENCE = re.compile(
    r"Ville\s+de\s+r[ée]sidence\s*:\s*"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)",
    re.MULTILINE,
)
# Structured contacts: Conjoint/Concubin/Epoux/Epouse/Parent + SURNAME FIRSTNAME
RE_EXTRACT_CONTACT = re.compile(
    r"(?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur)\s+"
    r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+)"
    r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+))?",
)
RE_EXTRACT_REDIGE = re.compile(
    r"(?:Rédigé|Validé|Signé|Saisi)[ \t]+par[ \t]+"
    rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# Compound name token: JEAN-PIERRE, CAZELLES-BOUDIER, etc.
_UC_COMPOUND = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*"
RE_EXTRACT_MME_MR = re.compile(
    r"(?:MMES|MME|Mmes|Mme|Madame|Mesdames|Monsieur|Messieurs|Mrs|Mr\.?)\s+"
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?"
    rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,4}})",
)
# Comma-separated lists after a civility: "Mmes Anorga, Goyenaga, Martinez et Murcy"
_CNAME = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']"
RE_CIVILITE_COMMA_LIST = re.compile(
    r"(?:Mmes|Mme|Mesdames|Mrs|Mr|Messieurs|Monsieur|Madame|Dr\.?|Docteur)\s+"
    + _CNAME + r"+"
    + r"(?:\s*,\s*" + _CNAME + r"+)*"
    + r"(?:\s*,?\s*\bet\s+" + _CNAME + r"+)?",
    re.IGNORECASE,
)
_INITIAL_OPT = r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?"
RE_EXTRACT_DR_DEST = re.compile(
    r"\b(?:DR\.?|Dr\.?|Docteur)[ \t]+"
    + _INITIAL_OPT +
    rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# Medical staff names after a role: "Aide : Marie-Paule BORDABERRY"
RE_EXTRACT_STAFF_ROLE = re.compile(
    r"\b(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH|Cadre[ \t]+Infirmier"
    r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)\b[ \t]*:?[ \t]*"
    r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:[ \t]*-[ \t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?[ \t]+)?"
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[ \t\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})",
)
# "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL"
RE_EXTRACT_PR = re.compile(
    r"(?:Pr\.?|Professeur)[ \t]+"
    + _INITIAL_OPT +
    rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
)
# "Opérateur : Docteur X. NOM", "Anesthésiste(s) Docteur J. NOM",
# "Opérateur : Dr J.-M. NOM", "Anesthésiste : NOM"
RE_EXTRACT_OPERATEUR = re.compile(
    r"(?:Op[ée]rateur|Anesth[ée]siste\(?s?\)?|Chirurgien)[ \t]*:?[ \t]*"
    r"(?:(?:Docteur|Dr\.?|Pr\.?)[ \t]+)?"
    + _INITIAL_OPT +
    rf"((?:{_UC_COMPOUND})(?:[ \t]+(?:{_UC_COMPOUND})){{0,2}})",
)
# "Courrier Epi - SURNAME, FIRSTNAME" header (discharge letters)
RE_EXTRACT_COURRIER = re.compile(
    r"Courrier\s+(?:Epi|Ep[ée]ph[ée]m[eé]ride|Hospit)\s*[\-–]\s*"
    rf"((?:{_UC_NAME_TOKEN})(?:\s*,\s*(?:{_UC_NAME_TOKEN}))*)",
)
# "CABINET ETXEBARNONDOA", "Cabinet Médical DUPONT", "CABINET MEDICAL DU DR MACHIN"
RE_EXTRACT_CABINET = re.compile(
    r"\bCABINET\s+(?:M[ÉEe]DICAL\s+)?(?:DU\s+)?(?:DR\.?\s+)?"
    rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
    re.IGNORECASE,
)
# Phone number with a slash extension: 05.59.44.38.32/34
RE_TEL_SLASH = re.compile(
    r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?:/\d{1,4})(?!\d)"
)

CID_PATTERN = re.compile(r"\(cid:\d+\)")

# --- Mr/Mme + isolated initial (e.g. "Mme Z", "Mr R") ---
RE_CIVILITE_INITIALE = re.compile(
    r"\b((?:Mme|MME|Madame|Monsieur|Mr\.?|M\.)\s+)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])(?=[\s,.\-\)\]:;!?]|$)"
)
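# Illustrative match (assumed behavior): "Mme Z," -> group(1) == "Mme ",
# group(2) == "Z". A full name like "Mme Zabala" is NOT matched here, because
# the initial must be a single letter followed by a separator or end of string.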

# --- Exam no. / patient no. in imaging (radiology) ---
RE_NUM_EXAMEN_PATIENT = re.compile(
    r"N[°o]?\s*(?:d['''\u2019]\s*)?(?:examen|patient|accession|passage)\s*[:\-]?\s*([A-Za-z]{0,4}\d{5,12})",
    re.IGNORECASE,
)

# --- Locality / Basque house / housing-estate addresses ---
RE_ADRESSE_LIEU_DIT = re.compile(
    r"\b(?:MAISON|LOT|LOTISSEMENT|RESIDENCE|RÉSIDENCE|MAS|LIEU[\s\-]DIT|DOMAINE|HAMEAU|QUARTIER)\s+"
    r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']{2,}"
    r"(?:\s+\d{1,4})?",
    re.IGNORECASE,
)
# Common French locality names (alone on a line = an address)
RE_LIEU_DIT_SEUL = re.compile(
    r"^[ \t]*(Le\s+Bourg|Le\s+Village|Le\s+Hameau|Le\s+Château|Le\s+Moulin|La\s+Place|Le\s+Clos)[ \t]*$",
    re.IGNORECASE | re.MULTILINE,
)

# --- New regexes: dates, addresses, ages, file numbers ---
_MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)"
RE_DATE_NAISSANCE = re.compile(
    r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
    r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})",
    re.IGNORECASE,
)
RE_DATE = re.compile(
    r"\b(\d{1,2})\s*[/.\-]\s*(\d{1,2})\s*[/.\-]\s*(\d{4})\b"
    r"|"
    r"\b(\d{1,2})\s+" + _MOIS_FR + r"\s+(\d{4})\b",
    re.IGNORECASE,
)
RE_ADRESSE = re.compile(
    r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*"
    r"(?:rue|avenue|av\.?|boulevard|bd\.?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence"
    r"|lotissement|lot\.?|cit[ée]|hameau|quartier|voie|parvis|esplanade|promenade|côte)"
    r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}",
    re.IGNORECASE,
)
RE_CODE_POSTAL = re.compile(
    r"(?:(?:[Cc]ode\s*[Pp]ostal|CP)\s*[:\-]?\s*(\d{5}))"
    r"|"
    # 5 digits + city name (Title Case or UPPERCASE), not preceded by a digit (avoids RPPS)
    # Exclude medical units (UI, mg, ml, etc.) via a negative lookahead
    r"(?:(?<!\d)(\d{5})[ \t]+(?!UI\b|mg\b|ml\b|µg\b)[A-ZÉÈÀÙ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû]+"
    r"(?:[\s\-][A-ZÉÈÀÙ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû]+)*"
    r"(?:\s+CEDEX)?)",
)
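# Illustrative matches (assumed behavior):
#   "CP : 64100"     -> group(1) == "64100"  (labelled form)
#   "64100 Bayonne"  -> group(2) == "64100"  (postal code + city form)
#   "10000 UI"       -> no match (unit excluded by the negative lookahead)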
RE_BP = re.compile(
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû\.\-]+\s+)?BP\s+\d+",
    re.IGNORECASE,
)
RE_AGE = re.compile(
    r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+(?:de\s+)?|(?:,\s*|\(\s*)"
    r")(\d{1,3})\s*(?:ans|A)\b",
    re.IGNORECASE,
)
# Healthcare establishments: long acronyms may stand alone; short ones (CH/CHS) require a name
_ETAB_NAME = (r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
              r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)")
RE_ETABLISSEMENT = re.compile(
    r"\b("
    # Long acronyms: accepted alone or with a name
    r"(?:EHPAD|SSR/USLD|SSR|USLD|HAD|CSAPA|CMPP|CMP|UGA|CHRU|CHU|HIA|CLCC|GHT|GCS)"
    + _ETAB_NAME + r"*"
    r"|"
    # Short acronyms (CH, CHS): require a following name, to avoid false positives
    r"(?:CHS|CH)" + _ETAB_NAME + r"+"
    r")",
)
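# Illustrative matches (assumed behavior):
#   "CHU de Bordeaux"  -> match (long acronym, name optional)
#   "EHPAD"            -> match (long acronym accepted alone)
#   "CH de Bayonne"    -> match (short acronym + required name)
#   "CH"               -> no match (short acronym alone is rejected)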
RE_HOPITAL_VILLE = re.compile(
    r"(?<![Ee]xamen )"
    # Establishment type: case-insensitive via a scoped (?i:...) group, to capture
    # both "Centre Hospitalier" and "CENTRE HOSPITALIER" (ALL-CAPS documents).
    r"\b((?i:hôpital|clinique|polyclinique|centre\s+hospitalier"
    r"|centre\s+médical|centre\s+de\s+soins|maison\s+de\s+santé"
    r"|maison\s+de\s+retraite|résidence|foyer|pharmacie)"
    # Determiners: case-insensitive too (de, DE, De… du, DU…).
    r"\s+(?i:de\s+|d['']\s*|du\s+|des\s+)?(?i:la\s+|le\s+|l['']\s*|les\s+)?"
    # Proper noun: must still start with an uppercase letter; the tail accepts mixed case.
    r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
    r"(?:\s+(?i:de\s+|d['']\s*|du\s+|des\s+)?(?i:la\s+|le\s+|l['']\s*|les\s+)?"
    r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
)
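# Validated cases for the scoped-(?i:...) fix (from the commit message):
#   "Centre Hospitalier de Bayonne"   -> match (unchanged)
#   "CENTRE HOSPITALIER COTE BASQUE"  -> match (new)
#   "POLYCLINIQUE CÔTE BASQUE SUD"    -> match (new)
#   "CLINIQUE SAINT-JEAN"             -> match (new)
#   "examen hôpital de Bordeaux"      -> no match (lookbehind exclusion preserved)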
|
||
RE_SERVICE = re.compile(
|
||
r"\b((?:[Ss]ervice|[Uu]nité|[Pp]ôle|[Dd]épartement)\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
|
||
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
|
||
r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
|
||
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
|
||
)
|
||
RE_NUMERO_DOSSIER = re.compile(
|
||
r"(?:\bdossier|\bn°\s*dossier|\bNDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})"
|
||
r"|"
|
||
r"(?:\bréférence|\bréf\.)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})",
|
||
re.IGNORECASE,
|
||
)
|
||
RE_EPISODE = re.compile(
|
||
r"N°\s*[ÉéEe]pisode\s*[:\-]?\s*([A-Za-z0-9\-]{4,})"
|
||
r"|"
|
||
r"[ÉéEe]pisode\s*N[o°.]?\s*\.?\s*:?\s*(\d{5,})",
|
||
re.IGNORECASE,
|
||
)
|
||
# N° venue (BACTERIO), N° séjour — identifiants de séjour hospitalier
|
||
RE_VENUE_SEJOUR = re.compile(
|
||
r"(?:N[°o]?\s*venue|N[°o]?\s*séjour|N[°o]?\s*de\s+séjour"
|
||
r"|num[ée]ro\s+de\s+(?:venue|séjour))\s*[:\-]?\s*(\d{4,})",
|
||
re.IGNORECASE,
|
||
)
|
||
|
||
@dataclass
|
||
class PiiHit:
|
||
page: int
|
||
kind: str
|
||
original: str
|
||
placeholder: str
|
||
bbox_hint: Optional[Tuple[float, float, float, float]] = None
|
||
|
||
@dataclass
|
||
class AnonResult:
|
||
text_out: str
|
||
tables_block: str
|
||
audit: List[PiiHit] = field(default_factory=list)
|
||
is_trackare: bool = False
|
||
|
||
@dataclass
|
||
class NerDetection:
|
||
"""Détection NER sur le texte original (non masqué).
|
||
Utilisé par l'architecture NER-first pour la validation croisée des noms."""
|
||
token: str
|
||
label: str # NOM, PRENOM, HOPITAL, VILLE, LOC, ORG
|
||
score: float
|
||
page_idx: int
|
||
source: str # "eds_pseudo", "gliner", "camembert_bio"
|
||
|
||
@dataclass
|
||
class NameCandidate:
|
||
"""Candidat de nom extrait par regex avec métadonnées de confiance.
|
||
Utilisé pour la validation croisée NER-first."""
|
||
token: str
|
||
source: str # regex ou champ qui a capturé ce nom
|
||
context_strength: str # "high", "medium", "low"
|
||
bypass_stopwords: bool # ce que force_names signifie actuellement
|
||
|
||
# ----------------- Config loader -----------------
|
||
|
||
# Whitelist absolue : tokens que l'utilisateur a déclarés "à ne JAMAIS masquer"
|
||
# Alimenté par cfg["whitelist_phrases"] dans load_dictionaries().
|
||
# Filtré à 2 niveaux : pré-masquage (_apply_extracted_names) et filtrage final
|
||
# (avant redact_pdf_vector) pour neutraliser tout NOM/PER/ORG qui matcherait.
|
||
_WHITELIST_NEVER_MASK_TOKENS: set = set()
|
||
_WHITELIST_NEVER_MASK_PHRASES: set = set()
|
||
|
||
# Safe-guards pour les défauts intégrés quand les fichiers data/*.txt sont absents
|
||
# (mode frozen où le bundle aurait omis de les inclure). Contenu minimal pour
|
||
# garantir un comportement de masquage correct même en mode dégradé.
|
||
_DPI_LABELS_FALLBACK = {
|
||
"date", "note", "heure", "type", "soin", "soins", "surv",
|
||
"page", "presc", "saint", "sainte",
|
||
}
|
||
_COMPANION_BLACKLIST_FALLBACK = {
|
||
"CANCEROLOGIE", "ONCOLOGIE", "REANIMATION", "RADIOLOGIE",
|
||
"CARDIOLOGIE", "NEUROLOGIE", "PNEUMOLOGIE", "UROLOGIE",
|
||
"MEDECINE", "DOSSIER", "CONTENTION", "ISOLEMENT", "ELIMINATION",
|
||
"ZONE", "PARTI", "PLAN", "MAIN", "FORT", "FORTE",
|
||
}


def _load_txt_set(path: Path, transform=str.lower, label: str = "file") -> set:
    """Loads a .txt file line by line. Robust to errors (frozen exe)."""
    result: set = set()
    if not path.exists():
        log.warning("%s introuvable : %s — utilisation des défauts intégrés", label, path)
        return result
    try:
        for _line in path.read_text(encoding="utf-8").splitlines():
            _w = _line.strip()
            if _w and not _w.startswith("#"):
                result.add(transform(_w))
        log.info("%s chargé : %d entrées depuis %s", label, len(result), path.name)
    except Exception as exc:
        log.error("%s : erreur de lecture %s — %s", label, path, exc)
    return result
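
# Illustrative sketch (hypothetical file content, not shipped here): a dictionary
# file as parsed by _load_txt_set(). Blank lines are dropped, "#" lines skipped,
# and `transform` is applied to every entry:
#
#     # data/companion_blacklist.txt
#     CANCEROLOGIE
#     oncologie
#
# With transform=str.upper this loads as {"CANCEROLOGIE", "ONCOLOGIE"}.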


# Structural EHR labels that must NEVER be masked as names (Date, Note, Heure...)
# Stored in LOWERCASE; the comparison is case-insensitive.
# Loaded from data/dpi_labels_blacklist.txt + cfg["additional_dpi_labels"].
_DPI_LABELS_SET: set = _load_txt_set(
    Path(__file__).parent / "data" / "dpi_labels_blacklist.txt",
    transform=str.lower,
    label="DPI labels blacklist",
)
if not _DPI_LABELS_SET:
    _DPI_LABELS_SET = set(_DPI_LABELS_FALLBACK)

# Companion blacklist: UPPERCASE terms that are NEVER names
# (specialities, pharma labs, ambiguous common words).
# Stored in UPPERCASE; the comparison is made against already-uppercase candidates.
# Loaded from data/companion_blacklist.txt + cfg["additional_companion_blacklist"].
_COMPANION_BLACKLIST_SET: set = _load_txt_set(
    Path(__file__).parent / "data" / "companion_blacklist.txt",
    transform=str.upper,
    label="Companion blacklist",
)
if not _COMPANION_BLACKLIST_SET:
    _COMPANION_BLACKLIST_SET = set(_COMPANION_BLACKLIST_FALLBACK)


_WHITELIST_FUNCTION_WORDS = {
    "de", "du", "des", "le", "la", "les", "et", "ou", "à", "a",
    "en", "un", "une", "au", "aux", "of", "the", "and",
}


def _load_whitelist_phrases(phrases) -> int:
    """Tokenises the whitelist phrases and feeds the global sets.

    Returns the number of tokens actually added.
    Function words (de, du, le...) are not indexed individually, to avoid
    false blocks, but the full phrase is kept."""
    if not phrases:
        return 0
    added = 0
    for phrase in phrases:
        if not phrase or not str(phrase).strip():
            continue
        p = str(phrase).strip()
        _WHITELIST_NEVER_MASK_PHRASES.add(p.lower())
        for tok in re.split(r"[\s\-']+", p):
            tok = tok.strip(" .,;:!?()[]{}\"'«»")
            tok_lower = tok.lower()
            if len(tok) < 3:
                continue
            if tok_lower in _WHITELIST_FUNCTION_WORDS:
                continue
            _WHITELIST_NEVER_MASK_TOKENS.add(tok_lower)
            added += 1
    return added
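
# Worked example (hypothetical phrase): _load_whitelist_phrases(["Clinique de la Côte Basque"])
# keeps the whole phrase in _WHITELIST_NEVER_MASK_PHRASES and indexes only
# {"clinique", "côte", "basque"} as tokens ("de" and "la" are function words
# and are skipped), so the call returns 3.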


def load_dictionaries(config_path: Optional[Path]) -> Dict[str, Any]:
    cfg = DEFAULTS_CFG.copy()
    if config_path and config_path.exists() and yaml is not None:
        try:
            user = yaml.safe_load(config_path.read_text(encoding="utf-8")) or {}
            for k, v in user.items():
                cfg[k] = v
        except Exception:
            pass

    # Load the extra stop words and cities from the YAML
    extra_sw = cfg.get("additional_stopwords", [])
    if extra_sw:
        for w in extra_sw:
            if w and str(w).strip():
                _MEDICAL_STOP_WORDS_SET.add(str(w).strip().lower())
        log.info("Stop-words YAML supplémentaires : %d", len(extra_sw))

    extra_villes = cfg.get("additional_villes_blacklist", [])
    if extra_villes:
        for v in extra_villes:
            if v and str(v).strip():
                _VILLE_BLACKLIST.add(str(v).strip().upper())
        log.info("Villes blacklist YAML supplémentaires : %d", len(extra_villes))

    # Absolute whitelist: terms/phrases the user declared "NEVER to be masked".
    # Fed by the v5.4 GUI (root key whitelist_phrases in the YAML).
    wl_phrases = cfg.get("whitelist_phrases", []) or []
    if wl_phrases:
        n_added = _load_whitelist_phrases(wl_phrases)
        log.info("Whitelist phrases chargées : %d phrases (%d tokens)",
                 len(wl_phrases), n_added)

    # Extra EHR labels (YAML key additional_dpi_labels)
    extra_dpi = cfg.get("additional_dpi_labels", []) or []
    if extra_dpi:
        for w in extra_dpi:
            if w and str(w).strip():
                _DPI_LABELS_SET.add(str(w).strip().lower())
        log.info("DPI labels YAML supplémentaires : %d", len(extra_dpi))

    # Extra companion blacklist (YAML key additional_companion_blacklist)
    extra_comp = cfg.get("additional_companion_blacklist", []) or []
    if extra_comp:
        for w in extra_comp:
            if w and str(w).strip():
                _COMPANION_BLACKLIST_SET.add(str(w).strip().upper())
        log.info("Companion blacklist YAML supplémentaire : %d", len(extra_comp))

    return cfg
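
# Illustrative YAML fragment (hypothetical values) for the keys consumed above:
#
#     additional_stopwords: ["perfusion"]
#     additional_villes_blacklist: ["ANGLET"]
#     whitelist_phrases: ["Clinique de la Côte Basque"]
#     additional_dpi_labels: ["transmission"]
#     additional_companion_blacklist: ["GERIATRIE"]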


# ----------------- Extraction -----------------

_doctr_model_cache = None


def _get_doctr_model():
    global _doctr_model_cache
    if _doctr_model_cache is None:
        _doctr_model_cache = _doctr_ocr_predictor(
            det_arch="db_resnet50", reco_arch="crnn_vgg16_bn", pretrained=True
        )
    return _doctr_model_cache


def _extract_page_layout_aware(page) -> str:
    """Extracts the text of a PyMuPDF page while handling multi-column layouts.

    Detects whether the page has a sidebar/left column running parallel to a
    right-hand body (typical of hospital CRH/CRO reports). If so, reads each
    column separately to avoid interleaving the text.
    """
    blocks = page.get_text("blocks")
    text_blocks = []
    for b in blocks:
        x0, y0, x1, y1, text, _block_no, block_type = b
        if block_type == 0 and text.strip():
            text_blocks.append((x0, y0, x1, y1, text.strip()))
    if not text_blocks:
        return ""

    page_w = page.rect.width
    page_h = page.rect.height

    # --- Column detection ---
    # Looks for a vertical line split_x that separates the blocks into two
    # parallel groups (significant vertical overlap).
    best_split = None
    best_score = -1
    for split_x in range(int(page_w * 0.15), int(page_w * 0.45), 3):
        left = [b for b in text_blocks if b[2] <= split_x + 5]
        right = [b for b in text_blocks if b[0] >= split_x - 5]
        crossing = [b for b in text_blocks if b[0] < split_x - 5 and b[2] > split_x + 5]
        if len(left) < 3 or len(right) < 3:
            continue
        left_span = max(b[3] for b in left) - min(b[1] for b in left)
        right_span = max(b[3] for b in right) - min(b[1] for b in right)
        if left_span < page_h * 0.25 or right_span < page_h * 0.25:
            continue
        overlap_min = max(min(b[1] for b in left), min(b[1] for b in right))
        overlap_max = min(max(b[3] for b in left), max(b[3] for b in right))
        if overlap_max - overlap_min < page_h * 0.15:
            continue
        score = len(left) + len(right) - 5 * len(crossing)
        if score > best_score:
            best_score = score
            best_split = split_x

    if best_split is not None:
        left_blocks = sorted(
            [b for b in text_blocks if b[2] <= best_split + 5], key=lambda b: b[1]
        )
        right_blocks = sorted(
            [b for b in text_blocks if b[0] >= best_split - 5], key=lambda b: b[1]
        )
        full_width = sorted(
            [b for b in text_blocks if b[0] < best_split - 5 and b[2] > best_split + 5],
            key=lambda b: b[1],
        )
        col_start_y = min(
            min((b[1] for b in left_blocks), default=page_h),
            min((b[1] for b in right_blocks), default=page_h),
        )
        headers = [b for b in full_width if b[1] < col_start_y + 5]
        footers = [b for b in full_width if b[1] >= col_start_y + 5]
        parts = []
        for b in headers:
            parts.append(b[4])
        for b in left_blocks:
            parts.append(b[4])
        for b in right_blocks:
            parts.append(b[4])
        for b in footers:
            parts.append(b[4])
        return "\n".join(parts)
    else:
        sorted_blocks = sorted(text_blocks, key=lambda b: (b[1], b[0]))
        return "\n".join(b[4] for b in sorted_blocks)


def extract_text_with_fallback_ocr(pdf_path: Path) -> Tuple[List[str], List[List[str]], bool, OcrWordMap]:
    """Multi-pass text extraction with OCR fallback (docTR).
    Returns (pages_text, tables_lines, ocr_used, ocr_word_map).

    Pass 1 : PyMuPDF layout-aware (blocks with column detection)
    Pass 1b: pdfplumber if PyMuPDF fails or yields little text
    Pass 2 : pdfminer on CID artifacts or poor text
    Pass 3 : docTR OCR if the PDF is scanned (very little text)
    Tables : always extracted via pdfplumber (independently of the text).
    """
    pages_text: List[str] = []
    tables_lines: List[List[str]] = []
    ocr_used = False

    # --- Tables: always via pdfplumber ---
    with pdfplumber.open(pdf_path) as pdf:
        for p in pdf.pages:
            rows: List[str] = []
            try:
                tables = p.extract_tables()
                for tbl in tables or []:
                    for row in tbl:
                        clean = [c if c is not None else "" for c in row]
                        rows.append("\t".join(clean).strip())
            except Exception:
                pass
            tables_lines.append(rows)

    # --- Pass 1: PyMuPDF layout-aware (multi-column detection) ---
    if fitz is not None:
        try:
            doc = fitz.open(str(pdf_path))
            pages_text = [_extract_page_layout_aware(doc[i]) for i in range(len(doc))]
            doc.close()
        except Exception:
            pass

    # --- Pass 1b: pdfplumber if PyMuPDF produced nothing ---
    total_chars = sum(len(x or "") for x in pages_text)
    if total_chars < 500:
        try:
            with pdfplumber.open(pdf_path) as pdf:
                pp_pages = [p.extract_text(x_tolerance=2.5, y_tolerance=4.0) or "" for p in pdf.pages]
                if sum(len(x) for x in pp_pages) > total_chars:
                    pages_text = pp_pages
        except Exception:
            pass

    # --- Pass 2: pdfminer on CID artifacts or poor text ---
    total_chars = sum(len(x or "") for x in pages_text)
    need_fallback = total_chars < 500
    if not need_fallback:
        need_fallback = any(CID_PATTERN.search(x or "") for x in pages_text)
    if need_fallback:
        try:
            text_all = pdfminer_extract_text(
                str(pdf_path),
                laparams=LAParams(char_margin=2.0, word_margin=0.1, line_margin=0.8, boxes_flow=0.5),
            )
            split = [x for x in text_all.split("\f") if x]
            if split and sum(len(x) for x in split) > total_chars:
                pages_text = split
        except Exception:
            pass

    # --- Pass 3: docTR OCR on text-poor pages ---
    # No global threshold: only individual pages with little text (< 150 chars)
    # are OCRed, then the best result per page is kept. Pages already rich in
    # text are left untouched.
    _OCR_PAGE_THRESHOLD = 150  # minimum chars for a page to count as "text OK"
    total_chars = sum(len(x or "") for x in pages_text)
    ocr_word_map: OcrWordMap = {}
    sparse_pages = [i for i, p in enumerate(pages_text) if len(p or "") < _OCR_PAGE_THRESHOLD]
    if sparse_pages and _DOCTR_AVAILABLE and fitz is not None:
        try:
            model = _get_doctr_model()
            doc = fitz.open(str(pdf_path))
            import numpy as np
            ocr_replaced = 0
            for i in sparse_pages:
                if i >= len(doc):
                    continue
                pix = doc[i].get_pixmap(dpi=300)
                img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
                result = model([np.array(img)])
                page_text = ""
                page_words: List[Tuple[str, float, float, float, float]] = []
                for block in result.pages[0].blocks:
                    for line in block.lines:
                        for w in line.words:
                            (x0, y0), (x1, y1) = w.geometry
                            page_words.append((w.value, x0, y0, x1, y1))
                        page_text += " ".join(w.value for w in line.words) + "\n"
                # Replace only if the OCR yields more text
                if len(page_text) > len(pages_text[i] or ""):
                    pages_text[i] = page_text
                    ocr_word_map[i] = page_words
                    ocr_replaced += 1
            doc.close()
            if ocr_replaced > 0:
                ocr_used = True
                log.info("OCR docTR : %d/%d pages remplacées", ocr_replaced, len(sparse_pages))
        except Exception as e:
            log.warning("OCR docTR échoué : %s", e)
            ocr_word_map = {}
    return pages_text, tables_lines, ocr_used, ocr_word_map


# Backward-compatibility alias
def extract_text_three_passes(pdf_path: Path):
    pages_text, tables_lines, _, _ = extract_text_with_fallback_ocr(pdf_path)
    return pages_text, tables_lines
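
# Minimal usage sketch (hypothetical path):
#
#     pages, tables, used_ocr, word_map = extract_text_with_fallback_ocr(Path("crh.pdf"))
#     if used_ocr:
#         # word_map[page_idx] holds (word, x0, y0, x1, y1) in normalised 0-1 coords
#         first_page_words = word_map.get(0, [])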


# ----------------- Helpers -----------------

def _compile_user_regex(pattern: str, flags_list: List[str]):
    flags = 0
    for f in flags_list or []:
        u = f.upper()
        if u == "IGNORECASE": flags |= re.IGNORECASE
        if u == "MULTILINE": flags |= re.MULTILINE
        if u == "DOTALL": flags |= re.DOTALL
    return re.compile(pattern, flags)


def _apply_overrides(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    for ov in cfg.get("regex_overrides", []) or []:
        pattern = ov.get("pattern"); placeholder = ov.get("placeholder", PLACEHOLDERS["MASK"]); name = ov.get("name", "override")
        flags_list = ov.get("flags", [])
        try:
            rx = _compile_user_regex(pattern, flags_list)
        except Exception:
            continue

        def _rep(m: re.Match):
            audit.append(PiiHit(page_idx, name, m.group(0), placeholder))
            return placeholder
        line = rx.sub(_rep, line)
    # force-mask literals
    for term in (cfg.get("blacklist", {}).get("force_mask_terms", []) or []):
        if not term: continue
        word_rx = re.compile(rf"\b{re.escape(term)}\b", re.IGNORECASE)
        if word_rx.search(line):
            audit.append(PiiHit(page_idx, "force_term", term, PLACEHOLDERS["MASK"]))
            line = word_rx.sub(PLACEHOLDERS["MASK"], line)
    # force-mask regex
    for pat in (cfg.get("blacklist", {}).get("force_mask_regex", []) or []):
        try:
            rx = re.compile(pat, re.IGNORECASE)
        except Exception:
            continue

        def _repl_force_regex(m: re.Match, _pat=pat):
            audit.append(PiiHit(page_idx, "force_regex", m.group(0), PLACEHOLDERS["MASK"]))
            return PLACEHOLDERS["MASK"]
        line = rx.sub(_repl_force_regex, line)
    return line
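
# Illustrative YAML fragment (hypothetical values) for the override keys read by
# _apply_overrides():
#
#     regex_overrides:
#       - name: "badge_interne"
#         pattern: "BADGE-\\d{6}"
#         placeholder: "[BADGE]"
#         flags: ["IGNORECASE"]
#     blacklist:
#       force_mask_terms: ["Centre Hospitalier Exemple"]
#       force_mask_regex: ["Polyclinique\\s+\\S+"]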


RE_BARE_9DIGITS = re.compile(r"\b(\d{9})\b")


def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
    m = RE_FINESS.search(line)
    if m:
        val = m.group(1); audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"]))
        return RE_FINESS.sub(lambda _: f"FINESS : {PLACEHOLDERS['FINESS']}", line)

    # FINESS detection via gazetteer: a bare 9-digit number that matches a real FINESS number
    if _FINESS_NUMBERS:
        for m9 in RE_BARE_9DIGITS.finditer(line):
            if m9.group(1) in _FINESS_NUMBERS:
                val = m9.group(1)
                audit.append(PiiHit(page_idx, "FINESS", val, PLACEHOLDERS["FINESS"]))
                line = line.replace(val, PLACEHOLDERS["FINESS"], 1)
                return line

    m = RE_OGC.search(line)
    if m:
        val = m.group(1); audit.append(PiiHit(page_idx, "OGC", val, PLACEHOLDERS["OGC"]))
        return RE_OGC.sub(lambda _: f"N° OGC : {PLACEHOLDERS['OGC']}", line)
    m = RE_IPP.search(line)
    if m:
        val = m.group(1); audit.append(PiiHit(page_idx, "IPP", val, PLACEHOLDERS["IPP"]))
        return RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line)
    m = RE_CSULT.search(line)
    if m:
        val = m.group(1); audit.append(PiiHit(page_idx, "DOSSIER", val, PLACEHOLDERS["DOSSIER"]))
        return RE_CSULT.sub(lambda _: f"N° : {PLACEHOLDERS['DOSSIER']}", line)
    m = RE_RPPS.search(line)
    if m:
        val = m.group(1); audit.append(PiiHit(page_idx, "RPPS", val, PLACEHOLDERS["RPPS"]))
        return RE_RPPS.sub(lambda _: f"RPPS : {PLACEHOLDERS['RPPS']}", line)
    return line


def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    # User overrides & force-masks first
    line = _apply_overrides(line, audit, page_idx, cfg)

    # EMAIL
    def _repl_email(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
        return PLACEHOLDERS["EMAIL"]
    line = RE_EMAIL.sub(_repl_email, line)

    # URLs (all of them: they can identify facilities, people, departments)
    def _repl_url(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "URL", m.group(0), PLACEHOLDERS["MASK"]))
        return PLACEHOLDERS["MASK"]
    line = RE_URL.sub(_repl_url, line)
    # URL without protocol (www.xxx.fr)
    _re_url_www = re.compile(r"(?<!\S)www\.[a-z0-9\-]+\.(?:fr|com|org|net|eu)(?:/[^\s]*)?", re.IGNORECASE)
    line = _re_url_www.sub(_repl_url, line)

    # TEL
    def _repl_tel(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
        return PLACEHOLDERS["TEL"]
    line = RE_TEL_SLASH.sub(_repl_tel, line)  # slash form first (more specific)
    line = RE_TEL.sub(_repl_tel, line)
    line = RE_TEL_COMPACT.sub(_repl_tel, line)

    # IBAN
    def _repl_iban(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "IBAN", m.group(0), PLACEHOLDERS["IBAN"]))
        return PLACEHOLDERS["IBAN"]
    line = RE_IBAN.sub(_repl_iban, line)

    # NIR (with modulo-97 key validation)
    def _repl_nir(m: re.Match) -> str:
        raw = m.group(0)
        if not validate_nir(raw):
            return raw  # false positive, leave unmasked
        audit.append(PiiHit(page_idx, "NIR", raw, PLACEHOLDERS["NIR"]))
        return PLACEHOLDERS["NIR"]
    line = RE_NIR.sub(_repl_nir, line)
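    # Reminder for maintainers (well-known INSEE rule, stated here for context):
    # the control key that validate_nir() is expected to enforce is
    # 97 - (first 13 digits % 97), with Corsican department codes substituted
    # (2A -> 19, 2B -> 18) before taking the modulo.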

    # DATE_NAISSANCE (more specific, before the generic DATE)
    def _repl_date_naissance(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "DATE_NAISSANCE", m.group(0), PLACEHOLDERS["DATE_NAISSANCE"]))
        return PLACEHOLDERS["DATE_NAISSANCE"]
    line = RE_DATE_NAISSANCE.sub(_repl_date_naissance, line)

    # Generic DATE: disabled, only birth dates are masked
    # def _repl_date(m: re.Match) -> str:
    #     audit.append(PiiHit(page_idx, "DATE", m.group(0), PLACEHOLDERS["DATE"]))
    #     return PLACEHOLDERS["DATE"]
    # line = RE_DATE.sub(_repl_date, line)

    # ADRESSE
    def _repl_adresse(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "ADRESSE", m.group(0), PLACEHOLDERS["ADRESSE"]))
        return PLACEHOLDERS["ADRESSE"]
    line = RE_ADRESSE.sub(_repl_adresse, line)

    # PO box (BP)
    def _repl_bp(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "ADRESSE", m.group(0), PLACEHOLDERS["ADRESSE"]))
        return PLACEHOLDERS["ADRESSE"]
    line = RE_BP.sub(_repl_bp, line)

    # CODE_POSTAL
    def _repl_code_postal(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "CODE_POSTAL", m.group(0), PLACEHOLDERS["CODE_POSTAL"]))
        return PLACEHOLDERS["CODE_POSTAL"]
    line = RE_CODE_POSTAL.sub(_repl_code_postal, line)

    # AGE
    def _repl_age(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "AGE", m.group(0), PLACEHOLDERS["AGE"]))
        return PLACEHOLDERS["AGE"]
    line = RE_AGE.sub(_repl_age, line)

    # Record number / NDA
    def _repl_dossier(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "DOSSIER", m.group(0), PLACEHOLDERS["DOSSIER"]))
        return PLACEHOLDERS["DOSSIER"]
    line = RE_NUMERO_DOSSIER.sub(_repl_dossier, line)

    # Exam no. / imaging patient no. (radiology)
    def _repl_num_examen(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
        return m.group(0).replace(m.group(1), PLACEHOLDERS["DOSSIER"])
    line = RE_NUM_EXAMEN_PATIENT.sub(_repl_num_examen, line)

    # Lieu-dit / house / housing-estate addresses (e.g. "MAISON ARGAINA 94")
    def _repl_lieu_dit(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "ADRESSE", m.group(0), PLACEHOLDERS["ADRESSE"]))
        return PLACEHOLDERS["ADRESSE"]
    line = RE_ADRESSE_LIEU_DIT.sub(_repl_lieu_dit, line)

    # Common lieux-dits alone on a line (e.g. "Le BOURG", "Le Village")
    line = RE_LIEU_DIT_SEUL.sub(
        lambda m: (audit.append(PiiHit(page_idx, "ADRESSE", m.group(1), PLACEHOLDERS["ADRESSE"])) or PLACEHOLDERS["ADRESSE"]),
        line,
    )

    # N° EPISODE / Episode N. (Trackare page footers)
    def _repl_episode(m: re.Match) -> str:
        val = m.group(1) or m.group(2) or m.group(0)
        audit.append(PiiHit(page_idx, "EPISODE", val, PLACEHOLDERS["EPISODE"]))
        # Rebuild the replacement, keeping the prefix and masking the value
        full = m.group(0)
        return full[:full.find(val)] + PLACEHOLDERS["EPISODE"]
    line = RE_EPISODE.sub(_repl_episode, line)

    # N° venue / N° séjour (BACTERIO, Trackare)
    def _repl_venue(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "NDA", m.group(0), PLACEHOLDERS["NDA"]))
        full = m.group(0)
        val = m.group(1)
        return full[:full.find(val)] + PLACEHOLDERS["NDA"]
    line = RE_VENUE_SEJOUR.sub(_repl_venue, line)

    # Healthcare facilities (EHPAD Bayonne, SSR La Concha, Hôpital de Bayonne, etc.)
    def _repl_etab(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "ETAB", m.group(0), PLACEHOLDERS["ETAB"]))
        return PLACEHOLDERS["ETAB"]
    line = RE_ETABLISSEMENT.sub(_repl_etab, line)
    line = RE_HOPITAL_VILLE.sub(_repl_etab, line)

    # Facilities via the FINESS Aho-Corasick gazetteer (116K distinctive names)
    # Note: _mask_finess_establishments() builds the automaton lazily on first call
    line, finess_matched = _mask_finess_establishments(line, return_matched_names=True)
    for matched_name in finess_matched:
        audit.append(PiiHit(page_idx, "ETAB_FINESS", matched_name, PLACEHOLDERS["ETAB"]))

    # Addresses via the FINESS Aho-Corasick gazetteer (28K street names)
    line, addr_matched = _mask_finess_addresses(line, return_matched_names=True)
    for matched_addr in addr_matched:
        audit.append(PiiHit(page_idx, "ADDR_FINESS", matched_addr, PLACEHOLDERS["ADRESSE"]))

    # Spaced-out header text: "C E N T R E H O S P I T A L I E R D E ..."
    # Uppercase letters separated by spaces escape all normal detection.
    # Strategy: if a segment contains a facility keyword, mask the WHOLE spaced
    # line (all contiguous segments) to avoid leaving "D E L A C ÔT E B A S Q U E"
    _RE_SPACED_TEXT = re.compile(
        r'(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\s){4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]'
    )
    # Broader pattern: the whole spaced run (letters separated by spaces + short words)
    _RE_SPACED_FULL_LINE = re.compile(
        r'(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ](?:\s|ÔT|ÉE)){3,}[\sA-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]*'
    )
    _SPACED_ETAB_KEYWORDS = {
        "HOSPITALIER", "HOSPITALIERE", "HOSPITALIERES", "HOSPITALIERS",
        "CLINIQUE", "HOPITAL", "HÔPITAL", "POLYCLINIQUE",
        "CENTRE", "ETABLISSEMENT", "MAISON", "RESIDENCE",
        "EHPAD", "SSR", "USLD", "CHU", "CHRU",
    }
    spaced_matches = list(_RE_SPACED_TEXT.finditer(line))
    if spaced_matches:
        # Check whether at least one segment contains a facility keyword
        has_etab_keyword = False
        for m_sp in spaced_matches:
            collapsed_upper = m_sp.group(0).replace(" ", "").upper()
            if any(kw in collapsed_upper for kw in _SPACED_ETAB_KEYWORDS):
                has_etab_keyword = True
                break
        if has_etab_keyword:
            # Mask the whole spaced line (from the start of the first match to the end of the last)
            full_start = spaced_matches[0].start()
            full_end = spaced_matches[-1].end()
            full_span = line[full_start:full_end]
            audit.append(PiiHit(page_idx, "ETAB_SPACED", full_span, PLACEHOLDERS["ETAB"]))
            line = line[:full_start] + PLACEHOLDERS["ETAB"] + line[full_end:]

    # Cities via the Aho-Corasick gazetteer (INSEE + FINESS)
    if _VILLE_AC is None:
        _build_ville_ac()
    if _VILLE_AC is not None:
        line, ville_originals = _mask_ville_gazetteers(line)
        for vo in ville_originals:
            audit.append(PiiHit(page_idx, "VILLE_GAZ", vo, PLACEHOLDERS["VILLE"]))

    # Hospital departments (service de Cardiologie, unité de soins palliatifs, etc.)
    def _repl_service(m: re.Match) -> str:
        full_match = m.group(0)
        # Check whether this is a structural term to preserve
        if full_match.lower() in _MEDICAL_STRUCTURAL_TERMS:
            return full_match
        # Check the preceding context (Chef de, Praticien, etc.)
        start_pos = m.start()
        context_before = line[max(0, start_pos-25):start_pos].lower()
        # Patterns to preserve
        preserve_patterns = ['chef de', 'praticien', 'ancien', 'assistant', 'médecin', 'interne']
        if any(pattern in context_before for pattern in preserve_patterns):
            return full_match
        audit.append(PiiHit(page_idx, "ETAB", full_match, PLACEHOLDERS["MASK"]))
        return PLACEHOLDERS["MASK"]
    line = RE_SERVICE.sub(_repl_service, line)

    # City in a letterhead: "Bayonne, le 12/03/2024" -> mask the city.
    # The "Word, le [date]" context is reliable (comma required).
    # Allows lowercase linking words (de, du, la, sur, en, lès)
    _re_ville_date = re.compile(
        r"^(\s*)"
        r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç\-]+"
        r"(?:\s+(?:de|du|la|sur|en|lès|les|l['']\s*)?"
        r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)*)"
        r"(\s*,\s+le\s+\d{1,2})",
        re.MULTILINE,
    )
    def _repl_ville_date(m: re.Match) -> str:
        ville = m.group(2).strip()
        audit.append(PiiHit(page_idx, "VILLE", ville, PLACEHOLDERS["VILLE"]))
        return m.group(1) + PLACEHOLDERS["VILLE"] + m.group(3)
    line = _re_ville_date.sub(_repl_ville_date, line)

    # Structured fields: Lieu de naissance, Ville de résidence (masked directly, no stop-word filter)
    _re_lieu = re.compile(r"(Lieu\s+de\s+naissance\s*:\s*)(\S.+)")
    def _repl_lieu(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "VILLE", m.group(2).strip(), PLACEHOLDERS["VILLE"]))
        return m.group(1) + PLACEHOLDERS["VILLE"]
    line = _re_lieu.sub(_repl_lieu, line)

    _re_ville_res = re.compile(r"(Ville\s+de\s+r[ée]sidence\s*:\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+)")
    def _repl_ville_res(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "VILLE", m.group(2).strip(), PLACEHOLDERS["VILLE"]))
        return m.group(1) + PLACEHOLDERS["VILLE"]
    line = _re_ville_res.sub(_repl_ville_res, line)

    # Uppercase PERSON with context, whitelist/short acronyms
    wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
    wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])

    _stop_rx = re.compile(_MEDICAL_STOP_WORDS, re.IGNORECASE)

    def _clean_name_span(span: str) -> str:
        """Truncates the span at the first medical/stop word."""
        tokens = span.split()
        clean = []
        for t in tokens:
            if _stop_rx.fullmatch(t):
                break
            clean.append(t)
        return " ".join(clean).strip(" .-'")

    def _repl_person_ctx(m: re.Match) -> str:
        span = m.group(1).strip(); raw = m.group(0)
        if span in wl_sections or raw in wl_phrases: return raw
        # Truncate before medical words
        cleaned = _clean_name_span(span)
        if not cleaned:
            return raw
        tokens = [t for t in cleaned.split() if t]
        if len(tokens) == 1 and len(tokens[0]) <= 4: return raw
        audit.append(PiiHit(page_idx, "NOM", cleaned, PLACEHOLDERS["NOM"]))
        return raw.replace(cleaned, PLACEHOLDERS["NOM"])

    line = RE_PERSON_CONTEXT.sub(_repl_person_ctx, line)

    # Mr/Mme + lone initial: "Mme Z", "Mr R" -> mask the letter
    def _repl_civilite_init(m: re.Match) -> str:
        prefix = m.group(1)
        lettre = m.group(2)
        audit.append(PiiHit(page_idx, "NOM", lettre, PLACEHOLDERS["NOM"]))
        return prefix + PLACEHOLDERS["NOM"]
    line = RE_CIVILITE_INITIALE.sub(_repl_civilite_init, line)

    # Extra pass: names in comma-separated lists after "Dr"
    # e.g. "le Dr DUVAL, MACHELART, LAZARO" -> mask each name
    for m in RE_DR_COMMA_LIST.finditer(line):
        fragment = m.group(0)
        # Extract the comma-separated segments (except the first, which includes "Dr")
        parts = [p.strip() for p in fragment.split(",")]
        for part in parts:
            # Extract the name tokens from each segment
            for tok in _NAME_TOKEN_RE.findall(part):
                if tok in wl_sections or len(tok) <= 3:
                    continue
                if _stop_rx.fullmatch(tok):
                    continue
                if tok not in line:
                    continue
                # Check it is not already masked
                if f"[{tok}]" in line or tok in PLACEHOLDERS.values():
                    continue
                audit.append(PiiHit(page_idx, "NOM", tok, PLACEHOLDERS["NOM"]))
                line = re.sub(rf"\b{re.escape(tok)}\b", PLACEHOLDERS["NOM"], line)

    return line


def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
    """Masks TEL, EMAIL, ADRESSE, CODE_POSTAL even in the 'key' part of a key:value line.
    Needed because lines like '13 avenue ... BAYONNE - Tel : 0559' are split on ':'."""
    def _repl_tel(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
        return PLACEHOLDERS["TEL"]
    key = RE_TEL_SLASH.sub(_repl_tel, key)
    key = RE_TEL.sub(_repl_tel, key)
    key = RE_TEL_COMPACT.sub(_repl_tel, key)

    def _repl_email(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
        return PLACEHOLDERS["EMAIL"]
    key = RE_EMAIL.sub(_repl_email, key)

    # ADRESSE
    def _repl_adresse(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "ADRESSE", m.group(0), PLACEHOLDERS["ADRESSE"]))
        return PLACEHOLDERS["ADRESSE"]
    key = RE_ADRESSE.sub(_repl_adresse, key)

    # CODE_POSTAL (includes the city)
    def _repl_cp(m: re.Match) -> str:
        audit.append(PiiHit(page_idx, "CODE_POSTAL", m.group(0), PLACEHOLDERS["CODE_POSTAL"]))
        return PLACEHOLDERS["CODE_POSTAL"]
    key = RE_CODE_POSTAL.sub(_repl_cp, key)

    # FINESS addresses via Aho-Corasick
    key, addr_matched = _mask_finess_addresses(key, return_matched_names=True)
    for matched_addr in addr_matched:
        audit.append(PiiHit(page_idx, "ADDR_FINESS", matched_addr, PLACEHOLDERS["ADRESSE"]))
    return key


def _kv_value_only_mask(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict[str, Any]) -> str:
    line = _mask_admin_label(line, audit, page_idx)
    parts = SPLITTER.split(line, maxsplit=1)
    if len(parts) == 2:
        key, value = parts
        masked_key = _mask_critical_in_key(key, audit, page_idx)
        masked_val = _mask_line_by_regex(value, audit, page_idx, cfg)
        return f"{masked_key.strip()} : {masked_val.strip()}"
    else:
        return _mask_line_by_regex(line, audit, page_idx, cfg)


# ----------------- Document-wide name extraction -----------------

def _is_trackare_document(text: str) -> bool:
    """Detects whether the document is a Trackare/TrakCare export (structured EHR)."""
    markers = ["Détails des patients", "Nom de naissance", "Dossier Patient"]
    t = text[:3000].lower()
    return sum(1 for m in markers if m.lower() in t) >= 2
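
# Illustrative check (synthetic header): at least 2 of the 3 markers must appear
# (case-insensitively) in the first 3000 characters.
#
#     >>> _is_trackare_document("Dossier Patient\nNom de naissance : X")
#     True
#     >>> _is_trackare_document("Compte rendu opératoire")
#     False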


def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, List[NameCandidate]]:
    """Parses the structured fields of a Trackare document to extract PII.
    Returns (name_tokens, pii_hits, force_names, candidates): the names to mask,
    the additional hits, the force_names subset, and the NameCandidate list
    for NER-first cross-validation."""
    names: set = set()
    hits: List[PiiHit] = []
    candidates: List[NameCandidate] = []

    force_names: set = set()  # names from structured contexts (DR., Signé, etc.) -> bypass stop words

    def _add_candidate(token: str, source: str, strength: str, bypass: bool):
        """Appends a NameCandidate to the list."""
        token = token.strip(" .-'(),")
        if len(token) < 4:
            return
        candidates.append(NameCandidate(
            token=token, source=source,
            context_strength=strength, bypass_stopwords=bypass,
        ))

    def _add_name(s: str, _cand_source: str = "", _cand_strength: str = "medium"):
        s = s.strip()
        parts = s.split()
        for tok in parts:
            tok = tok.strip(" .-'(),")
            if len(tok) >= 4 and tok[0].isupper():
                names.add(tok)
                if _cand_source:
                    _add_candidate(tok, _cand_source, _cand_strength, False)
        # Also keep the full compound name (DI LULLO, LE MOIGNE, etc.)
        if len(parts) >= 2:
            compound = " ".join(t.strip(" .-'(),") for t in parts if len(t.strip(" .-'(),")) >= 2)
            if len(compound) >= 5:
                names.add(compound)

    # Frequent non-name terms in Signé/DR./Note d'évolution contexts
    _FORCE_EXCLUDE = _MEDICATION_WHITELIST | {
        "elimination", "élimination", "forte", "intraveineuse", "lavage",
        "sonde", "normal", "réalisé", "realise", "germes", "bbm", "arw",
        "orale", "sachet", "injectable", "comprime", "comprimé", "gelule",
        "gélule", "seringue", "poche", "flacon", "ampoule", "preremplie",
        "préremplie",
    }

    def _add_name_force(tok: str, _cand_source: str = "", _cand_strength: str = "medium"):
        """Adds a name from a reliable structured context (DR., direct Signé, Note d'évolution).
        Bypasses the general stop words but filters medications and common care terms."""
        tok = tok.strip(" .-'(),")
        if len(tok) < 4 or not tok[0].isupper():
            return
        if _cand_source:
            _add_candidate(tok, _cand_source, _cand_strength, True)
        if tok.lower() in _FORCE_EXCLUDE:
            return
        # Extra filter: never force-add known medical words
        if tok.lower() in _MEDICAL_STOP_WORDS_SET:
            return
        names.add(tok)
        force_names.add(tok)

    # --- Patient identity (high context: structured EHR fields) ---
    # Nom de naissance: DIEGO (may appear twice: header + tabular recap)
    for m in re.finditer(r"Nom\s+de\s+naissance\s*:\s*(.+?)(?:\s+IPP\b|\s*$)", full_text, re.MULTILINE):
        _add_name(m.group(1).strip(), "trackare_nom_naissance", "high")

    # Nom et Prénom: DIEGO PATRICIA
    for m in re.finditer(r"Nom\s+et\s+Pr[ée]nom\s*:\s*(.+?)(?:\s+Date\s+de\s+naissance|\s*$)", full_text, re.MULTILINE):
        _add_name(m.group(1).strip(), "trackare_nom_prenom", "high")

    # Prénom de naissance / Prénom utilisé: REGINA
    for m in re.finditer(r"Pr[ée]nom\s+(?:de\s+naissance|utilis[ée])\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\s\-']+?)(?:\s*$)", full_text, re.MULTILINE):
        _add_name(m.group(1).strip(), "trackare_prenom", "high")

    # Lieu de naissance: BAYONNE, biarritz, 64102, 99999 -> mask as VILLE
    for m in re.finditer(r"Lieu\s+de\s+naissance\s*:\s*(\S[^\n]*?)(?:\s*$)", full_text, re.MULTILINE):
        val = m.group(1).strip()
        if val:
            hits.append(PiiHit(-1, "VILLE", val, PLACEHOLDERS["VILLE"]))
            # Add to the names set only if alphabetic (not the numeric INSEE codes)
            if re.match(r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç]", val):
                names.add(val)

    # Ville de résidence: TARNOS -> mask as VILLE
    for m in re.finditer(r"Ville\s+de\s+r[ée]sidence\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû\s\-']+?)(?:\s*$)", full_text, re.MULTILINE):
        val = m.group(1).strip()
        hits.append(PiiHit(-1, "VILLE", val, PLACEHOLDERS["VILLE"]))
        names.add(val)

    # Code Postal (all occurrences)
    for m in re.finditer(r"[Cc]ode\s*[Pp]ostal\s*:\s*(\d{5})", full_text):
        hits.append(PiiHit(-1, "CODE_POSTAL", m.group(1), PLACEHOLDERS["CODE_POSTAL"]))

    # N° épisode (= NDA, stay identifier)
    for m in re.finditer(r"Episode\s*N[o°.]?\s*\.?\s*:\s*(\d{5,})", full_text):
        hits.append(PiiHit(-1, "EPISODE", m.group(1), PLACEHOLDERS.get("NDA", "[NDA]")))

    # Bare RPPS numbers (11 digits starting with 1 or 2, alone on a line)
    for m in re.finditer(r"^\s*([12]\d{10})\s*$", full_text, re.MULTILINE):
        hits.append(PiiHit(-1, "RPPS", m.group(1), PLACEHOLDERS["RPPS"]))

    # Patient address (all occurrences)
    for m in re.finditer(r"Adresse\s*:\s*(.+?)(?:\s+Ville\s+de\s+r[ée]sidence|\s*$)", full_text, re.MULTILINE):
        val = m.group(1).strip()
        if len(val) > 3:
            hits.append(PiiHit(-1, "ADRESSE", val, PLACEHOLDERS["ADRESSE"]))

    # --- Footer: "Patient : NOM PRENOM - Date de naissance..." (high context) ---
    for m in re.finditer(r"Patient\s*:\s*(.+?)\s*-\s*Date\s+de\s+naissance", full_text):
        _add_name(m.group(1).strip(), "trackare_patient_footer", "high")

    # --- Current physician (all occurrences) (medium context) ---
    for m in re.finditer(r"Médecin\s+courant\s*:\s*(?:DR\.?\s*)?(.+?)(?:\s*$)", full_text, re.MULTILINE):
        _add_name(m.group(1).strip(), "trackare_medecin_courant", "medium")

    # --- Attending physician (line after "Nom Adresse Téléphone") (medium context) ---
    for m in re.finditer(r"Médecin\s+traitant\s*\n.*?Nom\s+Adresse\s+Téléphone\s*\n\s*(?:DR\.?\s*)?(.+?)(?:\d{5}|\s*$)", full_text, re.MULTILINE):
        _add_name(m.group(1).strip(), "trackare_medecin_traitant", "medium")

    # --- Structured contacts (medium context) ---
    # Pattern: Relation NOM PRENOM [ADRESSE] [TEL]
    # Accepts lowercase (Trackare sometimes writes "Conjoint vandestock michele")
    # Captures up to 3 tokens for compound names (le moigne christophe)
    # Covers "Personne à prévenir" + family relations + Ami/Voisin/Autre
    for m in re.finditer(
        r"(?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur|Ami|Amie|Voisin|Voisine|Autre)\s+"
        r"([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+)"
        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?"
        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?",
        full_text,
    ):
        contact_parts = [g.strip(" .-'(),") for g in (m.group(1), m.group(2), m.group(3)) if g]
        # Add every token >= 4 chars (not short articles like "le", "di", nor 3-letter acronyms)
        for tok in contact_parts:
            _add_candidate(tok, "trackare_contact", "medium", False)
            if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
                names.add(tok)
                if tok[0].islower():
                    names.add(tok.capitalize())
        # Also add the full compound (for "le moigne", "di lullo")
        if len(contact_parts) >= 2:
            compound = " ".join(contact_parts)
            if len(compound) >= 5:
                names.add(compound)
                # Capitalised version for propagation
                names.add(" ".join(t.capitalize() for t in compound.split()))

    # --- Personne à prévenir: structured multi-line block ---
    # Trackare format: "Personne à prévenir\nRelation\nNOM\nPrenom" or
    # "Personne à prévenir\nRelation NOM Prenom\nAdresse..."
    for m in re.finditer(
        r"[Pp]ersonne\s+[àa]\s+pr[ée]venir\s*[:\-]?\s*\n"
        r"(?:[^\n]{0,30}\n){0,2}"  # 0-2 intermediate lines (relation, etc.)
        r"\s*([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+)"
        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?"
        r"(?:\s*\n\s*([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?",
        full_text,
    ):
        for g in (m.group(1), m.group(2), m.group(3)):
            if g:
                tok = g.strip(" .-'(),")
                if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
                    names.add(tok)
                    if tok[0].islower():
                        names.add(tok.capitalize())
    # --- Prescribers / performers (Trackare) (medium context) ---
    for m in re.finditer(
        r"(?:Prescripteur|Prescrit\s+par|Exécut[ée]\s+par|Réalisé\s+par)\s*:?\s*"
        r"(?:(?:Dr|Pr)\.?\s+)?"
        r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+))?",
        full_text,
    ):
        _add_name(m.group(1), "trackare_prescripteur", "medium")
        if m.group(2):
            _add_name(m.group(2), "trackare_prescripteur", "medium")

    # --- ER physicians (IAO, prise en charge, décision) (medium context) ---
    for m in re.finditer(r"IAO\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)", full_text):
        _add_name(m.group(1), "trackare_iao", "medium")
    for m in re.finditer(
        r"Médecin\s+de\s+la\s+(?:prise\s+en\s+charge|décision)\s+médicale\s+"
        r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)"
        r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+))?",
        full_text,
    ):
        _add_name(m.group(1), "trackare_medecin_urgences", "medium")
        if m.group(2):
            _add_name(m.group(2), "trackare_medecin_urgences", "medium")

    # --- Caregiver names in Notes d'évolution / Notes IDE / medical notes (low context) ---
    # Pattern: "Note IDE\nPrenom NOM" or "Note d'évolution\nPrenom NOM"
    for m in re.finditer(
        r"Note\s+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])\s*\n\s*"
        r"([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][a-zéèàùâêîôûäëïöüç]+)\s+"
        r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)",
        full_text
    ):
        prenom, nom = m.group(1), m.group(2)
        _add_candidate(prenom, "trackare_note_ide_newline", "low", False)
        _add_candidate(nom, "trackare_note_ide_newline", "low", False)
        if prenom.lower() not in _MEDICAL_STOP_WORDS_SET:
            _add_name(prenom)
        if nom.lower() not in _MEDICAL_STOP_WORDS_SET:
            _add_name(nom)

    # --- Multi-line caregiver names: "Prénom\nNOM" in prescription/care tables (low context) ---
    for m in re.finditer(
        r'\b([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,})\s*\n\s*([A-ZÉÈÀÙÂÊÎÔÛ]{4,})\b',
        full_text
    ):
        prenom, nom = m.group(1), m.group(2)
        _add_candidate(prenom, "trackare_prenom_nom_multiline", "low", False)
        _add_candidate(nom, "trackare_prenom_nom_multiline", "low", False)
        if prenom.lower() not in _MEDICAL_STOP_WORDS_SET and nom.lower() not in _MEDICAL_STOP_WORDS_SET:
            _add_name(prenom)
            _add_name(nom)

    # --- Caregiver names on the same line as "Note d'évolution" (e.g. "Note d'évolution LACLAU-") (low context) ---
    for m in re.finditer(
        r"Note[ \t]+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])[ \t]+"
        r"(?:DR\.?[ \t]+)?"
        r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
        r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
        full_text
    ):
        for g in (m.group(1), m.group(2)):
            if g:
                tok = g.rstrip('-')
                if len(tok) >= 4:
                    _add_name_force(tok, "trackare_note_ide_inline", "low")

    # --- "Signé" directly followed by a caregiver name (e.g. "Signé LARRIEU-") (low context) ---
    # IMPORTANT: [ \t]+ (not \s+) to avoid capturing medications on the next line
    for m in re.finditer(
        r"Signé[ \t]+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
        r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
        full_text
    ):
        for g in (m.group(1), m.group(2)):
            if g:
                tok = g.rstrip('-')
                if len(tok) >= 4:
                    _add_name_force(tok, "trackare_signe_direct", "low")

    # --- "Signé —" + medication + caregiver name (e.g. "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") (low context) ---
    for m in re.finditer(
        r"Signé[ \t]+—[ \t]+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)[ \t]+[-]?[ \t]*"
        r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{3,})"
        r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,}))?",
        full_text
    ):
        for g in (m.group(1), m.group(2)):
            if g:
                tok = g.rstrip('-')
                _add_candidate(tok, "trackare_signe_med", "low", False)
                if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
                    _add_name(tok)

    # --- Caregiver names after a medication packaging word (e.g. "Flacon(s) LACROUTS") (low context) ---
    for m in re.finditer(
        r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?[ \t]+"
        r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{3,})",
        full_text
    ):
        tok = m.group(1).rstrip('-')
        _add_candidate(tok, "trackare_flacon", "low", False)
        if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
            _add_name(tok)

    # --- "DR." / "DR" followed by a lone first name (e.g. "DR. Ute", "DR. Tam") in prescriptions (medium context) ---
    for m in re.finditer(
        r"DR\.?[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,})"
        r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
        full_text
    ):
        for g in (m.group(1), m.group(2)):
            if g:
                tok = g.strip()
                if len(tok) >= 4:
                    _add_name_force(tok, "trackare_dr", "medium")

    # --- Caregiver names after timestamps in care activities (e.g. "07:00 ETCHEBARNE") (low context) ---
    # Trackare format: care actions followed by "HH:MM NOM" or "HH : MM NOM"
    # Restrictive pattern: ALL-CAPS name of 4+ letters, stop-word filtered (noisy pattern)
    for m in re.finditer(
        r"\d{1,2}[ \t]*:[ \t]*\d{2}[ \t]+"
        r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})"
        r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,}))?",
        full_text
    ):
        for g in (m.group(1), m.group(2)):
            if g:
                tok = g.rstrip('-')
                _add_candidate(tok, "trackare_timestamp", "low", False)
                if len(tok) >= 4 and tok.lower() not in _MEDICAL_STOP_WORDS_SET:
                    _add_name(tok)

    # Filter out tokens that are too short or stop words.
    # Exceptions: force_names (structured contexts) and city_tokens (extracted cities)
    city_tokens = {h.original for h in hits if h.kind == "VILLE"}
    filtered = set()
    for tok in names:
        if tok in city_tokens or tok in force_names:
            filtered.add(tok)
            continue
        if len(tok) < 4:
            continue
        if tok.lower() in _MEDICAL_STOP_WORDS_SET:
            continue
        filtered.add(tok)

    return filtered, hits, force_names, candidates
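
# Illustrative sketch (synthetic fragment, hypothetical names): on
#     "Nom de naissance : DIEGO\nPrénom utilisé : PATRICIA\nVille de résidence : TARNOS"
# the extractor returns DIEGO and PATRICIA in `names` (high-context candidates)
# plus a VILLE PiiHit for TARNOS, which is also kept in `names` for propagation.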


def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, set, List[NameCandidate]]:
    """Pre-scans the raw document to extract person names
    from structured fields (Patient, Rédigé par, etc.).
    Returns (names, force_names, candidates): the set of tokens to mask,
    the subset that bypasses stop words, and the list of NameCandidate
    with confidence metadata for NER-first cross-validation."""
    wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
    wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])
    names: set = set()
    force_names: set = set()
    candidates: List[NameCandidate] = []

    def _add_candidate(token: str, source: str, strength: str, bypass: bool):
        """Appends a NameCandidate to the list (deduplicated by token+source)."""
        token = token.strip(" .-'")
        if len(token) < 4:
            return
        candidates.append(NameCandidate(
            token=token, source=source,
            context_strength=strength, bypass_stopwords=bypass,
        ))

    def _add_compound(match_str: str):
        """Adds the full compound name on top of the individual tokens (DI LULLO, LE MOIGNE)."""
        parts = [t.strip(" .-'") for t in match_str.split() if len(t.strip(" .-'")) >= 2]
        if len(parts) >= 2:
            compound = " ".join(parts)
            if len(compound) >= 5:
                names.add(compound)

    def _add_tokens(match_str: str, _cand_source: str = "", _cand_strength: str = "medium"):
        _add_compound(match_str)
        for token in match_str.split():
            token = token.strip(" .-'")
            if len(token) < 4:
                continue
            if token.upper() in wl_sections or token in wl_phrases:
                continue
            if _cand_source:
                _add_candidate(token, _cand_source, _cand_strength, False)
            if token.lower() in _MEDICAL_STOP_WORDS_SET:
                continue
            names.add(token)

    def _add_tokens_force_all(match_str: str, _cand_source: str = "", _cand_strength: str = "high"):
        """Bypasses stop words for ALL tokens (Patient: context, near-certain)."""
        _add_compound(match_str)
        for token in match_str.split():
            token = token.strip(" .-'")
            if len(token) < 4:
                continue
            if token.upper() in wl_sections or token in wl_phrases:
                continue
            if _cand_source:
                _add_candidate(token, _cand_source, _cand_strength, True)
            names.add(token)
            force_names.add(token)

    def _add_tokens_force_first(match_str, _cand_source: str = "", _cand_strength: str = "medium"):
        """Like _add_tokens but forces ALL tokens (strong Dr/Mme context).

        After Dr/Mme, every token is a name, even a homonym of a medical
        term (e.g. Dr Laurence MASSE).
        """
        _add_compound(match_str)
        tokens = match_str.split()
        for token in tokens:
            token = token.strip(" .-'")
            if len(token) < 4:
                continue
            if token.upper() in wl_sections or token in wl_phrases:
                continue
            if _cand_source:
                _add_candidate(token, _cand_source, _cand_strength, True)
            names.add(token)
            force_names.add(token)
    # --- high context: structured EHR fields (near-certain patient identity) ---
    for m in RE_EXTRACT_PATIENT.finditer(full_text):
        _add_tokens_force_all(m.group(1), "RE_EXTRACT_PATIENT", "high")
    # Structured identity fields (Trackare / EHR)
    for m in RE_EXTRACT_NOM_NAISSANCE.finditer(full_text):
        _add_tokens_force_all(m.group(1), "RE_EXTRACT_NOM_NAISSANCE", "high")
    for m in RE_EXTRACT_NOM_UTILISE.finditer(full_text):
        _add_tokens_force_all(m.group(1), "RE_EXTRACT_NOM_UTILISE", "high")
    for m in RE_EXTRACT_NOM_PRENOM.finditer(full_text):
        _add_tokens_force_all(m.group(1), "RE_EXTRACT_NOM_PRENOM", "high")
    for m in RE_EXTRACT_PRENOM.finditer(full_text):
        _add_tokens_force_all(m.group(1), "RE_EXTRACT_PRENOM", "high")
    # "Courrier Epi - NOM, PRENOM" header (discharge letters)
    for m in RE_EXTRACT_COURRIER.finditer(full_text):
        for part in m.group(1).split(","):
            part = part.strip()
            if part:
                _add_tokens_force_all(part, "RE_EXTRACT_COURRIER", "high")

    # --- medium context: medical titles (Dr, Mme, Pr, Opérateur, etc.) ---
    for m in RE_EXTRACT_REDIGE.finditer(full_text):
        _add_tokens(m.group(1), "RE_EXTRACT_REDIGE", "medium")
    for m in RE_EXTRACT_MME_MR.finditer(full_text):
        _add_tokens_force_first(m.group(1), "RE_EXTRACT_MME_MR", "medium")
    for m in RE_EXTRACT_DR_DEST.finditer(full_text):
        _add_tokens_force_first(m.group(1), "RE_EXTRACT_DR_DEST", "medium")
    for m in RE_EXTRACT_LIEU_NAISSANCE.finditer(full_text):
        _add_tokens(m.group(1), "RE_EXTRACT_LIEU_NAISSANCE", "medium")
    for m in RE_EXTRACT_VILLE_RESIDENCE.finditer(full_text):
        _add_tokens(m.group(1), "RE_EXTRACT_VILLE_RESIDENCE", "medium")
    # Structured contacts (conjoint, concubin, etc.)
    for m in RE_EXTRACT_CONTACT.finditer(full_text):
        _add_tokens(m.group(1), "RE_EXTRACT_CONTACT", "medium")
        if m.group(2):
            _add_tokens(m.group(2), "RE_EXTRACT_CONTACT", "medium")
    # Medical staff with a role (Aide, Cadre Infirmier, Prescripteur, etc.)
    for m in RE_EXTRACT_STAFF_ROLE.finditer(full_text):
        _add_tokens(m.group(1), "RE_EXTRACT_STAFF_ROLE", "medium")
    # Pr / Professeur + name(s)
    for m in RE_EXTRACT_PR.finditer(full_text):
        _add_tokens_force_first(m.group(1), "RE_EXTRACT_PR", "medium")
    # Opérateur / Anesthésiste / Chirurgien + name(s)
    for m in RE_EXTRACT_OPERATEUR.finditer(full_text):
        _add_tokens_force_first(m.group(1), "RE_EXTRACT_OPERATEUR", "medium")
    # Practice name (e.g. "CABINET ETXEBARNONDOA")
    for m in RE_EXTRACT_CABINET.finditer(full_text):
        _add_tokens(m.group(1), "RE_EXTRACT_CABINET", "medium")

    # Names in comma-separated lists after Dr/Docteur or Mmes/Mme
    # e.g. "le Dr DUVAL, MACHELART, CHARLANNE, LAZARO, il a été proposé"
    # e.g. "Mmes Anorga, Goyenaga, Martinez et Murcy"
    for m_comma in RE_DR_COMMA_LIST.finditer(full_text):
        fragment = m_comma.group(0)
        parts = [p.strip() for p in fragment.split(",")]
        for part in parts:
            for tok in _NAME_TOKEN_RE.findall(part):
                tok = tok.strip(" .-'")
                if len(tok) < 4:
                    continue
                if tok.upper() in wl_sections or tok in wl_phrases:
                    continue
                _add_candidate(tok, "RE_DR_COMMA_LIST", "medium", False)
                if tok.lower() in _MEDICAL_STOP_WORDS_SET:
                    continue
                names.add(tok)
    _CIVILITE_PREFIX_RE = re.compile(
        r"^\s*(?:Mmes|Mme|Mesdames|Mrs|Mr\.?|Messieurs|Monsieur|Madame|Dr\.?|Docteur)\s+",
        re.IGNORECASE,
    )
    for m_comma in RE_CIVILITE_COMMA_LIST.finditer(full_text):
        fragment = m_comma.group(0)
        # Strip the civility prefix (Mmes, Dr, etc.) from the first part
        fragment = _CIVILITE_PREFIX_RE.sub("", fragment)
        parts = [p.strip() for p in fragment.split(",")]
        for part in parts:
            # Drop a leading "et " from the token
            part = re.sub(r"^\s*et\s+", "", part)
            for tok in _NAME_TOKEN_RE.findall(part):
                tok = tok.strip(" .-'")
                if len(tok) < 4:
                    continue
                if tok.upper() in wl_sections or tok in wl_phrases:
                    continue
                _add_candidate(tok, "RE_CIVILITE_COMMA_LIST", "medium", False)
                if tok.lower() in _MEDICAL_STOP_WORDS_SET:
                    continue
                names.add(tok)

    # Dr X.NOM / Pr X.NOM: initial glued to the name (e.g. "Dr E.ELLIE", "Pr J.DUPONT")
    _RE_DR_INITIAL_DOT_NAME = re.compile(
        r"\b(?:Dr\.?|Docteur|Pr\.?|Professeur)[ \t]+"
        r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])\.([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]{2,})"
    )
    for m in _RE_DR_INITIAL_DOT_NAME.finditer(full_text):
        names.add(m.group(2))  # the surname (ELLIE)
        force_names.add(m.group(2))
        _add_candidate(m.group(2), "DR_INITIAL_DOT_NAME", "medium", True)
        # Also add the full "X.NOM" for the raster pass (glued token)
        names.add(f"{m.group(1)}.{m.group(2)}")
        force_names.add(f"{m.group(1)}.{m.group(2)}")

    # Email headers: "De : Prénom NOM <email>", "À : Prénom NOM <email>"
    # Handles \xa0 (non-breaking space) and a missing space after ":"
    _RE_EMAIL_HEADER = re.compile(
        r"(?:De|From|À|A|To|Cc|Cci|Bcc)[\s\xa0]*:[\s\xa0]*"
        r"([A-ZÀ-ÖØ-Ý][a-zà-öø-ÿ]+(?:[\s\xa0]+[A-ZÀ-ÖØ-Ý][A-ZÀ-ÖØ-Ýa-zà-öø-ÿ\-]+)+)"
        r"[\s\xa0]*<[^>]+>",
        re.MULTILINE
    )
    for m in _RE_EMAIL_HEADER.finditer(full_text):
        _add_tokens_force_all(m.group(1), "EMAIL_HEADER", "medium")

    # For hyphenated compound names (e.g. "LACLAU-LACROUTS"), also add the
    # individual parts to catch standalone occurrences.
    # _apply_extracted_names handles the compound first (longer), then the parts.
    # The parts are forced (bypass stop words) because the compound itself is a
    # confirmed name. E.g. "BILLON-GRAND" -> "GRAND" must be masked even though
    # "grand" is a common word, since it is a component of a detected person name.
    compound_names = {n for n in names if "-" in n}
    for compound in compound_names:
        for part in compound.split("-"):
            part = part.strip()
            if len(part) >= 3:
                names.add(part)
                force_names.add(part)

    return names, force_names, candidates


def _cross_validate_name_candidates(
    candidates: List[NameCandidate],
    ner_detections: List["NerDetection"],
    insee_noms: set,
    insee_prenoms: set,
    medical_stopwords: set,
) -> Tuple[set, set]:
    """Validates the regex-extracted name candidates by crossing them with the
    NER detections and the INSEE gazetteers.

    Decision matrix:
    - high context -> always accept (structured EHR fields, near-certain)
    - medium context + NER-confirmed -> accept
    - medium context + not NER + INSEE + not stopword -> accept
    - medium context + not NER + not INSEE + stopword -> REJECT
    - medium context + not NER + not INSEE + not stopword -> accept (benefit of the doubt)
    - low context + NER-confirmed -> accept
    - low context + not NER + INSEE + not stopword -> accept
    - low context + not NER + stopword -> REJECT
    - low context + not NER + not INSEE -> REJECT

    A name is "NER-confirmed" if some NerDetection has a matching token (case-insensitive).

    Returns:
        (validated_names, validated_force_names): sets of validated tokens.
    """
    # Build the set of NER-confirmed tokens (uppercase, accents stripped, for matching)
    ner_confirmed_tokens: set = set()
    for det in ner_detections:
        ner_confirmed_tokens.add(_normalize_nfkd_upper(det.token))

    validated_names: set = set()
    validated_force_names: set = set()

    for cand in candidates:
        tok = cand.token
        tok_upper = _normalize_nfkd_upper(tok)
        tok_lower = tok.lower()

        is_ner_confirmed = tok_upper in ner_confirmed_tokens
        is_in_insee = tok_upper in insee_noms or tok_upper in insee_prenoms
        is_stopword = tok_lower in medical_stopwords

        strength = cand.context_strength

        accepted = False

        if strength == "high":
            # Always accept structured EHR fields
            accepted = True
        elif cand.bypass_stopwords:
            # Force names (confirmed Dr/Mme context) -> always accept,
            # even if the token is a stop word (e.g. Dr MASSE, Dr GRAND)
            accepted = True
        elif strength == "medium":
            if is_ner_confirmed:
                accepted = True
            elif is_in_insee and not is_stopword:
                accepted = True
            elif not is_in_insee and is_stopword:
                accepted = False  # REJECT
            else:
                # not NER + not INSEE + not stopword -> benefit of the doubt
                accepted = True
        elif strength == "low":
            if is_ner_confirmed:
                accepted = True
            elif is_in_insee and not is_stopword:
                accepted = True
            elif is_stopword:
                accepted = False  # REJECT
            elif not is_in_insee:
                accepted = False  # REJECT
        else:
            accepted = False

        if accepted:
            validated_names.add(tok)
            if cand.bypass_stopwords:
                validated_force_names.add(tok)

    return validated_names, validated_force_names
|
||
|
||
|
||
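# Standalone sketch (not called anywhere) of the decision matrix above, written
# against plain booleans rather than NameCandidate instances. The bypass_stopwords
# escape hatch is not modelled here; tokens and contexts are hypothetical.
def _demo_decision_matrix() -> None:
    def decide(strength: str, ner: bool, insee: bool, stopword: bool) -> bool:
        if strength == "high":
            return True
        if ner:
            return True
        if insee and not stopword:
            return True
        if strength == "medium":
            # medium context: only the (not INSEE + stopword) case is rejected
            return not (not insee and stopword)
        return False

    # NER-confirmed token → accepted even in low context
    assert decide("low", ner=True, insee=False, stopword=False)
    # Medical stop word, no NER, not INSEE → rejected
    assert not decide("medium", ner=False, insee=False, stopword=True)
    # Unknown token in medium context → benefit of the doubt
    assert decide("medium", ner=False, insee=False, stopword=False)

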
def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_names: Optional[set] = None) -> str:
    """Globally replaces each extracted name in the text."""
    placeholder = PLACEHOLDERS["NOM"]
    _force = force_names or set()
    safe_names = set()
    for n in names:
        if len(n) < 4 and n not in _force:
            # Tokens < 4 chars: accept ONLY force_names (e.g. "Ute" after Dr)
            continue
        if n.lower() in _DPI_LABELS_SET:
            continue
        # "Saint"/"SAINT" alone = block. Compound "Saint-Germes" = let through
        if n.upper() in ("SAINT", "SAINTE") and "-" not in n:
            continue
        if n not in _force and n.lower() in _MEDICAL_STOP_WORDS_SET:
            continue
        # Absolute whitelist (user-configured via the GUI): overrides everything,
        # including force_names (Dr/Mme). If the site declares a term as
        # "never mask", honor it even when it appears after "Dr".
        if n.lower() in _WHITELIST_NEVER_MASK_TOKENS:
            continue
        safe_names.add(n)
    # Add one global hit (page=-1) per name for the raster PDF redaction
    # (a single hit is enough — redact_pdf_raster looks for the token on every page).
    # Forced names (Dr/Mme context) use NOM_FORCE to bypass the stop-word
    # filter in the raster pass.
    for token in sorted(safe_names, key=len, reverse=True):
        kind = "NOM_FORCE" if token in _force else "NOM_GLOBAL"
        audit.append(PiiHit(-1, kind, token, placeholder))
    for token in sorted(safe_names, key=len, reverse=True):
        pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
        new_text = []
        last_end = 0
        for m in pattern.finditer(text):
            # Don't replace if already inside a placeholder
            ctx_start = max(0, m.start() - 1)
            ctx_end = min(len(text), m.end() + 1)
            if "[" in text[ctx_start:m.start()] or "]" in text[m.end():ctx_end]:
                continue
            # Don't replace if the token is half of a hyphenated compound (hyphen + letter).
            # E.g. "NOCENT-EJNAINI" → don't replace NOCENT alone,
            # but "LACLAU-" (truncation hyphen) → replace.
            # Also handles the cross-line case: "BILLON-\nGRAND" (intact name)
            # but not "[NOM]-\nGRAND" (already partially masked → replace)
            if m.start() > 0 and text[m.start() - 1] == "-":
                if m.start() >= 2 and text[m.start() - 2].isalpha():
                    continue
            # Cross-line: "\n" right before, hyphen before the "\n", letter before the hyphen
            if m.start() > 1 and text[m.start() - 1] == "\n" and text[m.start() - 2] == "-":
                pre_pos = m.start() - 3
                if pre_pos >= 0 and text[pre_pos].isalpha():
                    continue  # Intact compound (BILLON-\nGRAND) → skip
                # If the hyphen follows a placeholder ([NOM]-\nGRAND) → replace
            if m.end() < len(text) and text[m.end()] == "-":
                if m.end() + 1 < len(text) and text[m.end() + 1].isalpha():
                    continue
            # DISABLED: NOM_EXTRACTED generated 3,846 FPs (77.7% of the total) with 0 TPs.
            # This per-occurrence audit line was far too aggressive and created massive false positives.
            # audit.append(PiiHit(-1, "NOM_EXTRACTED", m.group(0), placeholder))
            new_text.append(text[last_end:m.start()])
            new_text.append(placeholder)
            last_end = m.end()
        new_text.append(text[last_end:])
        text = "".join(new_text)
    return text


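# Sketch (not called anywhere) of the hyphen-adjacency guard above: a token that
# is one half of an intact compound is skipped; a hyphen that follows a
# placeholder (or nothing alphabetic) does not block the replacement.
def _demo_hyphen_guard() -> None:
    def is_compound_half(text: str, start: int) -> bool:
        # Mirrors the first guard: "-" just before the match, letter before the "-"
        return (start > 0 and text[start - 1] == "-"
                and start >= 2 and text[start - 2].isalpha())

    assert is_compound_half("NOCENT-EJNAINI", 7)      # intact compound → skip
    assert not is_compound_half("[NOM]-EJNAINI", 6)   # after a placeholder → replace

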
def _apply_trackare_hits_to_text(text: str, audit: List[PiiHit]) -> str:
    """Applies the non-NOM PiiHits to the text (NDA footers, EPISODE, RPPS, FINESS, etc.).

    These hits are detected by _extract_trackare_identity or by phase 0c,
    but used to be applied only to the raster PDF, not to the .pseudonymise.txt file."""
    _APPLY_KINDS = {"EPISODE", "RPPS", "FINESS"}
    # Collect the values to replace, grouped by placeholder
    replacements: Dict[str, str] = {}  # original → placeholder
    for h in audit:
        if h.kind in _APPLY_KINDS and h.original and len(h.original.strip()) >= 4:
            replacements[h.original.strip()] = h.placeholder
    # Replace the longest values first (avoids partial replacements)
    for original in sorted(replacements, key=len, reverse=True):
        placeholder = replacements[original]
        escaped = re.escape(original)
        # Word boundary so words aren't broken (e.g. ONDANSETRON)
        text = re.sub(rf"\b{escaped}\b", placeholder, text)
        # Also handle starred formats (*640000162*)
        text = re.sub(rf"\*{escaped}\*", placeholder, text)
    return text


# ----------------- Anonymization (regex) -----------------


def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]], cfg: Dict[str, Any],
                             eds_pseudo_mgr=None, gliner_mgr=None, camembert_mgr=None) -> AnonResult:
    audit: List[PiiHit] = []

    # Phase 0: global name extraction from the structured fields
    full_raw = "\n".join(pages_text) + "\n" + "\n".join(
        "\n".join(rows) for rows in tables_lines
    )
    extracted_names, doc_force_names, doc_candidates = _extract_document_names(full_raw, cfg)

    # Phase 0b: for Trackare documents, reinforced extraction of structured PII
    is_trackare = _is_trackare_document(full_raw)
    trackare_force_names: set = set()
    all_candidates: List[NameCandidate] = list(doc_candidates)
    if is_trackare:
        trackare_names, trackare_hits, trackare_force_names, trackare_candidates = _extract_trackare_identity(full_raw)
        extracted_names.update(trackare_names)
        audit.extend(trackare_hits)
        all_candidates.extend(trackare_candidates)

    # --- NER-first: cross-validation of the regex-extracted names ---
    # Run NER on the original (unmasked) text if a NER engine is available
    ner_detections: List[NerDetection] = []
    if eds_pseudo_mgr or gliner_mgr or camembert_mgr:
        ner_detections = _run_ner_on_original_text(
            pages_text, eds_pseudo_mgr, gliner_mgr, camembert_mgr, cfg
        )

    # Validate the candidates by crossing NER + INSEE
    if ner_detections or _INSEE_NOMS_FAMILLE:
        validated_names, validated_force = _cross_validate_name_candidates(
            all_candidates, ner_detections, _INSEE_NOMS_FAMILLE, _INSEE_PRENOMS_SET,
            _MEDICAL_STOP_WORDS_SET
        )
        # Use the validated names
        all_names = validated_names
        all_force_names = validated_force

        # Add the NER-only detections (names found by NER but not by regex)
        for det in ner_detections:
            if det.label in ("NOM", "PRENOM") and len(det.token) >= 4:
                if det.token.lower() not in _MEDICAL_STOP_WORDS_SET:
                    all_names.add(det.token)

        log.info("NER-first cross-validation: %d names validated (%d forced), "
                 "%d NER-only added (out of %d NER detections)",
                 len(validated_names), len(validated_force),
                 len(all_names) - len(validated_names), len(ner_detections))
    else:
        # No NER, no INSEE → fall back to the original behaviour
        all_names = extracted_names
        all_force_names = doc_force_names | trackare_force_names

    # Phase 0c-url: detect and mask URLs (including ones split by a line break)
    # E.g. "https://courrier\n.avenir-numerique.fr/owa/#path=/mail/inbox"
    _RE_URL_MULTILINE = re.compile(
        r"(https?://\S+)\n(\.[a-zA-Z0-9\-]+\.[a-zA-Z]{2,}(?:/[^\s]*)?)",
        re.MULTILINE
    )
    for m in _RE_URL_MULTILINE.finditer(full_raw):
        full_url = m.group(1) + m.group(2)
        audit.append(PiiHit(-1, "URL", full_url, PLACEHOLDERS["MASK"]))
        # Also mask the two parts separately (for line-by-line matching)
        audit.append(PiiHit(-1, "URL", m.group(1), PLACEHOLDERS["MASK"]))
        audit.append(PiiHit(-1, "URL", m.group(2).lstrip("."), PLACEHOLDERS["MASK"]))

    # Phase 0c: multiline FINESS detection (label and number on separate lines,
    # with possibly 0-2 masked or empty intermediate lines)
    _RE_FINESS_MULTILINE = re.compile(
        r"(?:N°\s*)?[Ff]iness?\s*\n(?:[^\n]*\n){0,2}\s*\*?(\d{9})\*?", re.MULTILINE
    )
    for m in _RE_FINESS_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "FINESS", m.group(1), PLACEHOLDERS["FINESS"]))

    # Phase 0d: multiline birth date (label and date on separate lines)
    # E.g. "Né(e) le :\n07/04/1943" or "Date de naissance\n01/02/1950"
    # Wide variant: tolerates 0-3 intermediate lines (BACTERIO tables)
    _RE_DATE_NAISSANCE_MULTILINE = re.compile(
        r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n"
        r"(?:[^\n]*\n){0,3}\s*"
        r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})",
        re.IGNORECASE,
    )
    for m in _RE_DATE_NAISSANCE_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "DATE_NAISSANCE", m.group(1), PLACEHOLDERS["DATE_NAISSANCE"]))

    # Phase 0e: multiline IPP (N°Ipp :\n20023294 or I.P.P. :\nS1032021)
    _RE_IPP_MULTILINE = re.compile(
        r"(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*\n\s*([A-Za-z0-9]{6,})\b",
        re.IGNORECASE,
    )
    for m in _RE_IPP_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))

    # Phase 0f: multiline DEMANDE N° (DEMANDE N°\n2300261164)
    _RE_DEMANDE_MULTILINE = re.compile(
        r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
        re.IGNORECASE,
    )
    for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))

    # Phase 0g: multiline N° venue (BACTERIO tables: label and value split)
    _RE_VENUE_MULTILINE = re.compile(
        r"N[°o]?\s*venue\s*[:\-]?\s*\n(?:[^\n]*\n){0,2}\s*(\d{6,})",
        re.IGNORECASE,
    )
    for m in _RE_VENUE_MULTILINE.finditer(full_raw):
        audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))
    # Phase 0g-bis: reversed N° venue (layout-aware reordering puts the number BEFORE the label)
    _RE_VENUE_REVERSE = re.compile(
        r"(?<!\d)(\d{7,10})(?!\d)\s*\n(?:[^\n]*\n){0,4}N[°o]?\s*venue\s*[:\-]?\s*$",
        re.IGNORECASE | re.MULTILINE,
    )
    for m in _RE_VENUE_REVERSE.finditer(full_raw):
        audit.append(PiiHit(-1, "NDA", m.group(1), PLACEHOLDERS["NDA"]))

    # Phase 1: line-by-line masking (classic regexes)
    out_pages: List[str] = []
    for i, page_txt in enumerate(pages_text):
        lines = (page_txt or "").splitlines()
        masked = [_kv_value_only_mask(ln, audit, i, cfg) for ln in lines]
        out_pages.append("\n".join(masked))
    table_blocks: List[str] = []
    for i, rows in enumerate(tables_lines):
        mbuf: List[str] = []
        for r in rows:
            masked = _kv_value_only_mask(r, audit, i, cfg)
            mbuf.append(masked)
        if mbuf:
            table_blocks.append("\n".join(mbuf))
    tables_block = "\n\n".join(table_blocks)
    text_out = "\f".join(out_pages)  # page separator
    # NOTE: the [TABLES] block is NOT appended to text_out.
    # pdfplumber often extracts the main content as a "table", creating a full
    # duplicate of the text. That duplicate escaped both the NER and the rescan
    # (protected by the [TABLES] markers), and the EDS-pseudo NER corrupted the
    # markers by changing the text length → massive PII leak (birth dates,
    # addresses, names). PII detected in tables is still in the audit (Phase 1 regex).

    # Phase 2: global application of the extracted names (catch-up pass).
    # Uses all_names (NER-first-validated when available, otherwise the original extracted_names)
    if all_names:
        text_out = _apply_extracted_names(text_out, all_names, audit, force_names=all_force_names)

    # Phase 2b: global application of the PiiHits (EPISODE, RPPS, FINESS)
    text_out = _apply_trackare_hits_to_text(text_out, audit)

    return AnonResult(text_out=text_out, tables_block=tables_block, audit=audit, is_trackare=is_trackare)


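# Illustrative sketch (not called anywhere): the phase 0c-0g multiline patterns all
# share the same shape — label, newline, up to N intermediate lines, then the value.
# The sample text below (placeholders between label and value) is hypothetical.
def _demo_multiline_label_value() -> None:
    rx = re.compile(
        r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n"
        r"(?:[^\n]*\n){0,3}\s*"
        r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})",
        re.IGNORECASE,
    )
    sample = "Né(e) le :\n[NOM] [PRENOM]\n07/04/1943"
    m = rx.search(sample)
    # The intermediate masked line is skipped; only the date is captured
    assert m is not None and m.group(1) == "07/04/1943"

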
# ----------------- ONNX NER on the narrative -----------------


def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
    # replace via regex on the detected 'word' values (pragmatic approach)
    keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", False))

    def repl_once(s: str, old: str, new: str) -> str:
        return re.sub(rf"\b{re.escape(old)}\b", new, s)

    out = text
    for e in ents:
        w = e.get("word") or ""
        grp = (e.get("entity_group") or e.get("entity") or "").upper()
        if not w or "[" in w or "]" in w:  # ignore placeholders
            continue
        if len(w) <= 2:  # too short
            continue
        if grp in {"PER", "PERSON"}:
            audit.append(PiiHit(-1, "NER_PER", w, PLACEHOLDERS["NOM"]))
            out = repl_once(out, w, PLACEHOLDERS["NOM"])
        elif grp in {"ORG"}:
            if keep_org_gpe:
                continue
            audit.append(PiiHit(-1, "NER_ORG", w, PLACEHOLDERS["ETAB"]))
            out = repl_once(out, w, PLACEHOLDERS["ETAB"])
        elif grp in {"LOC"}:
            if keep_org_gpe:
                continue
            audit.append(PiiHit(-1, "NER_LOC", w, PLACEHOLDERS["VILLE"]))
            out = repl_once(out, w, PLACEHOLDERS["VILLE"])
        elif grp in {"DATE"}:
            # optional: if dates are already masked by the rules, leave as-is
            continue
    return out


def apply_hf_ner_on_narrative(text_out: str, cfg: Dict[str, Any], manager: Optional[NerModelManager], thresholds: Optional[NerThresholds]) -> Tuple[str, List[PiiHit]]:
    if manager is None or not manager.is_loaded():
        return text_out, []
    # isolate [TABLES]
    pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
    keep = []
    last = 0
    cleaned = ""
    for m in pattern.finditer(text_out):
        cleaned += text_out[last:m.start()]
        keep.append((len(cleaned), len(cleaned) + len(m.group(0)), m.group(0)))
        cleaned += "\x00" * len(m.group(0))
        last = m.end()
    cleaned += text_out[last:]

    # per page (separated by \f) → per paragraph
    pages = cleaned.split("\f")
    hits: List[PiiHit] = []
    rebuilt_pages: List[str] = []
    for pg in pages:
        paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
        ents_per_para = manager.infer_paragraphs(paras, thresholds=thresholds)
        # replace the entities
        buf = []
        for para, ents in zip(paras, ents_per_para):
            masked = _mask_with_hf(para, ents, cfg, hits)
            buf.append(masked)
        rebuilt_pages.append("\n\n".join(buf))
    rebuilt = "\f".join(rebuilt_pages)

    # re-insert [TABLES]
    rebuilt_list = list(rebuilt)
    for start, end, payload in keep:
        rebuilt_list[start:end] = list(payload)
    final = "".join(rebuilt_list)
    return final, hits


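# Minimal sketch (not called anywhere) of the [TABLES] protection scheme used by
# the apply_* functions above: each protected block is replaced by NUL bytes of
# the same length so offsets survive a length-preserving pass, then spliced back.
def _demo_tables_protection() -> None:
    text = "before [TABLES]secret[/TABLES] after"
    rx = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
    keep, cleaned, last = [], "", 0
    for m in rx.finditer(text):
        cleaned += text[last:m.start()]
        keep.append((len(cleaned), len(cleaned) + len(m.group(0)), m.group(0)))
        cleaned += "\x00" * len(m.group(0))
        last = m.end()
    cleaned += text[last:]
    # ... a length-preserving transformation would run on `cleaned` here ...
    restored = list(cleaned)
    for start, end, payload in keep:
        restored[start:end] = list(payload)
    assert "".join(restored) == text

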
# ----------------- EDS-Pseudo NER on the narrative -----------------


def _mask_with_eds_pseudo(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
    """Masks the entities detected by EDS-Pseudo using the eds_mapped_key mapping."""
    def repl_once(s: str, old: str, new: str) -> str:
        return re.sub(rf"\b{re.escape(old)}\b", new, s)

    out = text
    for e in ents:
        w = e.get("word") or ""
        mapped_key = e.get("eds_mapped_key", "")
        if not w or "[" in w or "]" in w:
            continue
        if len(w) <= 2:
            continue
        # Filter NOM/PRENOM false positives (drug names, medical acronyms)
        label = e.get("entity_group", "EDS")
        if label in ("NOM", "PRENOM", "HOPITAL", "VILLE"):
            if w.lower() in _MEDICAL_STOP_WORDS_SET:
                continue
            # Also filter multi-word tokens where a component is a stop word
            if " " in w and any(part.lower() in _MEDICAL_STOP_WORDS_SET for part in w.split()):
                continue
            # Filter dosages detected as names (e.g. "10MG", "300UI", "1 000")
            if re.match(r"^\d[\d\s]*(?:mg|MG|ml|ML|UI|µg|mcg|g|kg|%)?$", w.strip()):
                continue
        # CORRECTION 1.2: filter medications detected as NOM/PRENOM
        if label in ("NOM", "PRENOM"):
            # Check whether it is a known medication
            if w.lower() in _MEDICATION_WHITELIST:
                continue
        # Workstreams 3+4+5: NER confidence + GLiNER/CamemBERT-bio cross-vote + INSEE gazetteers.
        # Safety first: high NER confidence → always mask.
        # GLiNER/CamemBERT may reject ONLY when the NER confidence is low.
        gliner_vote = e.get("gliner_confirmed")  # True=PII, False=medical, None=neutral
        camembert_vote = e.get("camembert_confirmed")  # True=PII confirmed, False=not detected, None=neutral
        if label in ("NOM", "PRENOM"):
            score = e.get("score", 1.0)
            # INSEE gazetteer: known first name = confidence boost (do not filter)
            is_known_prenom = w.lower() in _INSEE_PRENOMS
            if isinstance(score, float) and score < 0.70 and not is_known_prenom:
                # Low NER confidence + not a known first name
                if gliner_vote is False and camembert_vote is not True:
                    continue  # GLiNER says "medical" + CamemBERT does not confirm → skip
                if score < 0.30 and camembert_vote is not True:
                    continue  # Very low confidence + CamemBERT does not confirm → skip
            # Workstream 2: contextual safe patterns (Philter-style).
            # Token preceded/followed by a dosage or pharma form → never a person name
            pos = text.find(w)
            if pos >= 0:
                # SAME-LINE context only ([ \t], not \n)
                line_start = text.rfind('\n', 0, pos)
                line_start = 0 if line_start < 0 else line_start + 1
                line_end = text.find('\n', pos + len(w))
                line_end = len(text) if line_end < 0 else line_end
                ctx_before = text[max(line_start, pos - 30):pos]
                ctx_after = text[pos + len(w):min(line_end, pos + len(w) + 30)]
                # Safe pattern: preceded or followed by a dosage (mg, mL, UI, tablet, etc.)
                _RE_DOSAGE = r"\d+[ \t]*(?:mg|ml|ui|µg|mcg|g|kg|cp|cpr|gel|amp|fl|dos|inh)\b"
                if re.search(_RE_DOSAGE, ctx_before, re.IGNORECASE):
                    continue
                if re.search(_RE_DOSAGE, ctx_after, re.IGNORECASE):
                    continue
                # Safe pattern: followed by a pharmaceutical form
                _RE_PHARMA_FORM = r"^\s*(?:comprim[ée]s?|g[ée]lules?|sachets?|ampoules?|flacons?|solutions?|injectable|suppo(?:sitoire)?s?|sirop|pommade|cr[eè]me|gouttes?|patch|inhal)"
                if re.search(_RE_PHARMA_FORM, ctx_after, re.IGNORECASE):
                    continue
                # Safe pattern: preceded by "taux de", "score de", "dosage de"
                if re.search(r"(?:taux|score|dosage|indice|index|grade|stade|type)\s+(?:de\s+)?$", ctx_before, re.IGNORECASE):
                    continue
        elif label == "HOPITAL":
            _STRUCTURAL_WORDS = {"SERVICE", "POLE", "PÔLE", "UNITE", "UNITÉ", "SECTEUR"}
            if len(w) < 5:
                continue
            if w.upper() in _STRUCTURAL_WORDS:
                continue
        placeholder = PLACEHOLDERS.get(mapped_key, PLACEHOLDERS["MASK"])
        audit.append(PiiHit(-1, f"EDS_{label}", w, placeholder))
        out = repl_once(out, w, placeholder)
    return out


def apply_eds_pseudo_on_narrative(text_out: str, cfg: Dict[str, Any], manager: "EdsPseudoManager",
                                  gliner_mgr: Any = None,
                                  camembert_mgr: Any = None) -> Tuple[str, List[PiiHit]]:
    """Applies EDS-Pseudo to the narrative, with optional GLiNER cross-validation."""
    if manager is None or not manager.is_loaded():
        return text_out, []
    # isolate [TABLES]
    pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
    keep = []
    last = 0
    cleaned = ""
    for m in pattern.finditer(text_out):
        cleaned += text_out[last:m.start()]
        keep.append((len(cleaned), len(cleaned) + len(m.group(0)), m.group(0)))
        cleaned += "\x00" * len(m.group(0))
        last = m.end()
    cleaned += text_out[last:]

    # per page → per paragraph
    pages = cleaned.split("\f")
    hits: List[PiiHit] = []
    rebuilt_pages: List[str] = []
    for pg in pages:
        paras = [p for p in re.split(r"\n\s*\n", pg) if p.strip()]
        ents_per_para = manager.infer_paragraphs(paras)
        # Workstream 4: GLiNER cross-validation (majority vote)
        if gliner_mgr is not None and hasattr(gliner_mgr, 'validate_entities') and gliner_mgr.is_loaded():
            for i, (para, ents) in enumerate(zip(paras, ents_per_para)):
                ents_per_para[i] = gliner_mgr.validate_entities(para, ents, threshold=0.4)
        # Workstream 5: CamemBERT-bio cross-validation (fine-tuned NER vote)
        if camembert_mgr is not None and hasattr(camembert_mgr, 'validate_eds_entities') and camembert_mgr.is_loaded():
            for i, (para, ents) in enumerate(zip(paras, ents_per_para)):
                ents_per_para[i] = camembert_mgr.validate_eds_entities(para, ents, threshold=0.3)
        buf = []
        for para, ents in zip(paras, ents_per_para):
            masked = _mask_with_eds_pseudo(para, ents, cfg, hits)
            buf.append(masked)
        rebuilt_pages.append("\n\n".join(buf))
    rebuilt = "\f".join(rebuilt_pages)

    # re-insert [TABLES]
    rebuilt_list = list(rebuilt)
    for start, end, payload in keep:
        rebuilt_list[start:end] = list(payload)
    final = "".join(rebuilt_list)
    return final, hits


# ----------------- NER-first: run NER on original (unmasked) text -----------------


def _run_ner_on_original_text(
    pages_text: List[str],
    eds_pseudo_mgr=None,
    gliner_mgr=None,
    camembert_mgr=None,
    cfg: Optional[Dict[str, Any]] = None,
) -> List[NerDetection]:
    """Runs the available NER models on the original (unmasked) text and
    returns a deduplicated list of NerDetection.

    This function is designed for the NER-first architecture: the NER models run
    *before* the regex masking, so that the regex-extracted name candidates can
    be validated against the NER signals.

    Parameters:
        pages_text: original text per page (unmasked)
        eds_pseudo_mgr: EdsPseudoManager instance (optional)
        gliner_mgr: GlinerManager instance (optional)
        camembert_mgr: CamembertNerManager instance (optional)
        cfg: configuration (unused for now, reserved)

    Returns:
        Deduplicated list of NerDetection (by token+label+page+source).
    """
    detections: List[NerDetection] = []
    seen: set = set()  # (token_lower, label, page_idx, source) for deduplication

    def _add_detection(token: str, label: str, score: float, page_idx: int, source: str):
        """Adds a detection unless it has already been seen."""
        key = (token.lower(), label, page_idx, source)
        if key not in seen:
            seen.add(key)
            detections.append(NerDetection(
                token=token, label=label, score=score,
                page_idx=page_idx, source=source,
            ))

    # Mapping from NER labels to normalized labels
    _EDS_LABEL_NORM = {
        "NOM": "NOM", "PRENOM": "PRENOM",
        "HOPITAL": "HOPITAL", "VILLE": "VILLE",
        "ADRESSE": "LOC", "ZIP": "LOC",
    }
    _GLINER_LABEL_NORM = {
        "person_name": "NOM", "hospital": "HOPITAL",
        "city": "VILLE", "postal_address": "LOC",
    }
    _CAMEMBERT_LABEL_NORM = {
        "PER": "NOM", "HOPITAL": "HOPITAL",
        "VILLE": "VILLE", "ADRESSE": "LOC", "ZIP": "LOC",
    }

    for page_idx, page_text in enumerate(pages_text):
        if not page_text.strip():
            continue

        # Split into paragraphs (like apply_eds_pseudo_on_narrative)
        paras = [p for p in re.split(r"\n\s*\n", page_text) if p.strip()]

        # --- EDS-Pseudo ---
        if eds_pseudo_mgr is not None and hasattr(eds_pseudo_mgr, 'infer_paragraphs') and eds_pseudo_mgr.is_loaded():
            try:
                ents_per_para = eds_pseudo_mgr.infer_paragraphs(paras)
                for para_ents in ents_per_para:
                    for ent in para_ents:
                        raw_label = ent.get("entity_group", "")
                        norm_label = _EDS_LABEL_NORM.get(raw_label)
                        if norm_label:
                            _add_detection(
                                token=ent.get("word", ""),
                                label=norm_label,
                                score=ent.get("score", 0.0),
                                page_idx=page_idx,
                                source="eds_pseudo",
                            )
            except Exception as e:
                log.warning(f"_run_ner_on_original_text: EDS-Pseudo error on page {page_idx}: {e}")

        # --- GLiNER ---
        if gliner_mgr is not None and hasattr(gliner_mgr, 'predict') and gliner_mgr.is_loaded():
            try:
                # GLiNER on the full page text (not per paragraph)
                gliner_ents = gliner_mgr.predict(page_text, threshold=0.4)
                for ent in gliner_ents:
                    raw_label = ent.get("label", "")
                    norm_label = _GLINER_LABEL_NORM.get(raw_label)
                    if norm_label:
                        _add_detection(
                            token=ent.get("text", ""),
                            label=norm_label,
                            score=ent.get("score", 0.0),
                            page_idx=page_idx,
                            source="gliner",
                        )
            except Exception as e:
                log.warning(f"_run_ner_on_original_text: GLiNER error on page {page_idx}: {e}")

        # --- CamemBERT-bio ---
        if camembert_mgr is not None and hasattr(camembert_mgr, 'predict_long') and camembert_mgr.is_loaded():
            try:
                cam_ents = camembert_mgr.predict_long(page_text, threshold=0.4)
                for ent in cam_ents:
                    raw_label = ent.get("label", "")
                    norm_label = _CAMEMBERT_LABEL_NORM.get(raw_label)
                    if norm_label:
                        _add_detection(
                            token=ent.get("word", ""),
                            label=norm_label,
                            score=ent.get("score", 0.0),
                            page_idx=page_idx,
                            source="camembert_bio",
                        )
            except Exception as e:
                log.warning(f"_run_ner_on_original_text: CamemBERT-bio error on page {page_idx}: {e}")

    log.info(f"NER-first: {len(detections)} detections over {len(pages_text)} pages "
             f"(eds={sum(1 for d in detections if d.source == 'eds_pseudo')}, "
             f"gliner={sum(1 for d in detections if d.source == 'gliner')}, "
             f"camembert={sum(1 for d in detections if d.source == 'camembert_bio')})")
    return detections


# ----------------- FINESS Aho-Corasick establishment matching -----------------


def _build_finess_ac():
    """Builds the FINESS Aho-Corasick automaton (lazily, on first use)."""
    global _FINESS_AC
    if not _AHO_AVAILABLE:
        return
    data_dir = Path(__file__).parent / "data" / "finess"
    dist_path = data_dir / "etablissements_distinctifs.txt"
    if not dist_path.exists():
        return

    # Generic words that must never be matched on their own
    _ac_generic_blacklist = {
        # Establishment types
        "clinique", "pharmacie", "hopital", "centre", "foyer",
        "residence", "maison", "cabinet", "service", "laboratoire",
        "institut", "association", "fondation", "mutuelle", "polyclinique",
        "dispensaire", "hospice", "annexe", "antenne", "site",
        # Common French words that are also establishment names
        "collegiale", "collegial", "cathedral", "cathedrale",
        "providence", "esperance", "renaissance", "liberation",
        "republique", "fraternite", "solidarite", "independance",
        "beauregard", "bellevue", "belvedere",
        "promenade", "esplanade", "corniche", "prefecture",
        "croissant", "confluence", "bienvenue",
        "chartreuse", "commanderie", "chapelle", "basilique",
        "departement", "departementale", "communautaire",
        # Medical specialties / common clinical terms
        "chirurgicale", "radiologie", "addictologie", "prevention",
        "psychotherapique", "ambulatoire", "hospitalisation",
        "consultation", "surveillance", "therapeutique",
        "readaptation", "reeducation", "reanimation",
        "specialisee", "conventionnelle", "professionnelle",
        "informatique", "administrative", "regionale",
        # Common words
        "generation", "revolution", "assomption", "visitation",
        "consolation", "atlantique", "manutention", "prefiguration",
        "intervalle", "pharmaciens", "pharmacien", "transfert",
        "comprimee", "comprimees", "injectable", "injectables",
        "maintenant", "actuellement", "auparavant", "prochainement",
        "rapidement", "correctement", "directement", "simplement",
        "internationale", "international", "intercommunal", "intercommunale",
        # Medical terms homonymous with FINESS establishments (reviewer feedback 2026-03-17)
        "resistance", "radiotherapie", "chimiotherapie", "curietherapie",
        "hormonotherapie", "immunotherapie", "kinesitherapie",
        "ergotherapie", "orthophonie", "psychomotricite",
        "reeducation", "readaptation", "convalescence",
        "dependance", "autonomie", "gerontologie",
    }
    # Multi-word expressions that are too generic
    _ac_generic_phrases = {
        "a domicile", "au domicile", "menage a domicile",
        "du nord", "du sud", "de l est", "de l ouest",
        "la maison", "la residence", "les jardins",
        "le village", "le parc", "la colline",
        "au soleil", "en france",
        # Medical expressions homonymous with FINESS establishments (reviewer FPs 2026-03-16)
        "long cours", "au long cours",
        "le bourg", "le val", "le clos", "le mas",
        "les pins", "les chenes", "les oliviers",
    }
    # Explicit whitelist of single words < 10 chars considered distinctive
    # (otherwise rejected by the general filter). Example: EMBRUNS (7 chars).
    # Fed from data/finess/mono_mots_distinctifs.txt — manually curated.
    mono_file = data_dir / "mono_mots_distinctifs.txt"
    mono_whitelist: set = set()
    if mono_file.exists():
        try:
            for _line in mono_file.read_text(encoding="utf-8").splitlines():
                _w = _line.strip()
                if _w and not _w.startswith("#"):
                    mono_whitelist.add(_w.lower())
            log.info(f"FINESS distinctive single-word whitelist: {len(mono_whitelist)} entries")
        except Exception as _exc:
            log.warning(f"Error loading mono_mots_distinctifs.txt: {_exc}")

    # Compound first names "jean ...", "marie ..." are not distinctive establishments
    _PRENOM_PREFIXES = {"jean", "marie", "louis", "pierre", "saint", "sainte"}

    try:
        ac = _ahocorasick.Automaton()
        count = 0
        for line in dist_path.read_text(encoding="utf-8").splitlines():
            name = line.strip()
            if not name:
                continue
            # Exclude generic words on their own
            if name in _ac_generic_blacklist:
                continue
            # Exclude generic phrases
            if name in _ac_generic_phrases:
                continue
            words = name.split()
            # Exclude 2-word patterns whose 1st word is generic AND 2nd word < 5 chars
            if len(words) == 2 and words[0] in _ac_generic_blacklist and len(words[1]) < 5:
                continue
            # Exclude person names (first name + surname, 2 short words)
            if len(words) == 2 and words[0] in _PRENOM_PREFIXES and len(words[1]) < 10:
                continue
            # Filter: >= 8 chars and >= 2 words, OR >= 10 chars for a single word,
            # OR present in the explicit single-word whitelist.
            if len(words) >= 2 and len(name) >= 8:
                # Exclude multi-word patterns where ALL words are medical stop words
                if all(w in _MEDICAL_STOP_WORDS_SET or len(w) <= 2 for w in words):
                    continue
                ac.add_word(name, name)
                count += 1
            elif len(words) == 1 and (len(name) >= 10 or name in mono_whitelist):
                if (name not in _ac_generic_blacklist
                        and name not in _MEDICAL_STOP_WORDS_SET
                        and _normalize_for_matching(name) not in _MEDICAL_STOP_WORDS_SET):
                    ac.add_word(name, name)
                    count += 1
        ac.make_automaton()
        _FINESS_AC = ac
        log.info(f"FINESS Aho-Corasick gazetteer: {count} patterns loaded")
    except Exception as e:
        log.warning(f"Error building the FINESS Aho-Corasick automaton: {e}")


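# Minimal sketch (not called anywhere) of the pyahocorasick flow used by the
# _build_*_ac builders: add_word → make_automaton → iter, where iter yields
# (end_index, value) pairs. Patterns and haystack below are toy data.
def _demo_ahocorasick_flow() -> None:
    if not _AHO_AVAILABLE:
        return
    ac = _ahocorasick.Automaton()
    for pattern in ("clinique delay", "polyclinique aguilera"):
        ac.add_word(pattern, pattern)
    ac.make_automaton()
    haystack = "transfert vers la polyclinique aguilera hier"
    hits = [(end, name) for end, name in ac.iter(haystack)]
    assert hits and hits[0][1] == "polyclinique aguilera"

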
def _normalize_positional(text: str) -> str:
    """Length-preserving normalization: lowercase + accented chars → base char.

    Each accented character is replaced by its unaccented version.
    Non-alphanumeric characters stay as they are (same position).
    Output length == input length.
    """
    import unicodedata
    out = []
    for ch in text:
        # Lowercase
        ch = ch.lower()
        # Decompose and strip combining marks
        decomposed = unicodedata.normalize("NFD", ch)
        base = "".join(c for c in decomposed if unicodedata.category(c) != "Mn")
        out.append(base if base else ch)
    return "".join(out)


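# Quick property sketch (not called anywhere): _normalize_positional preserves
# length, so automaton match offsets apply directly to the original text.
def _demo_normalize_positional() -> None:
    original = "Hôpital SAINT-ANDRÉ"   # hypothetical establishment name
    norm = _normalize_positional(original)
    assert norm == "hopital saint-andre"
    assert len(norm) == len(original)

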
def _mask_finess_establishments(text: str, return_matched_names: bool = False):
    """Masks FINESS establishment names detected by Aho-Corasick.

    Scans the normalized text (position-preserving: same length) and replaces
    the occurrences found in the original text with [ETABLISSEMENT].
    Only matches on word boundaries are accepted.

    If return_matched_names=True, returns a (masked_text, [original_names]) tuple.
    Otherwise returns just the masked text (backward compatibility).
    """
    global _FINESS_AC
    if _FINESS_AC is None:
        _build_finess_ac()
    if _FINESS_AC is None:
        return (text, []) if return_matched_names else text

    normalized = _normalize_positional(text)
    placeholder = PLACEHOLDERS["ETAB"]

    # Collect the Aho-Corasick matches (end position, name)
    matches = []
    for end_idx, name in _FINESS_AC.iter(normalized):
        start_idx = end_idx - len(name) + 1
        # Check word boundaries (not in the middle of a word)
        if start_idx > 0 and normalized[start_idx - 1].isalnum():
            continue
        if end_idx + 1 < len(normalized) and normalized[end_idx + 1].isalnum():
            continue
        # Check it is not already inside a placeholder
        ctx_before = text[max(0, start_idx - 1):start_idx]
        ctx_after = text[end_idx + 1:min(len(text), end_idx + 2)]
        if "[" in ctx_before or "]" in ctx_after:
            continue
        matches.append((start_idx, end_idx + 1, name))

    # Contextual filter: drop FINESS matches found in an anatomical/surgical context
    # ("voie biliaire principale" → "principale" is a FINESS establishment, not here)
    _FINESS_ANATOMICAL_CTX = re.compile(
        r"(?:biliaire|m[ée]sent[ée]rique|abdominale?|chirurgicale?|h[ée]patique|"
        r"pulmonaire|voie|art[èe]re|veine|fistule|fracture|l[ée]sion|muqueuse|paroi|"
        r"tissus?|muscle|ligament|membrane|thoracique|pelvien|pancr[ée]atique|"
        r"bronchique|intestinale?|c[ée]r[ée]brale?|vasculaire|digestive?)",
        re.IGNORECASE,
    )
    filtered_matches = []
    for start, end, name in matches:
        ctx_around = text[max(0, start - 50):start] + text[end:min(len(text), end + 50)]
        if _FINESS_ANATOMICAL_CTX.search(ctx_around):
            continue  # Anatomical context → not an establishment name
        filtered_matches.append((start, end, name))
    matches = filtered_matches

    if not matches:
        return (text, []) if return_matched_names else text

    # Sort by position, deduplicate (keep the longest on overlap)
    matches.sort(key=lambda x: (x[0], -(x[1] - x[0])))
    deduped = []
    last_end = 0
    for start, end, name in matches:
        if start >= last_end:
            deduped.append((start, end, name))
            last_end = end

    # Rebuild the text with the replacements (positions are 1:1 with the original)
    result = []
    matched_names = []
    last_pos = 0
    for start, end, name in deduped:
        if start > len(text) or end > len(text):
            continue
        # Extract the original text (accents, original case) for the audit
        original_text = text[start:end]
        matched_names.append(original_text)
        result.append(text[last_pos:start])
        result.append(placeholder)
        last_pos = end
    result.append(text[last_pos:])

    masked = "".join(result)
    return (masked, matched_names) if return_matched_names else masked


# ----------------- FINESS Address Aho-Corasick matching -----------------


def _normalize_addr_with_posmap(text: str):
    """Normalizes for address matching: lowercase, accents, punctuation → space, collapse.

    Returns (normalized, posmap) where posmap[i] = position in the original text of char i.
    Compatible with the patterns in adresses_finess.txt (same normalization).
    """
    import unicodedata as _ud
    # Step 1: lowercase + strip accents, preserving length
    buf = []
    for ch in text:
        ch_low = ch.lower()
        decomposed = _ud.normalize("NFD", ch_low)
        base = "".join(c for c in decomposed if _ud.category(c) != "Mn")
        buf.append(base if base else ch_low)
    # Step 2: non-alnum → space, with position tracking
    step2 = []
    for i, ch in enumerate(buf):
        if ch.isalnum() or ch == ' ':
            step2.append((ch, i))
        else:
            step2.append((' ', i))
    # Step 3: collapse runs of spaces
    result_chars = []
    posmap = []
    prev_space = False
    for ch, orig_pos in step2:
        if ch == ' ':
            if not prev_space:
                result_chars.append(' ')
                posmap.append(orig_pos)
            prev_space = True
        else:
            result_chars.append(ch)
            posmap.append(orig_pos)
            prev_space = False
    # Strip leading/trailing whitespace
    normalized = "".join(result_chars)
    leading = len(normalized) - len(normalized.lstrip())
    normalized = normalized.strip()
    posmap = posmap[leading:leading + len(normalized)]
    return normalized, posmap


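# Sketch (not called anywhere) of how the posmap is meant to be consumed: a match
# found in the normalized string maps back to original-text coordinates, even
# across apostrophes and commas. The address below is hypothetical.
def _demo_addr_posmap() -> None:
    original = "12, rue de l'Église"
    norm, posmap = _normalize_addr_with_posmap(original)
    assert norm == "12 rue de l eglise"
    idx = norm.find("eglise")
    start = posmap[idx]
    end = posmap[idx + len("eglise") - 1] + 1
    assert original[start:end] == "Église"

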
def _build_finess_addr_ac():
    """Builds the Aho-Corasick automaton for FINESS addresses."""
    global _FINESS_ADDR_AC
    if not _AHO_AVAILABLE:
        return
    data_dir = Path(__file__).parent / "data" / "finess"
    addr_path = data_dir / "adresses_finess.txt"
    if not addr_path.exists():
        return
    try:
        ac = _ahocorasick.Automaton()
        count = 0
        # Recognized street types (prefixed patterns = more reliable)
        _voie_types = {"rue", "avenue", "boulevard", "route", "chemin", "place",
                       "impasse", "allee", "square", "passage", "quai", "cours",
                       "sentier", "rond-point", "traverse", "esplanade",
                       "promenade", "montee", "voie", "carrefour", "faubourg"}
        # Non-address patterns to exclude
        _addr_blacklist = {"cabinet medical", "cabinet dentaire", "cabinet infirmier",
                           "cabinet paramedical", "cabinet sage-femme"}
        for line in addr_path.read_text(encoding="utf-8").splitlines():
            name = line.strip()
            if not name or len(name) < 10:
                continue
            if name in _addr_blacklist:
                continue
            words = name.split()
            if all(w in _MEDICAL_STOP_WORDS_SET or len(w) <= 2 for w in words):
                continue
            # Patterns without a street type: require >= 20 chars (avoids person names)
            has_voie_prefix = words[0] in _voie_types
            if not has_voie_prefix and len(name) < 20:
                continue
            ac.add_word(name, name)
            count += 1
        ac.make_automaton()
        _FINESS_ADDR_AC = ac
        log.info(f"FINESS address gazetteer: {count} patterns loaded")
    except Exception as e:
        log.warning(f"Error building the FINESS address Aho-Corasick automaton: {e}")


def _mask_finess_addresses(text: str, return_matched_names: bool = False):
    """Masks FINESS addresses detected by Aho-Corasick.

    Uses a position-mapped normalization to handle apostrophes, periods,
    and other non-alphanumeric characters common in addresses.
    """
    global _FINESS_ADDR_AC
    if _FINESS_ADDR_AC is None:
        _build_finess_addr_ac()
    if _FINESS_ADDR_AC is None:
        return (text, []) if return_matched_names else text

    normalized, posmap = _normalize_addr_with_posmap(text)
    placeholder = PLACEHOLDERS.get("ADRESSE", "[ADRESSE]")

    matches = []
    for end_idx, name in _FINESS_ADDR_AC.iter(normalized):
        start_idx = end_idx - len(name) + 1
        # Check word boundaries in the normalized text
        if start_idx > 0 and normalized[start_idx - 1].isalnum():
            continue
        if end_idx + 1 < len(normalized) and normalized[end_idx + 1].isalnum():
            continue
        # Map back to original positions
        if start_idx >= len(posmap) or end_idx >= len(posmap):
            continue
        orig_start = posmap[start_idx]
        orig_end = posmap[end_idx] + 1
        # Not already inside a placeholder
        ctx_before = text[max(0, orig_start - 1):orig_start]
        ctx_after = text[orig_end:min(len(text), orig_end + 1)]
        if "[" in ctx_before or "]" in ctx_after:
            continue
        matches.append((orig_start, orig_end, name))

    if not matches:
        return (text, []) if return_matched_names else text

    # Keep the longest matches on overlap
    matches.sort(key=lambda x: (x[0], -(x[1] - x[0])))
    deduped = []
    last_end = 0
    for start, end, name in matches:
        if start >= last_end:
            deduped.append((start, end, name))
            last_end = end

    result = []
    matched_names = []
    last_pos = 0
    for start, end, name in deduped:
        if start > len(text) or end > len(text):
            continue
        original_text = text[start:end]
        matched_names.append(original_text)
        # Extend left to capture the street number (e.g. "13, ")
        ext_start = start
        prefix = text[max(0, start - 15):start]
        num_match = re.search(r'(\d+\s*[,.]?\s*)$', prefix)
        if num_match:
            ext_start = start - (len(prefix) - num_match.start())
        # Extend right to capture BP/CS + postal code + city
        ext_end = end
        suffix = text[end:min(len(text), end + 60)]
        # BP/CS + number + optional postal code + city
        bp_match = re.match(
            r'(\s*(?:BP|CS)\s*\d+\s*[,.]?\s*(?:\d{5}\s*)?(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\s\-]+(?:CEDEX)?)?)',
            suffix, re.IGNORECASE)
        if bp_match:
            ext_end = end + len(bp_match.group(1).rstrip())
        result.append(text[last_pos:ext_start])
        result.append(placeholder)
        last_pos = ext_end
    result.append(text[last_pos:])

    masked = "".join(result)
    return (masked, matched_names) if return_matched_names else masked


# ----------------- City (ville) Aho-Corasick gazetteer matching -----------------


def _build_ville_ac():
    """Builds the Aho-Corasick automaton for cities (INSEE + FINESS).

    Called lazily on first use.
    Names are normalized without accents (position-preserving matching via _normalize_positional).
    NOTE: we do NOT filter by _MEDICAL_STOP_WORDS_SET, because those cities were
    added there to prevent their detection as PERSON NAMES, not to prevent their
    detection as cities. False-positive filtering happens via _VILLE_BLACKLIST and
    the minimum-length threshold.
    """
    global _VILLE_AC
    if not _AHO_AVAILABLE:
        return

    # Combine both city sources
    all_villes: set = set()
    if _INSEE_COMMUNES:
        all_villes.update(_INSEE_COMMUNES)
    if _FINESS_VILLES:
        all_villes.update(v.upper() for v in _FINESS_VILLES)

    if not all_villes:
        log.warning("No city available for the VILLE Aho-Corasick automaton")
        return

    # Exception to the minimum-length filter: a few notable 3-letter cities
    _VILLE_3CHAR_ALLOW = {"DAX", "PAU", "GAP", "APT", "GEX", "LUZ"}

    try:
        ac = _ahocorasick.Automaton()
        count = 0
        added_normalized: set = set()  # avoid duplicates after normalization
        for ville in all_villes:
            ville = ville.strip()
            if not ville:
                continue
            # Blacklist of ambiguous communes
            if ville.upper() in _VILLE_BLACKLIST:
                continue
            # Compound names in the gazetteers use spaces ("MONT DE MARSAN"),
            # but in documents they often appear hyphenated ("Mont-de-Marsan").
            # Both variants are added to the automaton.
            words = ville.split()
            # Minimum-length filter (single word < 4 chars → too ambiguous)
            if len(words) == 1 and len(ville) < 4 and ville.upper() not in _VILLE_3CHAR_ALLOW:
                continue
            # Normalize without accents, lowercase (for positional matching)
            normalized_ville = _normalize_positional(ville)
            if normalized_ville not in added_normalized:
                ac.add_word(normalized_ville, (normalized_ville, ville))
                added_normalized.add(normalized_ville)
                count += 1

            def _add_variant(variant_norm: str) -> None:
                nonlocal count
                if variant_norm and variant_norm not in added_normalized:
                    ac.add_word(variant_norm, (variant_norm, ville))
                    added_normalized.add(variant_norm)
                    count += 1

            # Hyphenated variant for compound names (e.g. "mont de marsan" → "mont-de-marsan")
            if len(words) >= 2:
                _add_variant(_normalize_positional("-".join(words)))
            # SAINT ↔ ST variants (INSEE gazetteers use "ST", documents "Saint")
            for prefix_src, prefix_dst in [("ST ", "SAINT "), ("ST ", "SAINT-"),
                                           ("SAINT ", "ST "), ("SAINT ", "ST-"),
                                           ("STE ", "SAINTE "), ("STE ", "SAINTE-"),
                                           ("SAINTE ", "STE "), ("SAINTE ", "STE-")]:
                if ville.startswith(prefix_src):
                    alt = prefix_dst + ville[len(prefix_src):]
                    _add_variant(_normalize_positional(alt))
                    _add_variant(_normalize_positional("-".join(alt.split())))
        ac.make_automaton()
        _VILLE_AC = ac
        log.info(f"VILLE Aho-Corasick gazetteer: {count} patterns loaded "
                 f"(INSEE: {len(_INSEE_COMMUNES)}, FINESS: {len(_FINESS_VILLES)})")
    except Exception as e:
        log.warning(f"Error building the VILLE Aho-Corasick automaton: {e}")


def _mask_ville_gazetteers(text: str) -> tuple:
    """Masks cities detected by Aho-Corasick in the narrative text.

    Contextual strategy: to avoid massive false positives (CHARGE, SIGNES,
    TALON — communes homonymous with common words), a city is masked only if:
    - It is a compound city name (Saint-Palais), OR
    - It is a very long name (>= 8 letters: Bordeaux, Toulouse), OR
    - It appears in an explicit geographic context (à, de, vers, habite, etc.)

    Returns: (masked_text, list_of_masked_original_values)
    """
    global _VILLE_AC
    if _VILLE_AC is None:
        _build_ville_ac()
    if _VILLE_AC is None:
        return text, []  # keep the (text, masked_values) contract of the docstring

    normalized = _normalize_positional(text)
    placeholder = PLACEHOLDERS["VILLE"]

    # Geographic contexts before a city.
    # NOTE: "de" alone is too ambiguous ("prise de selles", "nombre de jumeaux").
    # "de" is only accepted after a geographic verb/noun or a compound preposition
    _RE_GEO_BEFORE = re.compile(
        r"(?:"
        # Preposition "à" (geographically very specific)
        r"[àÀ]\s+|"
        # "de" only in a geographic context (vient de, originaire de, etc.)
        r"(?:vient|venant|arrivant|provenant|originaire|issu(?:e)?)\s+(?:de\s+|d['']\s*)|"
        # "urgences de", "hôpital de", "clinique de", "UHCD de", etc.
        r"(?:urgences?|h[oô]pital|clinique|CHU?|CH\b|UHCD|SSR|USLD|HAD|EHPAD|CSAPA|CMPP|CMP|GHT|HIA)\s+(?:de\s+|d['']\s*)|"
        # Localization verbs directly followed by the city
        r"(?:habit|résid|viv|domicilié(?:e)?|transféré(?:e)?|"
        r"adressé(?:e)?|hospitalisé(?:e)?|opéré(?:e)?|"
        r"Fait)\s+(?:à\s+|de\s+|d['']\s*)?|"
        # "vers" (direct geo preposition) — NOTE: "sur" excluded, too ambiguous ("sur le plan")
        r"vers\s+|"
        # After a postal code (already masked OR still digits) or an opening parenthesis
        r"\[CODE_POSTAL\]\s*|"
        r"\b\d{5}\s+|"
        r"\(\s*|"
        # Medical referral contexts
        r"(?:urg(?:ences?)?\s+)|"
        # After a medical title + masked name + dash/comma: "Dr [NOM] - VILLE"
        r"(?:Dr\.?|Pr\.?|Docteur|Professeur)\s+\[NOM\]\s*[\-–,]\s*|"
        r"\[NOM\]\s*[\-–,(]\s*|"
        # After a medical specialty: "cardiologue Anglet", "neurologue, DAX"
        r"(?:cardiologue|neurologue|radiologue|chirurgien|pneumologue|"
        r"gastro-?ent[ée]rologue|oncologue|n[ée]phrologue|urologue|"
        r"g[ée]riatre|dermatologue|rhumatologue|ophtalmologue|psychiatre|"
        r"anesth[ée]siste|gyn[ée]cologue|p[ée]diatre|m[ée]decin|g[ée]n[ée]raliste|"
        r"kin[ée]|ORL|MT)\s*[,]?\s*"
        r")\s*$",
        re.I,
    )

    # Collect the Aho-Corasick matches.
    # Also index all matches for the "enumeration" pass (pass 2): a city whose
    # enumeration precedes a confirmed match must be masked too
    # ("Bordeaux et Bayonne" → Bayonne confirmed via Bordeaux).
    all_ac_hits: list = []  # [(start, end, orig_span), ...] — all AC matches before filtering
    confirmed_hits: set = set()  # indices in all_ac_hits that passed the contextual filter

    for end_idx, (norm_name, orig_name) in _VILLE_AC.iter(normalized):
        start_idx = end_idx - len(norm_name) + 1
        # Check word boundaries (not in the middle of a word)
        if start_idx > 0 and normalized[start_idx - 1].isalnum():
            continue
        if end_idx + 1 < len(normalized) and normalized[end_idx + 1].isalnum():
            continue
        # Check it is not already inside a placeholder [...]
        ctx_before = text[max(0, start_idx - 1):start_idx]
        ctx_after = text[end_idx + 1:min(len(text), end_idx + 2)]
        if "[" in ctx_before or "]" in ctx_after:
            continue
        # Check placeholder proximity (not right after "[ETABLISSEMENT] de ...")
        wide_before = text[max(0, start_idx - 25):start_idx]
        if re.search(r"\[(VILLE|ADRESSE|ETABLISSEMENT)\]\s*(?:de\s+|du\s+|d['']\s*|à\s+)?$", wide_before):
            continue
        # Recover the original text at this position
        original_span = text[start_idx:end_idx + 1]
        # CEDEX suffix extension: if the city is followed by " CEDEX" or " CEDEX N",
        # capture the whole span (e.g. "BAYONNE CEDEX" → full match).
        _cedex_match = re.match(r"\s+CEDEX(?:\s+\d+)?\b", text[end_idx + 1:end_idx + 20])
        if _cedex_match:
            ext_len = _cedex_match.end()
            end_idx_ext = end_idx + ext_len
            original_span = text[start_idx:end_idx_ext + 1]
        else:
            end_idx_ext = end_idx
        word_count = len(orig_name.split())

        # Record every hit (even without geo context) for the enumeration pass
        all_ac_hits.append((start_idx, end_idx_ext + 1, original_span))
        hit_idx = len(all_ac_hits) - 1

        # Contextual strategy to avoid FPs:
        # ALWAYS require a geographic context (à, de, vers, habite, etc.)
        # except for hyphenated compound cities (Saint-Palais,
        # Mont-de-Marsan), which are very unambiguous.
        is_compound_hyphen = ("-" in original_span and word_count >= 2)
        if not is_compound_hyphen:
            before_ctx = text[max(0, start_idx - 40):start_idx]
            if not _RE_GEO_BEFORE.search(before_ctx):
                continue
        confirmed_hits.add(hit_idx)

    # Pass 2 — enumerations: two AC hits linked by " et " or ", " confirm each
    # other. Real cases covered:
    #   "travaille à Bordeaux et Bayonne" (an already-confirmed anchor propagates)
    #   "Régions : Bordeaux, Bayonne, Biarritz" (no anchor, but a chain of >= 2
    #   gazetteer cities in an enumeration = strong geographic presumption)
    # Fixed-point iteration so confirmation propagates along long chains.
    def _enum_link(i: int, j: int) -> bool:
        """True if hits i and j are adjacent within an enumeration."""
        s_a, e_a, _ = all_ac_hits[i]
        s_b, e_b, _ = all_ac_hits[j]
        if s_b <= e_a:
            return False
        return bool(re.fullmatch(r"\s*(?:et|,)\s*", text[e_a:s_b]))

    changed = True
    while changed:
        changed = False
        for i in range(len(all_ac_hits)):
            for j in range(len(all_ac_hits)):
                if i == j or not _enum_link(i, j):
                    continue
                # Case A: i confirmed → confirm j
                if i in confirmed_hits and j not in confirmed_hits:
                    confirmed_hits.add(j)
                    changed = True
                # Case B: chain of >= 2 in an enumeration with no anchor → confirm both.
                # Safety: each hit must be at least 5 letters long (avoids masking
                # two short commune-homonym words sitting side by side).
                elif (i not in confirmed_hits and j not in confirmed_hits
                        and len(all_ac_hits[i][2].strip()) >= 5
                        and len(all_ac_hits[j][2].strip()) >= 5):
                    confirmed_hits.add(i)
                    confirmed_hits.add(j)
                    changed = True

    matches = [all_ac_hits[i] for i in confirmed_hits]

    if not matches:
        return text, []

    # Deduplicate: prefer the longest match on overlap.
    # Sort by decreasing length, then greedily select non-overlapping spans
    matches.sort(key=lambda x: -(x[1] - x[0]))
    deduped = []
    for start, end, orig in matches:
        # Check this interval does not overlap one already kept
        if any(s < end and start < e for s, e, _ in deduped):
            continue
        deduped.append((start, end, orig))
    # Re-sort by position for reconstruction
    deduped.sort(key=lambda x: x[0])

    # Rebuild the text with the replacements
    result = []
    masked_originals = []
    last_pos = 0
    for start, end, orig in deduped:
        if start > len(text) or end > len(text):
            continue
        result.append(text[last_pos:start])
        result.append(placeholder)
        masked_originals.append(orig)
        last_pos = end
    result.append(text[last_pos:])

    return "".join(result), masked_originals


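# Standalone sketch (not called anywhere) of the pass-2 fixed point above:
# confirmation propagates along ", " / " et " links between adjacent gazetteer
# hits. The sentence and hand-computed spans below are a hypothetical example.
def _demo_enum_propagation() -> None:
    text = "vu à Bordeaux, Bayonne et Biarritz"
    hits = [(5, 13), (15, 22), (26, 34)]   # spans of the three city hits
    confirmed = {0}                        # "Bordeaux" anchored by "à "

    def linked(a: int, b: int) -> bool:
        (_, e_a), (s_b, _) = hits[a], hits[b]
        return s_b > e_a and bool(re.fullmatch(r"\s*(?:et|,)\s*", text[e_a:s_b]))

    changed = True
    while changed:
        changed = False
        for i in range(len(hits)):
            for j in range(len(hits)):
                if i != j and i in confirmed and j not in confirmed and linked(i, j):
                    confirmed.add(j)
                    changed = True
    assert confirmed == {0, 1, 2}

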
# ----------------- Whitelist (phrases that must never be anonymized) -----------------


def _apply_whitelist(text: str, phrases: List[str], audit: List[PiiHit]) -> str:
    """Restores whitelisted phrases that were masked by mistake.

    For each whitelist phrase, builds a flexible pattern that accepts
    [XXX] placeholders in place of the original words.
    E.g. "bas de contention" matches "bas [NOM] contention" or "bas de [NOM]".
    """
    _PH = r"\[[A-Z_]+\]"  # placeholder pattern

    for phrase in phrases:
        if not phrase or not phrase.strip():
            continue
        words = phrase.strip().split()
        if len(words) < 2:
            continue

        # Build a pattern where each word of the phrase may either be
        # replaced by a placeholder OR be present as-is.
        # Between words: flexible whitespace
        parts = []
        for w in words:
            # The original word OR a placeholder
            parts.append(rf"(?:{re.escape(w)}|{_PH})")
        pattern = r"(?i)" + r"[\s]+".join(parts)

        try:
            rx = re.compile(pattern)
        except re.error:
            continue

        def _restore(m: re.Match) -> str:
            # Only restore when at least one placeholder is present
            # (otherwise the phrase is already intact, nothing to do)
            return phrase if "[" in m.group(0) else m.group(0)

        # re.sub keeps offsets consistent; replacing inside a finditer loop
        # would shift later match positions whenever lengths differ
        text = rx.sub(_restore, text)

    return text


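# Sketch (not called anywhere) of the flexible whitelist pattern built above:
# every word of the protected phrase may have been swapped for a placeholder.
def _demo_whitelist_pattern() -> None:
    phrase = "bas de contention"
    ph = r"\[[A-Z_]+\]"
    pattern = r"(?i)" + r"[\s]+".join(
        rf"(?:{re.escape(w)}|{ph})" for w in phrase.split()
    )
    assert re.search(pattern, "porte des bas [NOM] contention")
    assert re.search(pattern, "bas de [NOM]")

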
# ----------------- Selective safety rescan -----------------
|
||
|
||
def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
||
"""Rescan de sécurité : re-détecte les PII critiques qui auraient échappé au premier passage."""
|
||
# enlève TABLES du scope
|
||
def strip_tables(s: str):
|
||
kept = []
|
||
out = []
|
||
i = 0
|
||
pattern = re.compile(r"\[TABLES\](.*?)\[/TABLES\]", re.DOTALL)
|
||
for m in pattern.finditer(s):
|
||
out.append(s[i:m.start()])
|
||
kept.append((len("".join(out)), len("".join(out)) + len(m.group(1)), m.group(1)))
|
||
out.append("\x00" * (m.end() - m.start()))
|
||
i = m.end()
|
||
out.append(s[i:])
|
||
return "".join(out), kept

    protected, kept = strip_tables(text)
    # Critical PII (as in the first pass)
    # IPP and exam numbers BEFORE TEL to avoid label collisions
    protected = RE_IPP.sub(lambda m: f"IPP : {PLACEHOLDERS['IPP']}", protected)
    protected = RE_NUM_EXAMEN_PATIENT.sub(
        lambda m: m.group(0).replace(m.group(1), PLACEHOLDERS["DOSSIER"]), protected
    )
    protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
    protected = RE_URL.sub(PLACEHOLDERS["MASK"], protected)
    protected = RE_TEL_SLASH.sub(PLACEHOLDERS["TEL"], protected)
    protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
    protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
    protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)

    # NIR with checksum validation
    def _rescan_nir(m: re.Match) -> str:
        return PLACEHOLDERS["NIR"] if validate_nir(m.group(0)) else m.group(0)
    protected = RE_NIR.sub(_rescan_nir, protected)

    # Newer rules: birth dates, addresses, postal codes
    protected = RE_DATE_NAISSANCE.sub(PLACEHOLDERS["DATE_NAISSANCE"], protected)
    # protected = RE_DATE.sub(PLACEHOLDERS["DATE"], protected)  # disabled
    protected = RE_ADRESSE.sub(PLACEHOLDERS["ADRESSE"], protected)
    protected = RE_ADRESSE_LIEU_DIT.sub(PLACEHOLDERS["ADRESSE"], protected)
    protected = RE_BP.sub(PLACEHOLDERS["ADRESSE"], protected)
    protected = RE_CODE_POSTAL.sub(PLACEHOLDERS["CODE_POSTAL"], protected)
    # Episode number
    protected = RE_EPISODE.sub(PLACEHOLDERS["EPISODE"], protected)
    # Visit / stay number
    protected = RE_VENUE_SEJOUR.sub(PLACEHOLDERS["NDA"], protected)
    # RPPS number
    protected = RE_RPPS.sub(PLACEHOLDERS["RPPS"], protected)

    # FINESS via gazetteer (9-digit numbers matching a real FINESS number)
    if _FINESS_NUMBERS:
        def _rescan_finess(m: re.Match) -> str:
            return PLACEHOLDERS["FINESS"] if m.group(1) in _FINESS_NUMBERS else m.group(0)
        protected = RE_BARE_9DIGITS.sub(_rescan_finess, protected)

    # Facilities (regex)
    protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
    protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
    # Facilities (FINESS Aho-Corasick gazetteer — 116K distinctive names)
    protected = _mask_finess_establishments(protected)
    # Addresses (FINESS Aho-Corasick gazetteer — 28K street names)
    protected = _mask_finess_addresses(protected)

    # Letter-spaced header text: "C E N T R E  H O S P I T A L I E R" → [ETABLISSEMENT]
    _re_spaced = re.compile(r'(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\s){4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]')
    _spaced_kw = {"HOSPITALIER", "HOSPITALIERE", "HOSPITALIERES", "HOSPITALIERS",
                  "CLINIQUE", "HOPITAL", "HÔPITAL", "POLYCLINIQUE",
                  "CENTRE", "ETABLISSEMENT", "MAISON", "RESIDENCE",
                  "EHPAD", "SSR", "USLD", "CHU", "CHRU"}
    for m_sp in _re_spaced.finditer(protected):
        collapsed = m_sp.group(0).replace(" ", "").upper()
        if any(kw in collapsed for kw in _spaced_kw):
            protected = protected.replace(m_sp.group(0), PLACEHOLDERS["ETAB"], 1)

    # Cities (Aho-Corasick gazetteer — INSEE + FINESS)
    if _VILLE_AC is None:
        _build_ville_ac()
    if _VILLE_AC is not None:
        protected, _ = _mask_ville_gazetteers(protected)
    # Hospital departments
    protected = RE_SERVICE.sub(PLACEHOLDERS["MASK"], protected)

    # Birth place / city of residence (accepts anything: cities, INSEE codes, lowercase)
    _re_lieu_rescan = re.compile(r"(Lieu\s+de\s+naissance\s*:\s*)(\S.+)")
    protected = _re_lieu_rescan.sub(lambda m: m.group(1) + PLACEHOLDERS["VILLE"], protected)
    _re_ville_rescan = re.compile(r"(Ville\s+de\s+r[ée]sidence\s*:\s*)(\S.+)")
    protected = _re_ville_rescan.sub(lambda m: m.group(1) + PLACEHOLDERS["VILLE"], protected)

    # Contextual person names (with whitelist)
    wl_sections = set()
    wl_phrases = set()
    if cfg:
        wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
        wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])

    def _rescan_person(m: re.Match) -> str:
        span = m.group(1).strip()
        raw = m.group(0)
        if span in wl_sections or raw in wl_phrases:
            return raw
        tokens = [t for t in span.split() if t]
        if len(tokens) == 1 and len(tokens[0]) <= 4:
            return raw
        # Filter medical terms (stop words)
        clean = [t for t in tokens if t.lower() not in _MEDICAL_STOP_WORDS_SET]
        if not clean:
            return raw
        return raw.replace(span, PLACEHOLDERS["NOM"])

    protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected)

    # Mr/Mme + lone initial: "Mme Z", "Mr R" → mask
    protected = RE_CIVILITE_INITIALE.sub(
        lambda m: m.group(1) + PLACEHOLDERS["NOM"], protected
    )
    # Identifying initials before [NOM]: "Dr T. [NOM]" → "Dr [NOM] [NOM]"
    _re_init_nom = re.compile(r'\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])\.[\s\-]*(\[NOM\])')
    protected = _re_init_nom.sub(r'[NOM] \2', protected)
    # Initials in references: "Ref : JF/VA" → "Ref : [NOM]/[NOM]"
    _re_ref_init = re.compile(r'(?:Ref\s*:\s*|Réf\s*:\s*)([A-Z]{1,3})\s*/\s*([A-Z]{1,3})\b')
    protected = _re_ref_init.sub(
        lambda m: m.group(0)[:m.group(0).index(m.group(1))] + PLACEHOLDERS["NOM"] + "/" + PLACEHOLDERS["NOM"],
        protected,
    )

    # Restore the [TABLES] blocks at their original offsets
    res = list(protected)
    for start, end, payload in kept:
        res[start:end] = list(payload)
    return "".join(res)
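# Illustrative call (hypothetical input; the exact output depends on the RE_*
# patterns defined earlier in the module):
#   selective_rescan("Contact : jean.dupont@exemple.fr - 06 12 34 56 78")
# would return roughly "Contact : [EMAIL] - [TEL]".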


# ----------------- PDF Redaction -----------------

def _search_ocr_words_fuzzy_digits(ocr_words: List[Tuple[str, float, float, float, float]],
                                   token: str, page_rect, min_ratio: float = 0.7) -> list:
    """Fuzzy matching for handwritten numeric identifiers.
    Compares the digit sequences of the VLM token and the OCR words.
    Accepts a match when ≥ min_ratio of the digits agree."""
    token_digits = re.sub(r"[^0-9]", "", token)
    if len(token_digits) < 4:
        return []

    def _to_rect(x0n, y0n, x1n, y1n):
        # Normalised 0→1 coordinates → PDF points
        return fitz.Rect(x0n * page_rect.width, y0n * page_rect.height,
                         x1n * page_rect.width, y1n * page_rect.height)

    rects = []
    for (word, x0n, y0n, x1n, y1n) in ocr_words:
        word_digits = re.sub(r"[^0-9]", "", word)
        if len(word_digits) < 3:
            continue
        # Exact digit match (after cleaning)
        if word_digits == token_digits:
            rects.append(_to_rect(x0n, y0n, x1n, y1n))
            continue
        # Partial match: the token digits contain the OCR digits or vice versa
        if token_digits in word_digits or word_digits in token_digits:
            if min(len(token_digits), len(word_digits)) / max(len(token_digits), len(word_digits)) >= min_ratio:
                rects.append(_to_rect(x0n, y0n, x1n, y1n))
                continue
        # Distance match: character-by-character comparison (Hamming-like)
        if abs(len(word_digits) - len(token_digits)) <= 2:
            shorter, longer = (word_digits, token_digits) if len(word_digits) <= len(token_digits) else (token_digits, word_digits)
            matches = sum(1 for a, b in zip(shorter, longer) if a == b)
            if matches / len(longer) >= min_ratio:
                rects.append(_to_rect(x0n, y0n, x1n, y1n))
    return rects
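# Worked example (hypothetical values): a VLM token "N° 2023-4578" reduces to
# "20234578"; an OCR word "2023-4578" reduces to the same digits → exact match.
# With min_ratio=0.7, an OCR word "2023457" (one dropped digit) still matches
# via the containment branch, since 7/8 = 0.875 ≥ 0.7.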


def _search_ocr_words(ocr_words: List[Tuple[str, float, float, float, float]], token: str, page_rect) -> list:
    """Look up a token among the OCR words of a page.
    Multi-word tokens are searched word by word.
    Returns fitz.Rect objects in PDF points."""
    rects = []
    tokens_to_search = token.split() if " " in token else [token]
    for t in tokens_to_search:
        t_lower = t.lower().strip()
        if not t_lower:
            continue
        for (word, x0n, y0n, x1n, y1n) in ocr_words:
            if word.lower().strip(".,;:!?()") == t_lower:
                rects.append(fitz.Rect(
                    x0n * page_rect.width,
                    y0n * page_rect.height,
                    x1n * page_rect.width,
                    y1n * page_rect.height,
                ))
    return rects
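# Geometry reminder (see the OcrWordMap alias at the top of the module): OCR
# words carry normalised 0→1 coordinates, e.g. ("Dupont", 0.10, 0.20, 0.25, 0.23).
# On an A4 page (595 x 842 pt) the conversion above yields
# Rect(59.5, 168.4, 148.75, 193.66).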


def _search_whole_word(page, token: str) -> list:
    """Search a token as a whole word (not a substring) via get_text('words').
    Avoids the false positives of page.search_for(), which does substring matching.
    Handles compound names (JEAN-PIERRE) that the PDF may split apart."""
    rects = []
    token_lower = token.lower().strip()
    words = page.get_text("words")
    for w in words:
        # w = (x0, y0, x1, y1, word, block_no, line_no, word_no)
        word_text = w[4].strip(".,;:!?()[]{}\"'«»-–—/\\")
        if word_text.lower() == token_lower:
            rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))
    # Fallback for hyphenated compound names (JEAN-PIERRE) split by the PDF
    if not rects and "-" in token:
        parts = [p for p in token.split("-") if p]
        if len(parts) >= 2:
            for p in parts:
                p_lower = p.lower().strip()
                if len(p_lower) < 2:
                    continue
                for w in words:
                    wt = w[4].strip(".,;:!?()[]{}\"'«»-–—/\\")
                    if wt.lower() == p_lower:
                        rects.append(fitz.Rect(w[0], w[1], w[2], w[3]))
    return rects
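# Why whole-word matching matters (same example as the call sites below):
# page.search_for("TATIN") would also hit the substring inside "ATORVASTATINE",
# whereas the word-level comparison above only accepts the standalone token.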


def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, ocr_word_map: Optional[OcrWordMap] = None) -> None:
    if fitz is None:
        raise RuntimeError("PyMuPDF non disponible – installez pymupdf.")
    doc = fitz.open(str(original_pdf))
    # Index hits by page; page == -1 → search on every page
    by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        by_page.setdefault(h.page, []).append(h)
    # Kinds never searched in the PDF (dates are masked in the text only:
    # redacting them in the PDF makes tables unreadable)
    _VECTOR_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"}
    # Kinds prone to substring matching: use _search_whole_word
    _VECTOR_WHOLEWORD_KINDS = {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
                               "EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL",
                               "ETAB_FINESS", "ADDR_FINESS", "NER_PER", "NER_ORG", "NER_LOC"}
    for pno in range(len(doc)):
        page = doc[pno]
        hits = by_page.get(pno, []) + by_page.get(-1, [])
        if not hits:
            continue
        # Deduplicate tokens: each token is searched only once per page
        seen_tokens: set = set()
        all_rects = []
        for h in hits:
            token = h.original.strip()
            if not token:
                continue
            if h.kind in _VECTOR_SKIP_KINDS:
                continue
            if token in seen_tokens:
                continue
            seen_tokens.add(token)
            # --- Name/entity kinds: whole-word search to avoid substring
            # matching (e.g. "TATIN" inside "ATORVASTATINE") ---
            if h.kind in _VECTOR_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":
                if h.kind != "NOM_FORCE" and token.lower() in _MEDICAL_STOP_WORDS_SET:
                    continue
                # Absolute whitelist: overrides even NOM_FORCE
                if token.lower() in _WHITELIST_NEVER_MASK_TOKENS:
                    continue
                if " " not in token:
                    rects = _search_whole_word(page, token)
                    if not rects and ocr_word_map and pno in ocr_word_map:
                        rects = _search_ocr_words(ocr_word_map[pno], token, page.rect)
                    all_rects.extend(rects)
                else:
                    # Word-boundary check (same as the raster path)
                    rects = page.search_for(token)
                    if rects:
                        page_text = page.get_text()
                        if not re.search(r"(?<![A-Za-zÀ-ÿ])" + re.escape(token) + r"(?![A-Za-zÀ-ÿ])",
                                         page_text, re.IGNORECASE):
                            rects = []
                    if not rects:
                        for word in token.split():
                            word = word.strip(" .-'")
                            if len(word) < 4 or word.lower() in _MEDICAL_STOP_WORDS_SET:
                                continue
                            # Absolute whitelist on sub-words
                            if word.lower() in _WHITELIST_NEVER_MASK_TOKENS:
                                continue
                            rects.extend(_search_whole_word(page, word))
                            if not rects and ocr_word_map and pno in ocr_word_map:
                                rects.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
                    all_rects.extend(rects)
                continue
            rects = page.search_for(token)
            if not rects and h.kind in {"NIR", "IBAN", "TEL"}:
                compact = re.sub(r"\s+", "", token)
                if compact != token:
                    rects = page.search_for(compact)
            if not rects and ocr_word_map and pno in ocr_word_map:
                rects = _search_ocr_words(ocr_word_map[pno], token, page.rect)
            all_rects.extend(rects)
        # Apply all annotations at once (keeps search_for fast)
        for r in all_rects:
            page.add_redact_annot(r, fill=(0, 0, 0))
        try:
            page.apply_redactions()
        except Exception:
            pass
    doc.save(str(out_pdf), deflate=True, garbage=4, clean=True, incremental=False)
    doc.close()
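# Minimal usage sketch (hypothetical paths and hits; PiiHit fields as used above):
#   hits = [PiiHit(page=0, kind="NOM", original="DUPONT", placeholder="[NOM]"),
#           PiiHit(page=-1, kind="TEL", original="06 12 34 56 78", placeholder="[TEL]")]
#   redact_pdf_vector(Path("in.pdf"), hits, Path("out.pdf"))
# The page=-1 hit is searched on every page; the NOM hit only on page 0.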


def _rasterize_page(args):
    """Parallel worker: rasterise one page and draw the black rectangles."""
    pdf_path_str, pno, rects_tuples, dpi, ogc_label, jpeg_quality, image_rects_tuples = args
    doc = fitz.open(pdf_path_str)
    src = doc[pno]
    rect_w, rect_h = src.rect.width, src.rect.height
    zoom = dpi / 72.0
    pix = src.get_pixmap(matrix=fitz.Matrix(zoom, zoom), annots=False)
    img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
    draw = ImageDraw.Draw(img)
    shrink = 1.5
    for (x0, y0, x1, y1) in rects_tuples:
        rx0 = x0 * zoom + shrink
        ry0 = y0 * zoom
        rx1 = x1 * zoom - shrink
        ry1 = y1 * zoom
        if rx1 > rx0:
            draw.rectangle([rx0, ry0, rx1, ry1], fill=(0, 0, 0))
    # Black out embedded images (logos, signatures, screenshots)
    for (x0, y0, x1, y1) in image_rects_tuples:
        draw.rectangle([x0 * zoom, y0 * zoom, x1 * zoom, y1 * zoom], fill=(0, 0, 0))
    # Detect and black out barcodes and QR codes.
    # Filter: ignore pyzbar false positives on tables
    # (cell grids are sometimes read as barcodes).
    _MIN_BARCODE_AREA = 2000 * zoom * zoom  # minimum ~2000 px² at original size
    try:
        from pyzbar.pyzbar import decode as _pyzbar_decode
        for symbol in _pyzbar_decode(img):
            r = symbol.rect
            area = r.width * r.height
            if area < _MIN_BARCODE_AREA:
                continue  # too small — probably a FP on a table cell
            margin = int(5 * zoom)
            draw.rectangle([r.left - margin, r.top - margin,
                            r.left + r.width + margin, r.top + r.height + margin],
                           fill=(0, 0, 0))
    except Exception:
        pass  # pyzbar is optional
    if ogc_label:
        from PIL import ImageFont
        font_size = int(14 * zoom)
        try:
            font = ImageFont.truetype("/usr/share/fonts/truetype/dejavu/DejaVuSans-Bold.ttf", font_size)
        except Exception:
            font = ImageFont.load_default()
        text = ogc_label if ogc_label.upper().startswith("OGC") else f"OGC: {ogc_label}"
        bbox = draw.textbbox((0, 0), text, font=font)
        tw, th = bbox[2] - bbox[0], bbox[3] - bbox[1]
        margin = int(10 * zoom)
        x = img.width - tw - margin
        y = margin
        draw.rectangle([x - 4, y - 2, x + tw + 4, y + th + 2], fill=(255, 255, 255))
        draw.text((x, y), text, fill=(0, 0, 0), font=font)
    buf = io.BytesIO()
    if jpeg_quality and jpeg_quality > 0:
        img.save(buf, format="JPEG", quality=jpeg_quality, optimize=True)
    else:
        img.save(buf, format="PNG")
    doc.close()
    return pno, buf.getvalue(), rect_w, rect_h
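# Shape of the task tuple consumed above (built in redact_pdf_raster below):
#   (pdf_path_str, page_no, [(x0, y0, x1, y1), ...], dpi, ogc_label,
#    jpeg_quality, [(x0, y0, x1, y1), ...])
# Rectangles are in PDF points; the worker scales them by zoom = dpi / 72.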


def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dpi: int = 120, ogc_label: Optional[str] = None, ocr_word_map: Optional[OcrWordMap] = None, jpeg_quality: int = 80) -> None:
    if fitz is None:
        raise RuntimeError("PyMuPDF non disponible – installez pymupdf.")
    doc = fitz.open(str(original_pdf))
    all_rects: Dict[int, List["fitz.Rect"]] = {}
    _RASTER_SKIP_KINDS = {"EDS_DATE", "EDS_DATE_NAISSANCE", "EDS_SECU", "EDS_TEL"}
    # Kinds prone to substring matching: use _search_whole_word
    _RASTER_WHOLEWORD_KINDS = {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "EDS_NOM", "EDS_PRENOM",
                               "EDS_HOPITAL", "EDS_VILLE", "ETAB", "ETAB_GLOBAL",
                               "ETAB_FINESS", "ADDR_FINESS", "NER_PER", "NER_ORG", "NER_LOC"}
    _VLM_NUMERIC_KINDS = {"VLM_NUM_PATIENT", "VLM_NUM_LOT", "VLM_NUM_ORD", "VLM_NDA",
                          "VLM_NIR", "VLM_IPP", "VLM_RPPS"}
    by_page: Dict[int, List[PiiHit]] = {}
    for h in audit:
        by_page.setdefault(h.page, []).append(h)
    for pno in range(len(doc)):
        page = doc[pno]
        rects = []
        seen_tokens: set = set()
        hits = by_page.get(pno, []) + by_page.get(-1, [])
        # Full-page mask when FULL_PAGE_MASK was flagged (unreadable handwritten page)
        if any(h.kind == "FULL_PAGE_MASK" and h.page == pno for h in hits):
            margin = 5  # points — thin border around the mask
            rects.append(fitz.Rect(margin, margin, page.rect.width - margin, page.rect.height - margin))
            all_rects[pno] = rects
            continue
        for h in hits:
            token = h.original.strip()
            if not token or h.kind in _RASTER_SKIP_KINDS:
                continue
            # Skip very short tokens (lone initials, "N", "S", "de") which
            # generate FPs by matching inside DPI labels
            if len(token) < 3 and h.kind in _RASTER_WHOLEWORD_KINDS:
                continue
            if token in seen_tokens:
                continue
            seen_tokens.add(token)
            # --- Name/entity kinds: whole-word search to avoid substring
            # matching (e.g. "TATIN" inside "ATORVASTATINE") ---
            if h.kind in _RASTER_WHOLEWORD_KINDS or h.kind == "NOM_FORCE":
                # NOM_FORCE bypasses the stop-word filter (name confirmed by Dr/Mme context)
                if h.kind != "NOM_FORCE" and token.lower() in _MEDICAL_STOP_WORDS_SET:
                    continue
                if " " not in token:
                    # Single-word token: search as a whole word
                    found_ww = _search_whole_word(page, token)
                    if not found_ww and ocr_word_map and pno in ocr_word_map:
                        found_ww = _search_ocr_words(ocr_word_map[pno], token, page.rect)
                    rects.extend(found_ww)
                else:
                    # Multi-word token: search the full string, then check
                    # word boundaries to avoid substring matching
                    # (e.g. "TATINE EG" found inside "ATORVASTATINE EG")
                    found_multi = page.search_for(token)
                    if found_multi:
                        # Keep the rects only if the token sits on word
                        # boundaries somewhere in the raw page text
                        page_text = page.get_text()
                        if not re.search(r"(?<![A-Za-zÀ-ÿ])" + re.escape(token) + r"(?![A-Za-zÀ-ÿ])",
                                         page_text, re.IGNORECASE):
                            found_multi = []
                    if not found_multi:
                        # Fallback: search each word as a whole word
                        for word in token.split():
                            word = word.strip(" .-'")
                            if len(word) < 4 or word.lower() in _MEDICAL_STOP_WORDS_SET:
                                continue
                            found_multi.extend(_search_whole_word(page, word))
                            if not found_multi and ocr_word_map and pno in ocr_word_map:
                                found_multi.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
                    rects.extend(found_multi)
                continue
            found = page.search_for(token)
            if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}:
                compact = re.sub(r"\s+", "", token)
                found = page.search_for(compact)
            if not found and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM",
                                                         "VLM_NOM", "VLM_ETAB", "VLM_SERVICE"}:
                for word in token.split():
                    word = word.strip(" .-'")
                    if len(word) < 3 or word.lower() in _MEDICAL_STOP_WORDS_SET:
                        continue
                    found.extend(page.search_for(word))
                    # OCR fallback per word
                    if not found and ocr_word_map and pno in ocr_word_map:
                        found.extend(_search_ocr_words(ocr_word_map[pno], word, page.rect))
            if not found and ocr_word_map and pno in ocr_word_map:
                found = _search_ocr_words(ocr_word_map[pno], token, page.rect)
            # Fuzzy matching for VLM numeric identifiers (handwriting)
            if not found and h.kind in _VLM_NUMERIC_KINDS and ocr_word_map and pno in ocr_word_map:
                found = _search_ocr_words_fuzzy_digits(ocr_word_map[pno], token, page.rect)
            rects.extend(found)
        all_rects[pno] = rects

    # Phase 2: parallel rasterisation (ProcessPoolExecutor)
    n_pages = len(doc)
    rects_as_tuples = {
        pno: [(r.x0, r.y0, r.x1, r.y1) for r in rects]
        for pno, rects in all_rects.items()
    }
    # Extract the positions of embedded images (logos, screenshots, etc.).
    # Filter: images > 30x30 points (skip icons/bullets),
    # BUT exclude full-page images (> 70% of page area) = scanned documents.
    _MIN_IMG_DIM = 30
    _MAX_PAGE_COVERAGE = 0.70  # above this = scanned-document image, do not black out
    image_rects_by_page: Dict[int, list] = {}
    for pno in range(n_pages):
        page = doc[pno]
        page_area = page.rect.width * page.rect.height
        img_rects = []
        for img_item in page.get_images(full=True):
            xref = img_item[0]
            try:
                for r in page.get_image_rects(xref):
                    if r.is_empty or r.is_infinite:
                        continue
                    if r.width >= _MIN_IMG_DIM and r.height >= _MIN_IMG_DIM:
                        # Exclude full-page images (scanned document)
                        img_area = r.width * r.height
                        if page_area > 0 and img_area / page_area > _MAX_PAGE_COVERAGE:
                            continue
                        img_rects.append((r.x0, r.y0, r.x1, r.y1))
            except Exception:
                continue
        image_rects_by_page[pno] = img_rects
    doc.close()  # close BEFORE forking

    tasks = [
        (str(original_pdf), pno, rects_as_tuples.get(pno, []), dpi, ogc_label, jpeg_quality,
         image_rects_by_page.get(pno, []))
        for pno in range(n_pages)
    ]

    # Frozen mode (PyInstaller --onefile): ProcessPoolExecutor re-launches the
    # executable and opens ghost GUI windows → sequential processing is mandatory
    if getattr(sys, 'frozen', False) or n_pages <= 2:
        results = sorted([_rasterize_page(t) for t in tasks], key=lambda x: x[0])
    else:
        n_workers = min(n_pages, os.cpu_count() or 4)
        with ProcessPoolExecutor(max_workers=n_workers) as pool:
            results = sorted(pool.map(_rasterize_page, tasks), key=lambda x: x[0])

    # Final assembly (sequential, fast)
    out = fitz.open()
    for pno, img_bytes, w, h in results:
        dst = out.new_page(width=w, height=h)
        dst.insert_image(fitz.Rect(0, 0, w, h), stream=img_bytes)
    out.save(str(out_pdf), deflate=True, garbage=4, clean=True)
    out.close()


# ----------------- VLM for scanned PDFs -----------------

def _apply_vlm_on_scanned_pdf(pdf_path: Path, anon: AnonResult, ocr_word_map: OcrWordMap, vlm_manager) -> None:
    """Use a VLM (Ollama) to visually detect PII on each page of a scanned PDF.
    Detected entities are added to anon.audit and to the pseudonymised text.
    Pages with very few OCR words (likely handwritten) are fully masked
    instead of analysed: the VLM is too slow and prone to hallucination there."""
    from vlm_manager import VLM_CATEGORY_MAP
    doc = fitz.open(str(pdf_path))
    # Collect already-detected PII as context for the VLM
    existing_pii = list({h.original.strip() for h in anon.audit if h.original.strip()})

    # Categories holding numeric identifiers (fuzzy matching)
    _NUMERIC_CATS = {"NUMERO_PATIENT", "NUMERO_LOT", "NUMERO_ORDONNANCE", "NUMERO_SEJOUR",
                     "NDA", "NIR", "IPP", "RPPS"}
    # Categories split into words (names, departments, facilities)
    _SPLIT_CATS = {"NOM", "PRENOM", "ETABLISSEMENT", "SERVICE"}

    for pno in range(len(doc)):
        pix = doc[pno].get_pixmap(dpi=150)
        img = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)

        # Handwritten-page detection: few OCR words = handwritten / badly oriented scan
        ocr_count = len(ocr_word_map.get(pno, []))
        is_handwritten_page = ocr_count < 100

        # Handwritten pages: mask the whole page directly
        if is_handwritten_page and ocr_count > 0:
            anon.audit.append(PiiHit(page=pno, kind="FULL_PAGE_MASK", original="page manuscrite",
                                     placeholder=PLACEHOLDERS["MASK"]))
            log.info("VLM page %d : masquage total direct (OCR=%d mots)", pno, ocr_count)
            continue

        # Readable pages: VLM analysis
        best_entities = []
        try:
            best_entities = vlm_manager.analyze_page_image(img, page_number=pno,
                                                           existing_pii=existing_pii[:20])
        except Exception:
            best_entities = []

        for ent in best_entities:
            cat = ent.get("categorie", "").upper()
            texte = ent.get("texte", "").strip()
            conf = ent.get("confiance", 0.0)
            if not texte or conf < 0.3:
                continue
            if cat not in VLM_CATEGORY_MAP:
                continue
            kind, placeholder_key = VLM_CATEGORY_MAP[cat]
            placeholder = PLACEHOLDERS.get(placeholder_key, PLACEHOLDERS["MASK"])

            if cat in _SPLIT_CATS:
                # Split into words for better OCR matching
                for word in texte.split():
                    word = word.strip(" .-'(),")
                    if len(word) < 2 or word.lower() in _MEDICAL_STOP_WORDS_SET:
                        continue
                    anon.audit.append(PiiHit(page=pno, kind=kind, original=word, placeholder=placeholder))
            else:
                anon.audit.append(PiiHit(page=pno, kind=kind, original=texte, placeholder=placeholder))
                # For numeric identifiers, also add the cleaned token (digits only)
                if cat in _NUMERIC_CATS:
                    digits_only = re.sub(r"[^0-9]", "", texte)
                    if digits_only and digits_only != texte:
                        anon.audit.append(PiiHit(page=pno, kind=kind, original=digits_only, placeholder=placeholder))

            # Replace in the pseudonymised text when found
            try:
                anon.text_out = re.sub(rf"\b{re.escape(texte)}\b", placeholder, anon.text_out)
            except re.error:
                anon.text_out = anon.text_out.replace(texte, placeholder)

    doc.close()
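# Expected shape of a VLM entity as consumed above (illustrative values; the
# keys are the ones read via ent.get()):
#   {"categorie": "NOM", "texte": "DUPONT", "confiance": 0.85}
# Entities below confidence 0.3, or with a category absent from
# VLM_CATEGORY_MAP, are dropped.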


# ----------------- Orchestration -----------------

def process_pdf(
    pdf_path: Path,
    out_dir: Path,
    make_vector_redaction: bool = True,
    also_make_raster_burn: bool = False,
    config_path: Optional[Path] = None,
    use_hf: bool = False,
    ner_manager=None,
    ner_thresholds=None,
    ogc_label: Optional[str] = None,
    vlm_manager=None,
    gliner_manager=None,
    camembert_manager=None,
) -> Dict[str, str]:
    out_dir.mkdir(parents=True, exist_ok=True)
    cfg = load_dictionaries(config_path)
    pages_text, tables_lines, ocr_used, ocr_word_map = extract_text_with_fallback_ocr(pdf_path)

    # 1) Regex rules + NER-first cross-validation
    # Pass the NER managers so that anonymise_document_regex runs NER on the
    # original (unmasked) text and validates the names extracted by regex
    _eds_mgr_for_regex = ner_manager if (EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager)) else None
    anon = anonymise_document_regex(
        pages_text, tables_lines, cfg,
        eds_pseudo_mgr=_eds_mgr_for_regex,
        gliner_mgr=gliner_manager,
        camembert_mgr=camembert_manager,
    )

    # 1b) VLM (optional) — scanned PDFs only
    if ocr_used and vlm_manager is not None and VlmManager is not None:
        try:
            if vlm_manager.is_loaded():
                _apply_vlm_on_scanned_pdf(pdf_path, anon, ocr_word_map, vlm_manager)
        except Exception:
            pass  # graceful degradation

    # 2) NER (optional) — on the narrative
    final_text = anon.text_out
    hf_hits: List[PiiHit] = []
    if use_hf and ner_manager is not None and ner_manager.is_loaded():
        # Detect the manager type and call the matching function
        if EdsPseudoManager is not None and isinstance(ner_manager, EdsPseudoManager):
            final_text, hf_hits = apply_eds_pseudo_on_narrative(final_text, cfg, ner_manager, gliner_mgr=gliner_manager, camembert_mgr=camembert_manager)
        else:
            final_text, hf_hits = apply_hf_ner_on_narrative(final_text, cfg, ner_manager, ner_thresholds)
    anon.audit.extend(hf_hits)

    # 3) Selective rescan
    final_text = selective_rescan(final_text, cfg=cfg)

    # 3b) Post-masking cleanup: orphan postal codes (5 digits glued to a
    # placeholder) and phone numbers fragmented across lines
    _re_cp_orphan = re.compile(r"(\[(?:ADRESSE|NOM|VILLE)\])\s*(\d{5})\b")
    def _clean_cp_orphan(m):
        anon.audit.append(PiiHit(-1, "CODE_POSTAL", m.group(2), PLACEHOLDERS["CODE_POSTAL"]))
        return m.group(1) + PLACEHOLDERS["CODE_POSTAL"]
    final_text = _re_cp_orphan.sub(_clean_cp_orphan, final_text)

    # Fragmented phone numbers: "0X XX XX XX\nXX" cut at end of line (continuation on the very next line)
    _re_tel_frag = re.compile(r"((?:\+33\s?|0)\d(?:[ .-]?\d){6,7})\s*\n\s*(\d{2}(?!\d))")
    def _clean_tel_frag(m):
        full = m.group(1).replace(" ", "").replace(".", "").replace("-", "") + m.group(2)
        if len(full.replace("+33", "0")) == 10:
            anon.audit.append(PiiHit(-1, "TEL", m.group(0).strip(), PLACEHOLDERS["TEL"]))
            return PLACEHOLDERS["TEL"] + "\n"
        return m.group(0)
    final_text = _re_tel_frag.sub(_clean_tel_frag, final_text)

    # Incomplete phone numbers at end of line (8 or 9 digits in 0X XX XX XX form): mask the visible part
    _re_tel_partial = re.compile(r"(?<!\d)((?:\+33\s?|0)\d(?:[ .-]?\d){5,7})(?!\d)\s*$", re.MULTILINE)
    def _clean_tel_partial(m):
        digits = re.sub(r"[ .\-]", "", m.group(1))
        if 8 <= len(digits) <= 9:
            anon.audit.append(PiiHit(-1, "TEL", m.group(0).strip(), PLACEHOLDERS["TEL"]))
            return PLACEHOLDERS["TEL"]
        return m.group(0)
    final_text = _re_tel_partial.sub(_clean_tel_partial, final_text)

    # 3c) Identifying initials before [NOM]: "Dr T. [NOM]" → "Dr [NOM] [NOM]"
    _RE_INITIAL_BEFORE_NOM = re.compile(
        r'\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])\.[\s\-]*(\[NOM\])'
    )
    def _clean_initial_before_nom(m):
        anon.audit.append(PiiHit(-1, "NOM_INITIAL", m.group(1) + ".", PLACEHOLDERS["NOM"]))
        return PLACEHOLDERS["NOM"] + " " + m.group(2)
    final_text = _RE_INITIAL_BEFORE_NOM.sub(_clean_initial_before_nom, final_text)

    # 3d) Initials in references: "Ref : JF/VA", "Réf : AD/EP" → "Ref : [NOM]/[NOM]"
    _RE_REF_INITIALS = re.compile(
        r'(?:Ref\s*:\s*|Réf\s*:\s*)([A-Z]{1,3})\s*/\s*([A-Z]{1,3})\b'
    )
    def _clean_ref_initials(m):
        anon.audit.append(PiiHit(-1, "NOM_INITIAL", m.group(1), PLACEHOLDERS["NOM"]))
        anon.audit.append(PiiHit(-1, "NOM_INITIAL", m.group(2), PLACEHOLDERS["NOM"]))
        prefix = m.group(0)[:m.group(0).index(m.group(1))]
        return prefix + PLACEHOLDERS["NOM"] + "/" + PLACEHOLDERS["NOM"]
    final_text = _RE_REF_INITIALS.sub(_clean_ref_initials, final_text)

    # 4) Consolidation: propagate detected PII to every page (page=-1)
    # so that the PDF redaction searches them everywhere (repeated sidebar, etc.)

    # 4a) Names: extract the individual tokens
    _nom_kinds = {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}
    _global_name_tokens: set = set()
    for h in anon.audit:
        if h.kind not in _nom_kinds:
            continue
        for word in h.original.split():
            word = word.strip(" .-'")
            if len(word) < 3:
                continue
            if word.lower() in _MEDICAL_STOP_WORDS_SET:
                continue
            if word.lower() in _DPI_LABELS_SET:
                continue
            if not word[0].isupper():
                continue
            _global_name_tokens.add(word)

    # 4a-bis) Companion names: when a known token is followed/preceded by an
    # unknown uppercase word in the raw text, that word is a name too
    # (e.g. "Diego OLIVER" → OLIVER is a name)
    raw_full = "\n\n".join(pages_text)
    _companion_tokens: set = set()
    for token in _global_name_tokens:
        # Known token followed by an ALL-CAPS word
        for m in re.finditer(rf"\b{re.escape(token)}\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{4,}})\b", raw_full):
            candidate = m.group(1)
            if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
                    and candidate not in _global_name_tokens
                    and candidate not in _COMPANION_BLACKLIST_SET):
                _companion_tokens.add(candidate)
        # ALL-CAPS word followed by the known token
        for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{4,}})\s+{re.escape(token)}\b", raw_full):
            candidate = m.group(1)
            if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
                    and candidate not in _global_name_tokens
                    and candidate not in _COMPANION_BLACKLIST_SET):
                _companion_tokens.add(candidate)
    _global_name_tokens.update(_companion_tokens)

    # Drop SHORT sub-parts of compound names (JEAN when JEAN-PIERRE exists).
    # Keep long parts (>= 5 chars): the text may split them across lines.
    _compound = {t for t in _global_name_tokens if "-" in t}
    _parts_to_drop = set()
    for comp in _compound:
        for part in comp.split("-"):
            part = part.strip()
            if 2 <= len(part) < 5 and part in _global_name_tokens:
                _parts_to_drop.add(part)
    _global_name_tokens -= _parts_to_drop

    # 4a-ter) Final filtering of global tokens: reject words that do not look like proper names
    # - Common French words (lowercase initials already filtered upstream)
    # - ALL-CAPS <= 4 chars confirmed by a single source only
    _nom_kind_counts: Dict[str, set] = {}
    for h in anon.audit:
        if h.kind in _nom_kinds:
            for word in h.original.split():
                word = word.strip(" .-'")
                if word:
                    _nom_kind_counts.setdefault(word, set()).add(h.kind)
    _filtered_global: set = set()
    for token in _global_name_tokens:
        # Short ALL-CAPS (<= 4) with a single source → probably an abbreviation
        if token.isupper() and len(token) <= 4 and len(_nom_kind_counts.get(token, set())) < 2:
            continue
        _filtered_global.add(token)
    _global_name_tokens = _filtered_global

    # DISABLED: NOM_GLOBAL produced 670 FP with 0 TP (100% false positives);
    # global propagation of names is too aggressive.
    # for token in _global_name_tokens:
    #     anon.audit.append(PiiHit(page=-1, kind="NOM_GLOBAL", original=token, placeholder=PLACEHOLDERS["NOM"]))

    # 4b) SELECTIVE global propagation: critical PII only.
    # Critical PII (DATE_NAISSANCE, NIR, IPP, EMAIL) are propagated to every
    # page to avoid leaks on multi-page documents (e.g. surgery reports)
    _CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS", "DOSSIER", "NDA", "EPISODE"}

    _global_pii: Dict[str, set] = {}
    for h in anon.audit:
        # Collect ALL types for analysis, but only propagate the critical ones
        if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
                      "VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP", "DOSSIER", "NDA",
                      "force_term", "force_regex", "FINESS"}:
            # Special handling for DATE_NAISSANCE: extract the bare date and generate every variation
            if h.kind == "DATE_NAISSANCE":
                # Extract the bare date (DD/MM/YYYY or DD/MM/YY)
                date_match = re.search(r'(\d{1,2})[/.\-\s]+(\d{1,2})[/.\-\s]+(\d{2,4})', h.original)
                if date_match:
                    day, month, year = date_match.groups()
                    # Normalise the components (zero-pad when needed)
                    day = day.zfill(2)
                    month = month.zfill(2)
                    # Generate every separator variation
                    date_variations = [
                        f"{day}/{month}/{year}",
                        f"{day}.{month}.{year}",
                        f"{day}-{month}-{year}",
                        f"{day} {month} {year}",
                    ]
                    for var in date_variations:
                        _global_pii.setdefault(h.kind, set()).add(var)
                else:
                    # Fallback: add as-is when nothing matches
                    _global_pii.setdefault(h.kind, set()).add(h.original.strip())
            else:
                _global_pii.setdefault(h.kind, set()).add(h.original.strip())

    # Propagate ONLY the critical PII (avoids the 951 FP of the other types)
    for kind, values in _global_pii.items():
        if kind not in _CRITICAL_PII_TYPES:
            continue  # skip non-critical PII (TEL, ADRESSE, etc.)

        placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"])
        for val in values:
            if not val or len(val) < 3:  # skip values that are too short
                continue
            anon.audit.append(PiiHit(page=-1, kind=f"{kind}_GLOBAL", original=val, placeholder=placeholder))

    log.info("Propagation globale sélective : %d types critiques propagés",
             sum(1 for k in _global_pii.keys() if k in _CRITICAL_PII_TYPES))

    # 4e) Apply the global tokens to the pseudonymised text
    _GLOBAL_SKIP_KINDS = {"EDS_DATE_GLOBAL"}
    for h in anon.audit:
        if h.page != -1:
            continue
        if not h.kind.endswith("_GLOBAL"):
            continue
        if h.kind in _GLOBAL_SKIP_KINDS:
            continue
        token = h.original.strip()
        if not token or len(token) < 4:
            continue
        # trackare guard: very short NOM_GLOBAL (<= 4) may mask diagnostic codes/acronyms
        if anon.is_trackare and h.kind == "NOM_GLOBAL" and len(token) <= 4:
            continue

        try:
            # Special handling for DATE_NAISSANCE_GLOBAL: format variations and context
            if h.kind == "DATE_NAISSANCE_GLOBAL":
                # Extract the date components (DD/MM/YYYY or variations)
                date_match = re.search(r'(\d{1,2})[/.\-\s]+(\d{1,2})[/.\-\s]+(\d{2,4})', token)
                if date_match:
                    day, month, year = date_match.groups()
                    # Flexible pattern accepting any separator:
                    # [\s/.\-]+ accepts space, slash, dot, dash (one or more)
                    date_pattern = rf'{day}[\s/.\-]+{month}[\s/.\-]+{year}'

                    # Multi-pass replacement to cover every case
                    # Pass 1: with "Né(e) le" context (case-insensitive)
                    final_text = re.sub(
                        rf'Né(?:e)?\s+le\s+{date_pattern}',
                        h.placeholder,
                        final_text,
                        flags=re.IGNORECASE
                    )
                    # Pass 2: without context (bare date)
                    final_text = re.sub(
                        rf'\b{date_pattern}\b',
                        h.placeholder,
                        final_text,
                        flags=re.IGNORECASE
                    )
                continue

            # Special handling for force_term: case-insensitive replacement with word boundaries
            if h.kind == "force_term_GLOBAL":
                # Escape the special characters but keep the flexibility
                pat = re.escape(token)
                final_text = re.sub(rf'\b{pat}\b', h.placeholder, final_text, flags=re.IGNORECASE)
                continue

            # Standard handling for the other types
            pat = re.escape(token)
            # Compound names: tolerate line breaks/spaces around the hyphen
            if "-" in token:
                pat = pat.replace(r"\-", r"\-\s*")
            # Dates: tolerate separator variations. NB: re.escape() does not
            # escape "/", so escaped dots and raw slashes are rewritten in a
            # single regex pass (sequential str.replace calls would corrupt
            # the character class inserted by the first one).
            if "/" in token or "." in token:
                pat = re.sub(r"\\\.|/", r"[\\s/.\\-]", pat)

            final_text = re.sub(rf"\b{pat}\b", h.placeholder, final_text, flags=re.IGNORECASE)
        except re.error:
            final_text = final_text.replace(token, h.placeholder)

    # Log OCR usage in the audit
    if ocr_used:
        anon.audit.insert(0, PiiHit(page=-1, kind="OCR_USED", original="docTR", placeholder=""))

    # Filter hospital false positives
    if _HOSPITAL_FILTER_AVAILABLE:
        try:
            hospital_filter = HospitalFilter()
            original_count = len(anon.audit)

            # Convert the PiiHit objects to dicts for the filter
            detections = [
                {
                    'kind': hit.kind,
                    'original': hit.original,
                    'page': hit.page
                }
                for hit in anon.audit
            ]

            # Filter (pass the is_trackare flag)
            filtered_detections = hospital_filter.filter_detections(
                detections,
                pdf_path.name,
                is_trackare=anon.is_trackare
            )

            # Rebuild anon.audit
            filtered_audit = []
            for det in filtered_detections:
                # Find the matching original PiiHit
                for hit in anon.audit:
                    if (hit.kind == det['kind'] and
                            hit.original == det['original'] and
                            hit.page == det['page']):
                        filtered_audit.append(hit)
                        break

            anon.audit = filtered_audit
            filtered_count = original_count - len(anon.audit)

            if filtered_count > 0:
                log.info("Filtre hospitalier : %d faux positifs éliminés", filtered_count)
        except Exception as e:
            log.warning("Erreur lors du filtrage hospitalier : %s", e)

    # Safety net: drop any residual [TABLES] block (should no longer happen)
    final_text = re.sub(r"\n*\[TABLES\].*?\[/TABLES\]\n*", "\n", final_text, flags=re.DOTALL)

    # Double-bracket cleanup: [[PLACEHOLDER]] → [PLACEHOLDER] (artefact when the
    # original PDF already had brackets around the masked value)
    _RE_BRACKET_CLEAN = re.compile(
        r"\[+(\[(?:NOM|TEL|EMAIL|VILLE|ADRESSE|CODE_POSTAL|FINESS|ETABLISSEMENT|MASK|IPP|"
        r"DOSSIER|NDA|EPISODE|RPPS|DATE_NAISSANCE|AGE|NIR|IBAN|OGC)\])\]+"
    )
    final_text = _RE_BRACKET_CLEAN.sub(r"\1", final_text)

    # 6) Absolute whitelist: drop hits matching a whitelist term from the GUI
    # (YAML key whitelist_phrases). Safety net after every detection
    # mechanism — prevents DUPONT (whitelisted) from being masked in the
    # PDF even if regex/NER/cross-validation added it to the audit.
    if _WHITELIST_NEVER_MASK_TOKENS:
        _NAME_LIKE_KINDS = {
            "NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NOM_FORCE", "NOM_INITIAL",
            "EDS_NOM", "EDS_PRENOM", "EDS_HOPITAL", "EDS_VILLE",
            "ETAB", "ETAB_GLOBAL", "ETAB_FINESS", "ADDR_FINESS",
            "NER_PER", "NER_ORG", "NER_LOC",
            "VILLE", "force_term", "force_term_GLOBAL",
        }
        before = len(anon.audit)
        kept = []
        removed_tokens: set = set()
        for h in anon.audit:
            if h.kind not in _NAME_LIKE_KINDS:
                kept.append(h)
                continue
            tok = (h.original or "").strip()
            if not tok:
                kept.append(h)
                continue
            tok_lower = tok.lower()
            # Whole phrase whitelisted → drop
            if tok_lower in _WHITELIST_NEVER_MASK_PHRASES:
                removed_tokens.add(tok)
                continue
            # At least one whitelisted sub-token → drop the hit (the
            # non-whitelisted sub-tokens are already covered by other hits if needed)
            sub = [s for s in re.split(r"[\s\-']+", tok_lower) if s]
            if any(s in _WHITELIST_NEVER_MASK_TOKENS for s in sub):
                removed_tokens.add(tok)
                continue
            kept.append(h)
        anon.audit = kept
        if before != len(anon.audit):
            log.info("Whitelist : %d hit(s) filtré(s) (%s)",
                     before - len(anon.audit),
                     ", ".join(sorted(removed_tokens)[:10]))

    # Outputs
    base = pdf_path.stem
    txt_path = out_dir / f"{base}.pseudonymise.txt"
    audit_path = out_dir / f"{base}.audit.jsonl"
    txt_path.write_text(final_text, encoding="utf-8")

    # Filter the global-propagation entries (page=-1) before writing the audit:
    # they drive the text replacement but are not real detections
    audit_for_file = [hit for hit in anon.audit if hit.page != -1]

    with audit_path.open("w", encoding="utf-8") as f:
        for hit in audit_for_file:
            f.write(json.dumps(hit.__dict__, ensure_ascii=False) + "\n")
    outputs = {"text": str(txt_path), "audit": str(audit_path)}

    # PDFs
    if make_vector_redaction and fitz is not None:
        vec_path = out_dir / f"{base}.redacted_vector.pdf"
        try:
            redact_pdf_vector(pdf_path, anon.audit, vec_path, ocr_word_map=ocr_word_map)
            outputs["pdf_vector"] = str(vec_path)
        except Exception:
            pass
    if also_make_raster_burn and fitz is not None:
        ras_path = out_dir / f"{base}.redacted_raster.pdf"
        redact_pdf_raster(pdf_path, anon.audit, ras_path, ogc_label=ogc_label, ocr_word_map=ocr_word_map)
        outputs["pdf_raster"] = str(ras_path)
    return outputs
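# Shape of the returned dict (keys depend on the flags passed in; paths are
# illustrative):
#   {"text": "out/doc.pseudonymise.txt", "audit": "out/doc.audit.jsonl",
#    "pdf_vector": "out/doc.redacted_vector.pdf",
#    "pdf_raster": "out/doc.redacted_raster.pdf"}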


def process_pdfs_batch(
    pdf_paths: List[Path],
    out_dir: Path,
    max_workers: Optional[int] = None,
    **kwargs,
) -> List[Dict[str, str]]:
    """Process several PDFs in parallel (ProcessPoolExecutor).

    Only works when ner_manager=None (NER models are not picklable).
    With NER active, PDFs stay sequential but still benefit from the
    page-level parallelism of redact_pdf_raster().
    """
    if not pdf_paths:
        return []
    if max_workers is None:
        max_workers = min(len(pdf_paths), os.cpu_count() or 4)
    out_dir.mkdir(parents=True, exist_ok=True)

    # A function defined inside this scope cannot be pickled by
    # ProcessPoolExecutor; bind the extra arguments with functools.partial
    # on the module-level process_pdf instead.
    from functools import partial
    worker = partial(process_pdf, out_dir=out_dir, **kwargs)

    with ProcessPoolExecutor(max_workers=max_workers) as pool:
        return list(pool.map(worker, pdf_paths))
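# Minimal usage sketch (hypothetical paths; kwargs are forwarded to process_pdf,
# so NER managers must stay None here):
#   process_pdfs_batch([Path("a.pdf"), Path("b.pdf")], Path("out"),
#                      also_make_raster_burn=True)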


if __name__ == "__main__":
    import argparse
    ap = argparse.ArgumentParser(description="Anonymiser PDF (regex + NER ONNX optionnel)")
    ap.add_argument("pdf", type=str)
    ap.add_argument("--out", type=str, default="out")
    ap.add_argument("--no-vector", action="store_true")
    ap.add_argument("--raster", action="store_true")
    ap.add_argument("--config", type=str, default=str(Path("config/dictionnaires.yml")))
    ap.add_argument("--hf", action="store_true", help="Activer NER ONNX sur narratif (nécessite ner_manager_onnx)")
    ap.add_argument("--model", type=str, default="cmarkea/distilcamembert-base-ner")
    args = ap.parse_args()
    manager = None
    if args.hf and NerModelManager is not None:
        manager = NerModelManager(cache_dir=Path("models"))
        manager.load(args.model)
    outs = process_pdf(
        Path(args.pdf),
        Path(args.out),
        make_vector_redaction=not args.no_vector,
        also_make_raster_burn=args.raster,
        config_path=Path(args.config),
        use_hf=bool(args.hf),
        ner_manager=manager,
        # Guard on `manager`: if the ner_manager_onnx import failed, the
        # NerThresholds name itself is undefined and must not be evaluated.
        ner_thresholds=NerThresholds() if manager is not None else None,
    )
    print(json.dumps(outs, indent=2, ensure_ascii=False))


# ---------------------------------------------------------------------------
# process_document: multi-format wrapper (PDF, DOCX, images, etc.)
# ---------------------------------------------------------------------------

def process_document(
    doc_path: Path,
    out_dir: Path,
    **kwargs,
) -> Dict[str, str]:
    """Anonymise a document in any supported format.

    Converts to PDF when needed, then runs process_pdf().
    Supported formats are defined in format_converter.SUPPORTED_EXTENSIONS.

    Args:
        doc_path: path of the source document (PDF, DOCX, image, etc.)
        out_dir: output directory
        **kwargs: arguments forwarded to process_pdf()

    Returns:
        dict with the paths of the output files
    """
    from format_converter import convert_to_pdf, SUPPORTED_EXTENSIONS

    suffix = doc_path.suffix.lower()
    if suffix not in SUPPORTED_EXTENSIONS:
        raise ValueError(
            f"Format '{suffix}' non supporté. "
            f"Formats acceptés : {', '.join(sorted(SUPPORTED_EXTENSIONS))}"
        )

    pdf_path, is_temp = convert_to_pdf(doc_path)
    try:
        outputs = process_pdf(pdf_path=pdf_path, out_dir=out_dir, **kwargs)

        # Rename the outputs after the original name (not the .tmp_convert.pdf)
        if is_temp:
            original_stem = doc_path.stem
            renamed = {}
            for key, path_str in outputs.items():
                p = Path(path_str)
                if p.exists() and ".tmp_convert" in p.name:
                    new_name = p.name.replace(doc_path.stem + ".tmp_convert", original_stem)
                    new_path = p.parent / new_name
                    p.rename(new_path)
                    renamed[key] = str(new_path)
                else:
                    renamed[key] = path_str
            outputs = renamed

        return outputs
    finally:
        if is_temp and pdf_path.exists():
            pdf_path.unlink()
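# Minimal usage sketch (hypothetical path): converts the DOCX to a temporary
# PDF, anonymises it, then renames the outputs after the original stem:
#   process_document(Path("courrier.docx"), Path("out"), also_make_raster_burn=True)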