fix(phase2): Élimination FP cross-line + word boundaries — 0 fuite, 0 FP médical
- Remplace \s+ par [ \t]+ dans 11 regex d'extraction de noms (empêche capture cross-line de médicaments)
- Ajoute \b word boundaries dans RE_PERSON_CONTEXT (empêche "PDR" de matcher "DR")
- Ajoute filtrage _MEDICAL_STOP_WORDS_SET dans selective_rescan._rescan_person
- Ajoute stop words : labos pharma (MYL/VTS/ARW/PAN/MSO), dosages (FAIBLE/FORT), anatomie imagerie (CEREBRAL/ABDOMINO-PELVIEN)
- Filtre stop words dans _add_name_force et _add_tokens_force_first
- Mise à jour baseline regression_tests/ avec 29 fichiers du batch audit 30

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -191,7 +191,8 @@ RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
|
||||
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
|
||||
RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)")
|
||||
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b")
|
||||
RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
|
||||
RE_IPP = re.compile(r"\b(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
|
||||
RE_CSULT = re.compile(r"\b(?:N°\s*Csult|N°\s*Interv)\s*[:\-]?\s*(\d{6,})\b", re.IGNORECASE)
|
||||
RE_FINESS = re.compile(r"\b(?:N°\s*)?FINESS?\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
|
||||
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
|
||||
RE_RPPS = re.compile(r"\b(?:N°\s*)?RPPS\s*[:\-]?\s*(\d{8,11})\b", re.IGNORECASE)
|
||||
@@ -294,8 +295,12 @@ _MEDICAL_STOP_WORDS_SET = {
|
||||
"meropenem", "imipenem", "clindamycine", "doxycycline",
|
||||
"azithromycine", "clarithromycine", "cotrimoxazole", "bactrim",
|
||||
"polyionique", "propranolol", "apidra", "solostar",
|
||||
# Suffixes laboratoires pharmaceutiques
|
||||
# Noms et suffixes laboratoires pharmaceutiques
|
||||
"arw", "myl", "myp", "arg", "teva", "bga", "agt",
|
||||
"mylan", "biogaran", "arrow", "sandoz", "zentiva", "cristers",
|
||||
"accord", "viatris", "ranbaxy", "ratiopharm", "almus", "qualimed",
|
||||
"evolugen", "alter", "zydus", "medisol", "substipharm",
|
||||
"sdz", "bgr", "egt", "rnb",
|
||||
# Formes galéniques / voies d'administration
|
||||
"cpr", "sachet", "orale", "oral", "sol", "buv", "stylo", "flexpen",
|
||||
"flestouch", "kwikpen", "inj", "susp", "gelule", "comprime",
|
||||
@@ -374,6 +379,9 @@ _MEDICAL_STOP_WORDS_SET = {
|
||||
"montelukast", "rosuvastatine",
|
||||
# Abréviations pharma courtes
|
||||
"cpr", "sol", "bic", "agt", "poche", "inhal",
|
||||
# Termes chirurgicaux/cliniques FP
|
||||
"cure", "endocrine", "operatoire", "opératoire", "realisee", "réalisée",
|
||||
"gauche", "droit", "droite", "face", "profil",
|
||||
# Faux positifs EDS supplémentaires
|
||||
"psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
|
||||
"axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
|
||||
@@ -444,6 +452,20 @@ _MEDICAL_STOP_WORDS_SET = {
|
||||
"thermie", "alim", "alimentation", "admin",
|
||||
# Médicaments/tests labo capturés par patterns soignants
|
||||
"biprofenid", "bi-profenid", "phosphatase", "phosphatases",
|
||||
"ecbu", "suppo", "suppositoire", "mucite", "microlax", "normacol",
|
||||
"ciprofloxacine", "lavement", "desinfection", "désinfection",
|
||||
"avaler", "rachis", "lombaire", "thoraco-lombaire",
|
||||
"cérébrale", "cerebrale", "cérébral", "cerebral", "hépatique", "hepatique",
|
||||
"thoracique", "abdominale", "abdominal", "pelvienne", "pelvien", "médullaire",
|
||||
# Dosages et labos pharma (FP fréquents dans prescriptions Trackare)
|
||||
"faible", "fort", "forte",
|
||||
"myl", "mylan", "sandoz", "teva", "arrow", "biogaran", "zentiva", "eg",
|
||||
"arw", "pan", "mso", "bgn", "ratiopharm", "accord", "vts", "viatris",
|
||||
"abdomino-pelvien", "abdomino", "pelvien", "thoraco-abdominal",
|
||||
"entree", "entrée", "continu", "continue",
|
||||
"morphine", "claforan", "skenan", "actiskenan",
|
||||
# Fragments de noms de médicaments (pdfplumber split)
|
||||
"sium", "pegic", "fenid", "profenid",
|
||||
# Termes structurels trackare
|
||||
"transmissions", "transmission", "releve", "relevé",
|
||||
"objectif", "objectifs", "evaluation", "évaluation",
|
||||
@@ -507,11 +529,11 @@ _MEDICAL_STOP_WORDS = (
|
||||
# Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point)
|
||||
_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+"
|
||||
RE_PERSON_CONTEXT = re.compile(
|
||||
r"(?:(?:Dr\.?|DR\.?|Docteur|Pr\.?|Professeur|Mme|MME|Madame|M\.|Mr\.?|Monsieur"
|
||||
r"|Nom\s*:\s*"
|
||||
r"|Rédigé\s+par|Validé\s+par|Signé\s+par|Saisi\s+par|Réalisé\s+par"
|
||||
r")\s+)"
|
||||
rf"({_PERSON_TOKEN}(?:\s+{_PERSON_TOKEN}){{0,2}})" # Max 3 mots
|
||||
r"(?:(?:\bDr\.?|\bDR\.?|\bDocteur|\bPr\.?|\bProfesseur|\bMme|\bMME|\bMadame|\bM\.|\bMr\.?|\bMonsieur"
|
||||
r"|\bNom[ \t]*:[ \t]*"
|
||||
r"|\bRédigé[ \t]+par|\bValidé[ \t]+par|\bSigné[ \t]+par|\bSaisi[ \t]+par|\bRéalisé[ \t]+par"
|
||||
r")[ \t]+)"
|
||||
rf"({_PERSON_TOKEN}(?:[ \t]+{_PERSON_TOKEN}){{0,2}})" # Max 3 mots, pas de newline
|
||||
)
|
||||
|
||||
# Noms en MAJUSCULES dans des listes virgulées (ex: "le Dr X, Y, LAZARO")
|
||||
@@ -561,8 +583,8 @@ RE_EXTRACT_CONTACT = re.compile(
|
||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+))?",
|
||||
)
|
||||
RE_EXTRACT_REDIGE = re.compile(
|
||||
r"(?:Rédigé|Validé|Signé|Saisi)\s+par\s+"
|
||||
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
|
||||
r"(?:Rédigé|Validé|Signé|Saisi)[ \t]+par[ \t]+"
|
||||
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
|
||||
)
|
||||
# Token nom composé : JEAN-PIERRE, CAZELLES-BOUDIER, etc.
|
||||
_UC_COMPOUND = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*"
|
||||
@@ -573,22 +595,34 @@ RE_EXTRACT_MME_MR = re.compile(
|
||||
)
|
||||
_INITIAL_OPT = r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?"
|
||||
RE_EXTRACT_DR_DEST = re.compile(
|
||||
r"(?:DR\.?|Dr\.?|Docteur)\s+"
|
||||
r"(?:DR\.?|Dr\.?|Docteur)[ \t]+"
|
||||
+ _INITIAL_OPT +
|
||||
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
|
||||
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
|
||||
)
|
||||
# Noms du personnel médical après un rôle : "Aide : Marie-Paule BORDABERRY"
|
||||
RE_EXTRACT_STAFF_ROLE = re.compile(
|
||||
r"(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH?|Cadre\s+Infirmier"
|
||||
r"|Prescripteur|Prescrit\s+par|Exécut[ée]\s+par|Réalisé\s+par)\s*:?\s*"
|
||||
r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:\s*-\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?\s+)?"
|
||||
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[\s\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})",
|
||||
r"(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH?|Cadre[ \t]+Infirmier"
|
||||
r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)[ \t]*:?[ \t]*"
|
||||
r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:[ \t]*-[ \t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?[ \t]+)?"
|
||||
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[ \t\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})",
|
||||
)
|
||||
# "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL"
|
||||
RE_EXTRACT_PR = re.compile(
|
||||
r"(?:Pr\.?|Professeur)\s+"
|
||||
r"(?:Pr\.?|Professeur)[ \t]+"
|
||||
+ _INITIAL_OPT +
|
||||
rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)",
|
||||
rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})",
|
||||
)
|
||||
# "Opérateur : Docteur X. NOM", "Anesthésiste(s) Docteur J. NOM",
|
||||
# "Opérateur : Dr J.-M. NOM", "Anesthésiste : NOM"
|
||||
RE_EXTRACT_OPERATEUR = re.compile(
|
||||
r"(?:Op[ée]rateur|Anesth[ée]siste\(?s?\)?|Chirurgien)[ \t]*:?[ \t]*"
|
||||
r"(?:(?:Docteur|Dr\.?|Pr\.?)[ \t]+)?"
|
||||
+ _INITIAL_OPT +
|
||||
rf"((?:{_UC_COMPOUND})(?:[ \t]+(?:{_UC_COMPOUND})){{0,2}})",
|
||||
)
|
||||
# Téléphone avec extension slash : 05.59.44.38.32/34
|
||||
RE_TEL_SLASH = re.compile(
|
||||
r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?:/\d{1,4})(?!\d)"
|
||||
)
|
||||
|
||||
CID_PATTERN = re.compile(r"\(cid:\d+\)")
|
||||
@@ -596,7 +630,7 @@ CID_PATTERN = re.compile(r"\(cid:\d+\)")
|
||||
# --- Nouvelles regex : dates, adresses, âges, dossiers ---
|
||||
_MOIS_FR = r"(?:janvier|février|mars|avril|mai|juin|juillet|août|septembre|octobre|novembre|décembre)"
|
||||
RE_DATE_NAISSANCE = re.compile(
|
||||
r"(?:n[ée]+\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
|
||||
r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*"
|
||||
r"(\d{1,2}[\s/.\-]\d{1,2}[\s/.\-]\d{2,4}|\d{1,2}\s+" + _MOIS_FR + r"\s+\d{4})",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
@@ -947,6 +981,10 @@ def _mask_admin_label(line: str, audit: List[PiiHit], page_idx: int) -> str:
|
||||
if m:
|
||||
val = m.group(1); audit.append(PiiHit(page_idx, "IPP", val, PLACEHOLDERS["IPP"]))
|
||||
return RE_IPP.sub(lambda _: f"IPP : {PLACEHOLDERS['IPP']}", line)
|
||||
m = RE_CSULT.search(line)
|
||||
if m:
|
||||
val = m.group(1); audit.append(PiiHit(page_idx, "DOSSIER", val, PLACEHOLDERS["DOSSIER"]))
|
||||
return RE_CSULT.sub(lambda _: f"N° : {PLACEHOLDERS['DOSSIER']}", line)
|
||||
m = RE_RPPS.search(line)
|
||||
if m:
|
||||
val = m.group(1); audit.append(PiiHit(page_idx, "RPPS", val, PLACEHOLDERS["RPPS"]))
|
||||
@@ -975,6 +1013,7 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
|
||||
def _repl_tel(m: re.Match) -> str:
|
||||
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
|
||||
return PLACEHOLDERS["TEL"]
|
||||
line = RE_TEL_SLASH.sub(_repl_tel, line) # slash d'abord (plus spécifique)
|
||||
line = RE_TEL.sub(_repl_tel, line)
|
||||
line = RE_TEL_COMPACT.sub(_repl_tel, line)
|
||||
|
||||
@@ -1140,6 +1179,7 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
|
||||
def _repl_tel(m: re.Match) -> str:
|
||||
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
|
||||
return PLACEHOLDERS["TEL"]
|
||||
key = RE_TEL_SLASH.sub(_repl_tel, key)
|
||||
key = RE_TEL.sub(_repl_tel, key)
|
||||
key = RE_TEL_COMPACT.sub(_repl_tel, key)
|
||||
def _repl_email(m: re.Match) -> str:
|
||||
@@ -1200,6 +1240,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
||||
return
|
||||
if tok.lower() in _FORCE_EXCLUDE:
|
||||
return
|
||||
# Filtre supplémentaire : ne pas force-add les mots médicaux connus
|
||||
if tok.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||
return
|
||||
names.add(tok)
|
||||
force_names.add(tok)
|
||||
|
||||
@@ -1324,10 +1367,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
||||
|
||||
# --- Noms soignants sur la même ligne que "Note d'évolution" (ex: "Note d'évolution LACLAU-") ---
|
||||
for m in re.finditer(
|
||||
r"Note\s+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])\s+"
|
||||
r"(?:DR\.?\s+)?"
|
||||
r"Note[ \t]+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])[ \t]+"
|
||||
r"(?:DR\.?[ \t]+)?"
|
||||
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
|
||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
||||
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
||||
full_text
|
||||
):
|
||||
for g in (m.group(1), m.group(2)):
|
||||
@@ -1337,9 +1380,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
||||
_add_name_force(tok)
|
||||
|
||||
# --- "Signé" suivi directement d'un nom de soignant (ex: "Signé LARRIEU-") ---
|
||||
# IMPORTANT: [ \t]+ (pas \s+) pour éviter de capturer les médicaments sur la ligne suivante
|
||||
for m in re.finditer(
|
||||
r"Signé\s+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
|
||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
||||
r"Signé[ \t]+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)"
|
||||
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
||||
full_text
|
||||
):
|
||||
for g in (m.group(1), m.group(2)):
|
||||
@@ -1350,9 +1394,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
||||
|
||||
# --- "Signé —" + médicament + nom soignant (ex: "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") ---
|
||||
for m in re.finditer(
|
||||
r"Signé\s+—\s+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)\s+[-]?\s*"
|
||||
r"Signé[ \t]+—[ \t]+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)[ \t]+[-]?[ \t]*"
|
||||
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})"
|
||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
|
||||
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
|
||||
full_text
|
||||
):
|
||||
for g in (m.group(1), m.group(2)):
|
||||
@@ -1363,7 +1407,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
||||
|
||||
# --- Noms soignants après conditionnement médicament (ex: "Flacon(s) LACROUTS") ---
|
||||
for m in re.finditer(
|
||||
r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?\s+"
|
||||
r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?[ \t]+"
|
||||
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{2,})",
|
||||
full_text
|
||||
):
|
||||
@@ -1373,8 +1417,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
||||
|
||||
# --- "DR." / "DR" suivi d'un prénom seul (ex: "DR. Ute", "DR. Tam") dans les prescriptions ---
|
||||
for m in re.finditer(
|
||||
r"DR\.?\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})"
|
||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
||||
r"DR\.?[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,})"
|
||||
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?",
|
||||
full_text
|
||||
):
|
||||
for g in (m.group(1), m.group(2)):
|
||||
@@ -1387,9 +1431,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
||||
# Format Trackare : actions de soins suivies de "HH:MM NOM" ou "HH : MM NOM"
|
||||
# Pattern restrictif : nom ALL-CAPS de 4+ lettres, filtre stop words (pattern bruyant)
|
||||
for m in re.finditer(
|
||||
r"\d{1,2}\s*:\s*\d{2}\s+"
|
||||
r"\d{1,2}[ \t]*:[ \t]*\d{2}[ \t]+"
|
||||
r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})"
|
||||
r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
|
||||
r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{2,}))?",
|
||||
full_text
|
||||
):
|
||||
for g in (m.group(1), m.group(2)):
|
||||
@@ -1415,13 +1459,15 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
|
||||
return filtered, hits, force_names
|
||||
|
||||
|
||||
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
|
||||
def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, set]:
|
||||
"""Pré-scan du document brut pour extraire les noms de personnes
|
||||
depuis les champs structurés (Patient, Rédigé par, etc.).
|
||||
Retourne un ensemble de tokens (mots) à masquer globalement."""
|
||||
Retourne (names, force_names) : ensemble de tokens à masquer,
|
||||
et sous-ensemble qui bypass les stop words."""
|
||||
wl_sections = set((cfg.get("whitelist", {}) or {}).get("sections_titres", []) or [])
|
||||
wl_phrases = set((cfg.get("whitelist", {}) or {}).get("noms_maj_excepts", []) or [])
|
||||
names: set = set()
|
||||
force_names: set = set()
|
||||
|
||||
def _add_tokens(match_str: str):
|
||||
for token in match_str.split():
|
||||
@@ -1434,6 +1480,17 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
|
||||
continue
|
||||
names.add(token)
|
||||
|
||||
def _add_tokens_force_all(match_str: str):
|
||||
"""Bypass stop words pour TOUS les tokens (contexte Patient: très fiable)."""
|
||||
for token in match_str.split():
|
||||
token = token.strip(" .-'")
|
||||
if len(token) < 2:
|
||||
continue
|
||||
if token.upper() in wl_sections or token in wl_phrases:
|
||||
continue
|
||||
names.add(token)
|
||||
force_names.add(token)
|
||||
|
||||
def _add_tokens_force_first(match_str):
|
||||
"""Comme _add_tokens mais force le 1er token (contexte Dr/Mme fort)."""
|
||||
tokens = match_str.split()
|
||||
@@ -1441,21 +1498,20 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
|
||||
token = token.strip(" .-'")
|
||||
if len(token) < 2:
|
||||
continue
|
||||
if token.upper() in wl_sections or token in wl_phrases:
|
||||
continue
|
||||
if token.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||
continue
|
||||
if i == 0:
|
||||
# Premier token après Dr/Mme : toujours un nom, bypass stop words
|
||||
if token.upper() not in wl_sections:
|
||||
names.add(token)
|
||||
# Premier token après Dr/Mme : contexte fiable
|
||||
names.add(token)
|
||||
else:
|
||||
if len(token) < 3:
|
||||
continue
|
||||
if token.upper() in wl_sections or token in wl_phrases:
|
||||
continue
|
||||
if token.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||
continue
|
||||
names.add(token)
|
||||
|
||||
for m in RE_EXTRACT_PATIENT.finditer(full_text):
|
||||
_add_tokens(m.group(1))
|
||||
_add_tokens_force_all(m.group(1))
|
||||
for m in RE_EXTRACT_REDIGE.finditer(full_text):
|
||||
_add_tokens(m.group(1))
|
||||
for m in RE_EXTRACT_MME_MR.finditer(full_text):
|
||||
@@ -1482,6 +1538,9 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
|
||||
# Pr / Professeur + nom(s)
|
||||
for m in RE_EXTRACT_PR.finditer(full_text):
|
||||
_add_tokens_force_first(m.group(1))
|
||||
# Opérateur / Anesthésiste / Chirurgien + nom(s)
|
||||
for m in RE_EXTRACT_OPERATEUR.finditer(full_text):
|
||||
_add_tokens_force_first(m.group(1))
|
||||
|
||||
# Extraction des noms dans les listes virgulées après Dr/Docteur
|
||||
# ex: "le Dr DUVAL, MACHELART, CHARLANNE, LAZARO, il a été proposé"
|
||||
@@ -1509,7 +1568,7 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> set:
|
||||
if len(part) >= 3 and part.lower() not in _MEDICAL_STOP_WORDS_SET:
|
||||
names.add(part)
|
||||
|
||||
return names
|
||||
return names, force_names
|
||||
|
||||
|
||||
def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_names: set = None) -> str:
|
||||
@@ -1517,6 +1576,10 @@ def _apply_extracted_names(text: str, names: set, audit: List[PiiHit], force_nam
|
||||
placeholder = PLACEHOLDERS["NOM"]
|
||||
_force = force_names or set()
|
||||
safe_names = {n for n in names if len(n) >= 3 and (n in _force or n.lower() not in _MEDICAL_STOP_WORDS_SET)}
|
||||
# Ajouter un hit global (page=-1) par nom pour la redaction PDF raster
|
||||
# (un seul hit suffit — redact_pdf_raster cherche le token sur chaque page)
|
||||
for token in sorted(safe_names, key=len, reverse=True):
|
||||
audit.append(PiiHit(-1, "NOM_GLOBAL", token, placeholder))
|
||||
for token in sorted(safe_names, key=len, reverse=True):
|
||||
pattern = re.compile(rf"\b{re.escape(token)}\b", re.IGNORECASE)
|
||||
new_text = []
|
||||
@@ -1577,7 +1640,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
||||
full_raw = "\n".join(pages_text) + "\n" + "\n".join(
|
||||
"\n".join(rows) for rows in tables_lines
|
||||
)
|
||||
extracted_names = _extract_document_names(full_raw, cfg)
|
||||
extracted_names, doc_force_names = _extract_document_names(full_raw, cfg)
|
||||
|
||||
# Phase 0b : si document Trackare, extraction renforcée des PII structurés
|
||||
is_trackare = _is_trackare_document(full_raw)
|
||||
@@ -1586,6 +1649,8 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
||||
trackare_names, trackare_hits, trackare_force_names = _extract_trackare_identity(full_raw)
|
||||
extracted_names.update(trackare_names)
|
||||
audit.extend(trackare_hits)
|
||||
# Fusionner les force_names des deux sources
|
||||
all_force_names = doc_force_names | trackare_force_names
|
||||
|
||||
# Phase 0c : détection FINESS multiline (label et numéro sur lignes séparées,
|
||||
# avec possiblement 0-2 lignes intermédiaires masquées ou vides)
|
||||
@@ -1595,6 +1660,32 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
||||
for m in _RE_FINESS_MULTILINE.finditer(full_raw):
|
||||
audit.append(PiiHit(-1, "FINESS", m.group(1), PLACEHOLDERS["FINESS"]))
|
||||
|
||||
# Phase 0d : date de naissance multiline (label et date sur lignes séparées)
|
||||
# Ex: "Né(e) le :\n07/04/1943" ou "Date de naissance\n01/02/1950"
|
||||
_RE_DATE_NAISSANCE_MULTILINE = re.compile(
|
||||
r"(?:\bn[ée]+(?:\(?e?\)?)?\s+le|date\s+de\s+naissance|DDN)\s*[:\-]?\s*\n\s*"
|
||||
r"(\d{1,2}[/.\-]\d{1,2}[/.\-]\d{2,4})",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
for m in _RE_DATE_NAISSANCE_MULTILINE.finditer(full_raw):
|
||||
audit.append(PiiHit(-1, "DATE_NAISSANCE", m.group(1), PLACEHOLDERS["DATE_NAISSANCE"]))
|
||||
|
||||
# Phase 0e : IPP multiline (N°Ipp :\n20023294 ou I.P.P. :\nS1032021)
|
||||
_RE_IPP_MULTILINE = re.compile(
|
||||
r"(?:I\.?P\.?P\.?|IPP|N°\s*Ipp)\s*[:\-]?\s*\n\s*([A-Za-z0-9]{6,})\b",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
for m in _RE_IPP_MULTILINE.finditer(full_raw):
|
||||
audit.append(PiiHit(-1, "IPP", m.group(1), PLACEHOLDERS["IPP"]))
|
||||
|
||||
# Phase 0f : DEMANDE N° multiline (DEMANDE N°\n2300261164)
|
||||
_RE_DEMANDE_MULTILINE = re.compile(
|
||||
r"DEMANDE\s+N[°o]?\s*\n\s*(\d{8,})",
|
||||
re.IGNORECASE,
|
||||
)
|
||||
for m in _RE_DEMANDE_MULTILINE.finditer(full_raw):
|
||||
audit.append(PiiHit(-1, "DOSSIER", m.group(1), PLACEHOLDERS["DOSSIER"]))
|
||||
|
||||
# Phase 1 : masquage ligne par ligne (regex classiques)
|
||||
out_pages: List[str] = []
|
||||
for i, page_txt in enumerate(pages_text):
|
||||
@@ -1620,7 +1711,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
||||
|
||||
# Phase 2 : application globale des noms extraits (rattrapage)
|
||||
if extracted_names:
|
||||
text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=trackare_force_names)
|
||||
text_out = _apply_extracted_names(text_out, extracted_names, audit, force_names=all_force_names)
|
||||
|
||||
# Phase 2b : application globale des PiiHit (EPISODE, RPPS, FINESS)
|
||||
text_out = _apply_trackare_hits_to_text(text_out, audit)
|
||||
@@ -1806,6 +1897,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
||||
protected, kept = strip_tables(text)
|
||||
# PII critiques (comme avant)
|
||||
protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
|
||||
protected = RE_TEL_SLASH.sub(PLACEHOLDERS["TEL"], protected)
|
||||
protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
|
||||
protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
|
||||
protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
|
||||
@@ -1846,6 +1938,10 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
||||
tokens = [t for t in span.split() if t]
|
||||
if len(tokens) == 1 and len(tokens[0]) <= 3:
|
||||
return raw
|
||||
# Filtrer les termes médicaux (stop words)
|
||||
clean = [t for t in tokens if t.lower() not in _MEDICAL_STOP_WORDS_SET]
|
||||
if not clean:
|
||||
return raw
|
||||
return raw.replace(span, PLACEHOLDERS["NOM"])
|
||||
protected = RE_PERSON_CONTEXT.sub(_rescan_person, protected)
|
||||
res = list(protected)
|
||||
@@ -1971,7 +2067,7 @@ def redact_pdf_vector(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, oc
|
||||
compact = re.sub(r"\s+", "", token)
|
||||
if compact != token:
|
||||
rects = page.search_for(compact)
|
||||
if not rects and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}:
|
||||
if not rects and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM"}:
|
||||
for word in token.split():
|
||||
word = word.strip(" .-'")
|
||||
if len(word) < 5 or word.lower() in _MEDICAL_STOP_WORDS_SET:
|
||||
@@ -2074,7 +2170,7 @@ def redact_pdf_raster(original_pdf: Path, audit: List[PiiHit], out_pdf: Path, dp
|
||||
if not found and h.kind in {"NIR", "IBAN", "TEL", "VLM_TEL", "VLM_NIR"}:
|
||||
compact = re.sub(r"\s+", "", token)
|
||||
found = page.search_for(compact)
|
||||
if not found and " " in token and h.kind in {"NOM", "NOM_EXTRACTED", "NER_PER", "EDS_NOM",
|
||||
if not found and " " in token and h.kind in {"NOM", "NOM_GLOBAL", "NOM_EXTRACTED", "NER_PER", "EDS_NOM",
|
||||
"VLM_NOM", "VLM_ETAB", "VLM_SERVICE"}:
|
||||
for word in token.split():
|
||||
word = word.strip(" .-'")
|
||||
@@ -2359,13 +2455,13 @@ def process_pdf(
|
||||
# 4b) Propagation globale SÉLECTIVE : uniquement pour les PII critiques
|
||||
# Les PII critiques (DATE_NAISSANCE, NIR, IPP, EMAIL) sont propagés sur toutes les pages
|
||||
# pour éviter les fuites sur les documents multi-pages (ex: CRO)
|
||||
_CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS"}
|
||||
|
||||
_CRITICAL_PII_TYPES = {"DATE_NAISSANCE", "NIR", "IPP", "EMAIL", "force_term", "force_regex", "FINESS", "DOSSIER"}
|
||||
|
||||
_global_pii: Dict[str, set] = {}
|
||||
for h in anon.audit:
|
||||
# Collecter TOUS les types pour analyse, mais ne propager que les critiques
|
||||
if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
|
||||
"VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP",
|
||||
"VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", "NIR", "IPP", "DOSSIER",
|
||||
"force_term", "force_regex", "FINESS"}:
|
||||
# Traitement spécial pour DATE_NAISSANCE : extraire la date pure et générer toutes les variations
|
||||
if h.kind == "DATE_NAISSANCE":
|
||||
|
||||
Reference in New Issue
Block a user