diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py
index 801b6be..9837a6e 100644
--- a/anonymizer_core_refactored_onnx.py
+++ b/anonymizer_core_refactored_onnx.py
@@ -322,6 +322,6 @@ _MEDICAL_STOP_WORDS_SET = {
     "depakote", "versatis", "humalog", "forxiga", "durogesic", "montelukast", "rosuvastatine",
     # Abréviations pharma courtes
-    "cpr", "sol", "bic", "agt", "poche", "inhal", "regina",
+    "cpr", "sol", "bic", "agt", "poche", "inhal",
     # Faux positifs EDS supplémentaires
     "psy", "inhales", "inhalés", "kwikpen", "lansoprazole", "tiorfan", "smecta",
     "axa", "ttt", "anionique", "abdomino", "cod", "omi", "urg", "med",
@@ -891,6 +891,10 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit]]:
     for m in re.finditer(r"Nom\s+et\s+Pr[ée]nom\s*:\s*(.+?)(?:\s+Date\s+de\s+naissance|\s*$)", full_text, re.MULTILINE):
         _add_name(m.group(1).strip())
 
+    # Prénom de naissance / Prénom utilisé : REGINA
+    for m in re.finditer(r"Pr[ée]nom\s+(?:de\s+naissance|utilis[ée])\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\s\-']+?)(?:\s*$)", full_text, re.MULTILINE):
+        _add_name(m.group(1).strip())
+
     # Lieu de naissance: BAYONNE → masquer comme VILLE
     for m in re.finditer(r"Lieu\s+de\s+naissance\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû\s\-']+?)(?:\s*$)", full_text, re.MULTILINE):
         val = m.group(1).strip()
@@ -1698,13 +1702,14 @@ def process_pdf(
             _companion_tokens.add(candidate)
     _global_name_tokens.update(_companion_tokens)
 
-    # Retirer les sous-parties de noms composés (JEAN, PIERRE si JEAN-PIERRE existe)
+    # Retirer les sous-parties COURTES de noms composés (JEAN si JEAN-PIERRE existe)
+    # Garder les parties longues (>=5 chars) car le texte peut les scinder sur des lignes séparées
    _compound = {t for t in _global_name_tokens if "-" in t}
     _parts_to_drop = set()
     for comp in _compound:
         for part in comp.split("-"):
             part = part.strip()
-            if len(part) >= 2 and part in _global_name_tokens:
+            if len(part) >= 2 and len(part) < 5 and part in _global_name_tokens:
                 _parts_to_drop.add(part)
     _global_name_tokens -= _parts_to_drop
 
@@ -1755,7 +1760,11 @@ def process_pdf(
             if anon.is_trackare and h.kind == "NOM_GLOBAL" and len(token) <= 3:
                 continue
             try:
-                final_text = re.sub(rf"\b{re.escape(token)}\b", h.placeholder, final_text)
+                pat = re.escape(token)
+                # Noms composés : tolérer les sauts de ligne/espaces autour du tiret
+                if "-" in token:
+                    pat = pat.replace(r"\-", r"\-\s*")
+                final_text = re.sub(rf"\b{pat}\b", h.placeholder, final_text)
             except re.error:
                 final_text = final_text.replace(token, h.placeholder)
 