diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index be4f5f3..2cfac5d 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -376,6 +376,7 @@ PLACEHOLDERS = { "NDA": "[NDA]", "EPISODE": "[EPISODE]", "RPPS": "[RPPS]", + "ADHERENT": "[ADHERENT]", } CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"} @@ -391,6 +392,10 @@ RE_CSULT = re.compile(r"\b(?:N°\s*Csult|N°\s*Interv)\s*[:\-]?\s*(\d{6,})\b", r RE_FINESS = re.compile(r"\b(?:N°\s*)?FINESS?\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE) RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE) RE_RPPS = re.compile(r"\b(?:N°\s*)?RPPS\s*[:\-]?\s*(\d{8,11})\b", re.IGNORECASE) +RE_NUM_ADHERENT = re.compile( + r"\b(?:n[°o]?\s*|num[ée]ro\s+(?:d['’]\s*)?)adh[ée]rent[e]?\s*[:\-]?\s*([A-Z0-9]{6,15})\b", + re.IGNORECASE, +) RE_NIR = re.compile( r"\b([12])\s*(\d{2})\s*(0[1-9]|1[0-2]|2[AB])\s*(\d{2,3})\s*(\d{3})\s*(\d{3})\s*(\d{2})\b", re.IGNORECASE, @@ -457,7 +462,7 @@ def _refresh_medical_stopwords_pattern() -> None: _refresh_medical_stopwords_pattern() # Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point) -_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+" +_PERSON_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\']+" RE_PERSON_CONTEXT = re.compile( r"(?:(?:\bDr\.?|\bDR\.?|\bDocteur|\bPr\.?|\bProfesseur|\bMme|\bMME|\bMadame|\bM\.|\bMr\.?|\bMonsieur" r"|\bNom[ \t]*:[ \t]*" @@ -469,16 +474,16 @@ RE_PERSON_CONTEXT = re.compile( # Noms en MAJUSCULES dans des listes virgulées (ex: "le Dr X, Y, LAZARO") RE_DR_COMMA_LIST = re.compile( r"(?:Dr\.?|DR\.?|Docteur)\s+" - r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+" - r"(?:\s*,\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' .]+)+", + r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\' .]+" + r"(?:\s*,\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\' .]+)+", re.IGNORECASE, ) # Token nom : mot commençant par une majuscule d'au moins 3 lettres -_NAME_TOKEN_RE = re.compile(r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']{2,}") +_NAME_TOKEN_RE = re.compile(r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\']{2,}") SPLITTER = re.compile(r"\s*[:|;\t]\s*") # --- Extraction globale de noms depuis champs structurés --- -_UC_NAME_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+" +_UC_NAME_TOKEN = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\']+" RE_EXTRACT_PATIENT = re.compile( r"Patient\(?e?\)?\s*:\s*" rf"((?:{_UC_NAME_TOKEN})(?:\s+(?:{_UC_NAME_TOKEN}))*)" @@ -488,53 +493,53 @@ RE_EXTRACT_PATIENT = re.compile( # Champs d'identité structurés (documents trackare / DPI) RE_EXTRACT_NOM_NAISSANCE = re.compile( r"Nom\s+de\s+naissance\s*:\s*" - r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s+IPP|\s*$)", + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\' ]+?)(?:\s+IPP|\s*$)", re.MULTILINE, ) RE_EXTRACT_NOM_PRENOM = re.compile( r"Nom\s+et\s+Pr[ée]nom\s*:\s*" - r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s+Date|\s+Né|\s*$)", + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\' ]+?)(?:\s+Date|\s+Né|\s*$)", re.MULTILINE, ) RE_EXTRACT_NOM_UTILISE = re.compile( r"Nom\s+utilis[ée]\s*:\s*" - r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)", + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\' ]+?)(?:\s*$)", re.MULTILINE, ) RE_EXTRACT_PRENOM = re.compile( r"Pr[ée]nom\s+(?:de\s+naissance|utilis[ée])\s*:\s*" - r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)", + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\' ]+?)(?:\s*$)", re.MULTILINE, ) RE_EXTRACT_LIEU_NAISSANCE = re.compile( r"Lieu\s+de\s+naissance\s*:\s*" - r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)", + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\' ]+?)(?:\s*$)", re.MULTILINE, ) RE_EXTRACT_VILLE_RESIDENCE = re.compile( r"Ville\s+de\s+r[ée]sidence\s*:\s*" - r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+?)(?:\s*$)", + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\' ]+?)(?:\s*$)", re.MULTILINE, ) # Contacts structurés : Conjoint/Concubin/Epoux/Epouse/Parent + NOM PRENOM RE_EXTRACT_CONTACT = re.compile( r"(?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur)\s+" - r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+)" - r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']+))?", + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\']+)" + r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\']+))?", ) RE_EXTRACT_REDIGE = re.compile( r"(?:Rédigé|Validé|Signé|Saisi)[ \t]+par[ \t]+" rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,2}})", ) # Token nom composé : JEAN-PIERRE, CAZELLES-BOUDIER, etc. -_UC_COMPOUND = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,})*" +_UC_COMPOUND = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]{2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]{2,})*" RE_EXTRACT_MME_MR = re.compile( r"(?:MMES|MME|Mmes|Mme|Madame|Mesdames|Monsieur|Messieurs|Mrs|Mr\.?)\s+" - r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?" + r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]\.\s*)?)?" rf"((?:{_UC_NAME_TOKEN})(?:[ \t]+(?:{_UC_NAME_TOKEN})){{0,4}})", ) # Listes virgulées après civilité : "Mmes Anorga, Goyenaga, Martinez et Murcy" -_CNAME = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\']" +_CNAME = r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\']" RE_CIVILITE_COMMA_LIST = re.compile( r"(?:Mmes|Mme|Mesdames|Mrs|Mr|Messieurs|Monsieur|Madame|Dr\.?|Docteur)\s+" + _CNAME + r"+" @@ -542,7 +547,7 @@ RE_CIVILITE_COMMA_LIST = re.compile( + r"(?:\s*,?\s*\bet\s+" + _CNAME + r"+)?", re.IGNORECASE, ) -_INITIAL_OPT = r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\.\s*)?)?" +_INITIAL_OPT = r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]\.\s*(?:-?\s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]\.\s*)?)?" RE_EXTRACT_DR_DEST = re.compile( r"\b(?:DR\.?|Dr\.?|Docteur)[ \t]+" + _INITIAL_OPT + @@ -552,8 +557,8 @@ RE_EXTRACT_DR_DEST = re.compile( RE_EXTRACT_STAFF_ROLE = re.compile( r"\b(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH|Cadre[ \t]+Infirmier" r"|Prescripteur|Prescrit[ \t]+par|Exécut[ée][ \t]+par|Réalisé[ \t]+par)\b[ \t]*:?[ \t]*" - r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:[ \t]*-[ \t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?[ \t]+)?" - r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}[\-]?)(?:[ \t\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{2,}){0,2})", + r"((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][a-zéèàùâêîôûäëïöüçñ]+(?:[ \t]*-[ \t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][a-zéèàùâêîôûäëïöüçñ]+)?[ \t]+)?" + r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]{2,}[\-]?)(?:[ \t\-]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]{2,}){0,2})", ) # "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL" RE_EXTRACT_PR = re.compile( @@ -589,7 +594,7 @@ CID_PATTERN = re.compile(r"\(cid:\d+\)") # --- Mr/Mme + initiale isolée (ex: "Mme Z", "Mr R") --- RE_CIVILITE_INITIALE = re.compile( - r"\b((?:Mme|MME|Madame|Monsieur|Mr\.?|M\.)\s+)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])(?=[\s,.\-\)\]:;!?]|$)" + r"\b((?:Mme|MME|Madame|Monsieur|Mr\.?|M\.)\s+)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ])(?=[\s,.\-\)\]:;!?]|$)" ) # --- N° examen / N° patient imagerie (radiologie) --- @@ -611,7 +616,7 @@ RE_NUM_ACCESSION_HEADER = re.compile( # --- Adresses lieu-dit / maison basque / lotissement --- RE_ADRESSE_LIEU_DIT = re.compile( r"\b(?:MAISON|LOT|LOTISSEMENT|RESIDENCE|RÉSIDENCE|MAS|LIEU[\s\-]DIT|DOMAINE|HAMEAU|QUARTIER)\s+" - r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']{2,}" + r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-']{2,}" r"(?:\s+\d{1,4})?", re.IGNORECASE, ) @@ -638,7 +643,7 @@ RE_ADRESSE = re.compile( r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*" r"(?:rue|avenue|av\.?|boulevard|bd\.?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence" r"|lotissement|lot\.?|cit[ée]|hameau|quartier|voie|parvis|esplanade|promenade|côte)" - r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}", + r"\s+[A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñäëïöüçñ\s\-']{2,}", re.IGNORECASE, ) RE_CODE_POSTAL = re.compile( @@ -646,12 +651,12 @@ RE_CODE_POSTAL = re.compile( r"|" # 5 chiffres + nom de ville (Title Case ou MAJUSCULES), pas précédé d'un chiffre (évite RPPS) # Exclure les unités médicales (UI, mg, ml, etc.) via negative lookahead - r"(?:(? str: + raw = m.group(0) + if not validate_nir(raw): + return raw # faux positif, on ne masque pas + audit.append(PiiHit(page_idx, "NIR", raw, PLACEHOLDERS["NIR"])) + return PLACEHOLDERS["NIR"] + line = RE_NIR.sub(_repl_nir, line) + # TEL def _repl_tel(m: re.Match) -> str: audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"])) @@ -1260,15 +1277,6 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict return PLACEHOLDERS["IBAN"] line = RE_IBAN.sub(_repl_iban, line) - # NIR (avec validation clé modulo 97) - def _repl_nir(m: re.Match) -> str: - raw = m.group(0) - if not validate_nir(raw): - return raw # faux positif, on ne masque pas - audit.append(PiiHit(page_idx, "NIR", raw, PLACEHOLDERS["NIR"])) - return PLACEHOLDERS["NIR"] - line = RE_NIR.sub(_repl_nir, line) - # DATE_NAISSANCE (plus spécifique, avant DATE générique) def _repl_date_naissance(m: re.Match) -> str: audit.append(PiiHit(page_idx, "DATE_NAISSANCE", m.group(0), PLACEHOLDERS["DATE_NAISSANCE"])) @@ -1346,6 +1354,14 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict return full[:full.find(val)] + PLACEHOLDERS["NDA"] line = RE_VENUE_SEJOUR.sub(_repl_venue, line) + # N° adhérent (mutuelle / complémentaire santé) + def _repl_adherent(m: re.Match) -> str: + val = m.group(1) + audit.append(PiiHit(page_idx, "ADHERENT", val, PLACEHOLDERS["ADHERENT"])) + full = m.group(0) + return full[:full.find(val)] + PLACEHOLDERS["ADHERENT"] + line = RE_NUM_ADHERENT.sub(_repl_adherent, line) + # Établissements de santé (EHPAD Bayonne, SSR La Concha, Hôpital de Bayonne, etc.) def _repl_etab(m: re.Match) -> str: audit.append(PiiHit(page_idx, "ETAB", m.group(0), PLACEHOLDERS["ETAB"])) @@ -1369,11 +1385,11 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict # Stratégie : si un segment contient un mot-clé d'établissement, masquer TOUTE la ligne # espacée (tous les segments contigus) pour éviter de laisser "D E L A C ÔT E B A S Q U E" _RE_SPACED_TEXT = re.compile( - r'(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\s){4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]' + r'(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]\s){4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]' ) # Pattern plus large : toute la chaîne espacée (lettres séparées par espaces + mots courts) _RE_SPACED_FULL_LINE = re.compile( - r'(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ](?:\s|ÔT|ÉE)){3,}[\sA-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]*' + r'(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ](?:\s|ÔT|ÉE)){3,}[\sA-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]*' ) _SPACED_ETAB_KEYWORDS = { "HOSPITALIER", "HOSPITALIERE", "HOSPITALIERES", "HOSPITALIERS", @@ -1428,9 +1444,9 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict # Autorise les mots de liaison minuscules (de, du, la, sur, en, lès) _re_ville_date = re.compile( r"^(\s*)" - r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç\-]+" + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][a-zéèàùâêîôûäëïöüçñ\-]+" r"(?:\s+(?:de|du|la|sur|en|lès|les|l['']\s*)?" - r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)*)" + r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)*)" r"(\s*,\s+le\s+\d{1,2})", re.MULTILINE, ) @@ -1447,7 +1463,7 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict return m.group(1) + PLACEHOLDERS["VILLE"] line = _re_lieu.sub(_repl_lieu, line) - _re_ville_res = re.compile(r"(Ville\s+de\s+r[ée]sidence\s*:\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+)") + _re_ville_res = re.compile(r"(Ville\s+de\s+r[ée]sidence\s*:\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-\' ]+)") def _repl_ville_res(m: re.Match) -> str: audit.append(PiiHit(page_idx, "VILLE", m.group(2).strip(), PLACEHOLDERS["VILLE"])) return m.group(1) + PLACEHOLDERS["VILLE"] @@ -1576,10 +1592,15 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str: audit.append(PiiHit(page_idx, "NDA", m.group(1), PLACEHOLDERS["NDA"])) return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["NDA"]) + def _repl_adherent(m: re.Match) -> str: + audit.append(PiiHit(page_idx, "ADHERENT", m.group(1), PLACEHOLDERS["ADHERENT"])) + return _replace_captured_value(m.group(0), m.group(1), PLACEHOLDERS["ADHERENT"]) + masked = RE_CODE_POSTAL.sub(_repl_code_postal, line) masked = RE_NUM_EXAMEN_PATIENT.sub(_repl_num_examen, masked) masked = RE_NUMERO_DOSSIER.sub(_repl_dossier, masked) masked = RE_VENUE_SEJOUR.sub(_repl_venue, masked) + masked = RE_NUM_ADHERENT.sub(_repl_adherent, masked) return masked @@ -1681,7 +1702,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, _add_name(m.group(1).strip(), "trackare_nom_prenom", "high") # Prénom de naissance / Prénom utilisé : REGINA - for m in re.finditer(r"Pr[ée]nom\s+(?:de\s+naissance|utilis[ée])\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\s\-']+?)(?:\s*$)", full_text, re.MULTILINE): + for m in re.finditer(r"Pr[ée]nom\s+(?:de\s+naissance|utilis[ée])\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\s\-']+?)(?:\s*$)", full_text, re.MULTILINE): _add_name(m.group(1).strip(), "trackare_prenom", "high") # Lieu de naissance: BAYONNE, biarritz, 64102, 99999 → masquer comme VILLE @@ -1690,11 +1711,11 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, if val: hits.append(PiiHit(-1, "VILLE", val, PLACEHOLDERS["VILLE"])) # Ajouter au set names seulement si alphabétique (pas les codes INSEE numériques) - if re.match(r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç]", val): + if re.match(r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ]", val): names.add(val) # Ville de résidence: TARNOS → masquer comme VILLE - for m in re.finditer(r"Ville\s+de\s+r[ée]sidence\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû\s\-']+?)(?:\s*$)", full_text, re.MULTILINE): + for m in re.finditer(r"Ville\s+de\s+r[ée]sidence\s*:\s*([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñ\s\-']+?)(?:\s*$)", full_text, re.MULTILINE): val = m.group(1).strip() hits.append(PiiHit(-1, "VILLE", val, PLACEHOLDERS["VILLE"])) names.add(val) @@ -1736,9 +1757,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, # Inclut "Personne à prévenir" + relations + Ami/Voisin/Autre for m in re.finditer( r"(?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur|Ami|Amie|Voisin|Voisine|Autre)\s+" - r"([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+)" - r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?" - r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?", + r"([A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñä\-']+)" + r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñä\-']+))?" + r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñä\-']+))?", full_text, ): contact_parts = [g.strip(" .-'(),") for g in (m.group(1), m.group(2), m.group(3)) if g] @@ -1763,9 +1784,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, for m in re.finditer( r"[Pp]ersonne\s+[àa]\s+pr[ée]venir\s*[:\-]?\s*\n" r"(?:[^\n]{0,30}\n){0,2}" # 0-2 lignes intermédiaires (relation, etc.) - r"\s*([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+)" - r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?" - r"(?:\s*\n\s*([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä\-']+))?", + r"\s*([A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñä\-']+)" + r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñä\-']+))?" + r"(?:\s*\n\s*([A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñä\-']+))?", full_text, ): for g in (m.group(1), m.group(2), m.group(3)): @@ -1780,8 +1801,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, for m in re.finditer( r"(?:Prescripteur|Prescrit\s+par|Exécut[ée]\s+par|Réalisé\s+par)\s*:?\s*" r"(?:(?:Dr|Pr)\.?\s+)?" - r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)" - r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+))?", + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-']+)" + r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-']+))?", full_text, ): _add_name(m.group(1), "trackare_prescripteur", "medium") @@ -1789,12 +1810,12 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, _add_name(m.group(2), "trackare_prescripteur", "medium") # --- Médecins urgences (IAO, prise en charge, décision) (medium context) --- - for m in re.finditer(r"IAO\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)", full_text): + for m in re.finditer(r"IAO\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)", full_text): _add_name(m.group(1), "trackare_iao", "medium") for m in re.finditer( r"Médecin\s+de\s+la\s+(?:prise\s+en\s+charge|décision)\s+médicale\s+" - r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+)" - r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]+))?", + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+)" + r"(?:\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]+))?", full_text, ): _add_name(m.group(1), "trackare_medecin_urgences", "medium") @@ -1805,8 +1826,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, # Pattern: "Note IDE\nPrenom NOM" ou "Note d'évolution\nPrenom NOM" for m in re.finditer( r"Note\s+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])\s*\n\s*" - r"([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][a-zéèàùâêîôûäëïöüç]+)\s+" - r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)", + r"([A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñ][a-zéèàùâêîôûäëïöüçñ]+)\s+" + r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñäëïöüçñ\-]+)", full_text ): prenom, nom = m.group(1), m.group(2) @@ -1819,7 +1840,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, # --- Noms soignants multi-lignes : "Prénom\nNOM" dans les tableaux de prescriptions/soins (low context) --- for m in re.finditer( - r'\b([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,})\s*\n\s*([A-ZÉÈÀÙÂÊÎÔÛ]{4,})\b', + r'\b([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüçñ]{3,})\s*\n\s*([A-ZÉÈÀÙÂÊÎÔÛ]{4,})\b', full_text ): prenom, nom = m.group(1), m.group(2) @@ -1833,8 +1854,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, for m in re.finditer( r"Note[ \t]+(?:IDE|AS|d'[ée]volution|m[ée]dicale|kin[ée])[ \t]+" r"(?:DR\.?[ \t]+)?" - r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)" - r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", + r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñäëïöüçñ\-]+)" + r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñäëïöüçñ\-]+))?", full_text ): for g in (m.group(1), m.group(2)): @@ -1846,8 +1867,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, # --- "Signé" suivi directement d'un nom de soignant (ex: "Signé LARRIEU-") (low context) --- # IMPORTANT: [ \t]+ (pas \s+) pour éviter de capturer les médicaments sur la ligne suivante for m in re.finditer( - r"Signé[ \t]+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+)" - r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", + r"Signé[ \t]+(?!—|par\b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñäëïöüçñ\-]+)" + r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñäëïöüçñ\-]+))?", full_text ): for g in (m.group(1), m.group(2)): @@ -1859,8 +1880,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, # --- "Signé —" + médicament + nom soignant (ex: "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") (low context) --- for m in re.finditer( r"Signé[ \t]+—[ \t]+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)[ \t]+[-]?[ \t]*" - r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{3,})" - r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,}))?", + r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñäëïöüçñ\-]{3,})" + r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüçñ]{3,}))?", full_text ): for g in (m.group(1), m.group(2)): @@ -1873,7 +1894,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, # --- Noms soignants après conditionnement médicament (ex: "Flacon(s) LACROUTS") (low context) --- for m in re.finditer( r"(?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?:\(s\))?[ \t]+" - r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]{3,})", + r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñäëïöüçñ\-]{3,})", full_text ): tok = m.group(1).rstrip('-') @@ -1883,8 +1904,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, # --- "DR." / "DR" suivi d'un prénom seul (ex: "DR. Ute", "DR. Tam") dans les prescriptions (medium context) --- for m in re.finditer( - r"DR\.?[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,})" - r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\-]+))?", + r"DR\.?[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüçñ]{3,})" + r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑa-zéèàùâêîôûñäëïöüçñ\-]+))?", full_text ): for g in (m.group(1), m.group(2)): @@ -1899,7 +1920,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set, for m in re.finditer( r"\d{1,2}[ \t]*:[ \t]*\d{2}[ \t]+" r"([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ\-]{3,})" - r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç]{3,}))?", + r"(?:[ \t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüçñ]{3,}))?", full_text ): for g in (m.group(1), m.group(2)): @@ -2107,7 +2128,7 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, s # Dr X.NOM / Pr X.NOM : initiale collée au nom (ex: "Dr E.ELLIE", "Pr J.DUPONT") _RE_DR_INITIAL_DOT_NAME = re.compile( r"\b(?:Dr\.?|Docteur|Pr\.?|Professeur)[ \t]+" - r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])\.([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-]{2,})" + r"([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ])\.([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\-]{2,})" ) for m in _RE_DR_INITIAL_DOT_NAME.finditer(full_text): names.add(m.group(2)) # Le nom (ELLIE) @@ -2135,8 +2156,8 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, s # INSEE et un nom de famille INSEE. Les tokens proposés viennent # exclusivement des dictionnaires INSEE, sans blacklist codée en dur ici. _UPPER_NAME_LINE_RE = re.compile( - r"^[ \t]*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ\-' ]+" - r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])[ \t]*$", + r"^[ \t]*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ\-' ]+" + r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ])[ \t]*$", re.MULTILINE, ) for m in _UPPER_NAME_LINE_RE.finditer(full_text): @@ -3198,7 +3219,7 @@ def _mask_finess_addresses(text: str, return_matched_names: bool = False): suffix = text[end:min(len(text), end + 60)] # BP/CS + numéro + éventuel code postal + ville bp_match = re.match( - r'(\s*(?:BP|CS)\s*\d+\s*[,.]?\s*(?:\d{5}\s*)?(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\s\-]+(?:CEDEX)?)?)', + r'(\s*(?:BP|CS)\s*\d+\s*[,.]?\s*(?:\d{5}\s*)?(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑa-zéèàùâêîôûäëïöüçñ\s\-]+(?:CEDEX)?)?)', suffix, re.IGNORECASE) if bp_match: ext_end = end + len(bp_match.group(1).rstrip()) @@ -3533,14 +3554,15 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str: ) protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected) protected = RE_URL.sub(PLACEHOLDERS["MASK"], protected) + # NIR d'abord (validation modulo 97), pour éviter qu'un NIR au format + # espacé soit consommé par RE_TEL. + def _rescan_nir(m: re.Match) -> str: + return PLACEHOLDERS["NIR"] if validate_nir(m.group(0)) else m.group(0) + protected = RE_NIR.sub(_rescan_nir, protected) protected = RE_TEL_SLASH.sub(PLACEHOLDERS["TEL"], protected) protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected) protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected) protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected) - # NIR avec validation - def _rescan_nir(m: re.Match) -> str: - return PLACEHOLDERS["NIR"] if validate_nir(m.group(0)) else m.group(0) - protected = RE_NIR.sub(_rescan_nir, protected) # Nouvelles regex : dates de naissance, dates, adresses, codes postaux protected = RE_DATE_NAISSANCE.sub(PLACEHOLDERS["DATE_NAISSANCE"], protected) # protected = RE_DATE.sub(PLACEHOLDERS["DATE"], protected) # désactivé @@ -3575,7 +3597,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str: # Adresses (gazetteer Aho-Corasick FINESS — 28K noms de voie) protected = _mask_finess_addresses(protected) # Texte espacé d'en-tête : "C E N T R E H O S P I T A L I E R" → [ETABLISSEMENT] - _re_spaced = re.compile(r'(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]\s){4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]') + _re_spaced = re.compile(r'(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]\s){4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]') _spaced_kw = {"HOSPITALIER", "HOSPITALIERE", "HOSPITALIERES", "HOSPITALIERS", "CLINIQUE", "HOPITAL", "HÔPITAL", "POLYCLINIQUE", "CENTRE", "ETABLISSEMENT", "MAISON", "RESIDENCE", @@ -3620,7 +3642,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str: lambda m: m.group(1) + PLACEHOLDERS["NOM"], protected ) # Initiales identifiantes devant [NOM] : "Dr T. [NOM]" → "Dr [NOM] [NOM]" - _re_init_nom = re.compile(r'\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])\.[\s\-]*(\[NOM\])') + _re_init_nom = re.compile(r'\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ])\.[\s\-]*(\[NOM\])') protected = _re_init_nom.sub(r'[NOM] \2', protected) # Références initiales : "Ref : JF/VA" → "Ref : [NOM]/[NOM]" _re_ref_init = re.compile(r'(?:Ref\s*:\s*|Réf\s*:\s*)([A-Z]{1,3})\s*/\s*([A-Z]{1,3})\b') @@ -4214,7 +4236,7 @@ def process_pdf( # 3c) Initiales identifiantes devant [NOM] : "Dr T. [NOM]" → "Dr [NOM] [NOM]" _RE_INITIAL_BEFORE_NOM = re.compile( - r'\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])\.[\s\-]*(\[NOM\])' + r'\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ])\.[\s\-]*(\[NOM\])' ) def _clean_initial_before_nom(m): anon.audit.append(PiiHit(-1, "NOM_INITIAL", m.group(1) + ".", PLACEHOLDERS["NOM"])) @@ -4269,14 +4291,14 @@ def process_pdf( _companion_tokens: set = set() for token in _global_name_tokens: # Token connu suivi d'un mot ALL-CAPS - for m in re.finditer(rf"\b{re.escape(token)}\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{4,}})\b", raw_full): + for m in re.finditer(rf"\b{re.escape(token)}\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]{{4,}})\b", raw_full): candidate = m.group(1) if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET and candidate not in _global_name_tokens and candidate not in _COMPANION_BLACKLIST_SET): _companion_tokens.add(candidate) # Mot ALL-CAPS suivi du token connu - for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{4,}})\s+{re.escape(token)}\b", raw_full): + for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ]{{4,}})\s+{re.escape(token)}\b", raw_full): candidate = m.group(1) if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET and candidate not in _global_name_tokens diff --git a/tests/synthetic_review/cases/010_fiche_admission_minimale/expectations.json b/tests/synthetic_review/cases/010_fiche_admission_minimale/expectations.json index 642be55..f246daf 100644 --- a/tests/synthetic_review/cases/010_fiche_admission_minimale/expectations.json +++ b/tests/synthetic_review/cases/010_fiche_admission_minimale/expectations.json @@ -1,5 +1,6 @@ { "required_kinds": [ + "ADHERENT", "ADRESSE", "CODE_POSTAL", "DATE_NAISSANCE", @@ -37,7 +38,7 @@ "05 59 11 22 33", "sabine.darribehaude@example.com", "1234567890", - "2 73 04 65 100 100 88", + "2 73 04 65 100 100 68", "CHCB" ] } diff --git a/tests/synthetic_review/cases/010_fiche_admission_minimale/expected.txt b/tests/synthetic_review/cases/010_fiche_admission_minimale/expected.txt index 2211ff5..d00d74f 100644 --- a/tests/synthetic_review/cases/010_fiche_admission_minimale/expected.txt +++ b/tests/synthetic_review/cases/010_fiche_admission_minimale/expected.txt @@ -34,7 +34,7 @@ RPPS : [RPPS] Tel : [TEL] INFORMATIONS COMPLEMENTAIRES -Mutuelle : MGEN n°adhérent [NDA] +Mutuelle : MGEN n°adhérent [ADHERENT] Sécurité sociale : [NIR] Service de gastro-entérologie — chambre 412 diff --git a/tests/synthetic_review/cases/010_fiche_admission_minimale/test.txt b/tests/synthetic_review/cases/010_fiche_admission_minimale/test.txt index 2cba60f..b15c6e7 100644 --- a/tests/synthetic_review/cases/010_fiche_admission_minimale/test.txt +++ b/tests/synthetic_review/cases/010_fiche_admission_minimale/test.txt @@ -35,6 +35,6 @@ Tel : 05 59 11 22 33 INFORMATIONS COMPLEMENTAIRES Mutuelle : MGEN n°adhérent 1234567890 -Sécurité sociale : 2 73 04 65 100 100 88 +Sécurité sociale : 2 73 04 65 100 100 68 Service de gastro-entérologie — chambre 412