From 96d55584c7d8cf3b4dff42482ebd1f65177e4e9f Mon Sep 17 00:00:00 2001 From: Domi31tls Date: Fri, 27 Feb 2026 12:07:58 +0100 Subject: [PATCH] =?UTF-8?q?Fix=2062=20fuites=20r=C3=A9siduelles=20:=20DATE?= =?UTF-8?q?=5FNAISSANCE=20global,=20CHCB,=20Centre=20Hospitalier=20de=20la?= =?UTF-8?q?=20C=C3=B4te=20Basque?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - RE_HOPITAL_VILLE : ajout articles (la/le/l'/les) après prépositions - DATE_NAISSANCE + force_term + force_regex : propagation globale cross-pages - Config : org_gpe_keep=false, CHCB + variantes Centre Hospitalier en force_mask - Audit 130 fichiers : 0 résidu (était 36 DATE_NAISS + 26 ETAB) Co-Authored-By: Claude Opus 4.6 --- anonymizer_core_refactored_onnx.py | 9 +++++---- config/dictionnaires.yml | 7 +++++-- 2 files changed, 10 insertions(+), 6 deletions(-) diff --git a/anonymizer_core_refactored_onnx.py b/anonymizer_core_refactored_onnx.py index 87dc275..79ac306 100644 --- a/anonymizer_core_refactored_onnx.py +++ b/anonymizer_core_refactored_onnx.py @@ -528,9 +528,9 @@ RE_HOPITAL_VILLE = re.compile( r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier" r"|[Cc]entre\s+[Dd]e\s+[Ss]oins|[Mm]aison\s+[Dd]e\s+[Ss]anté" r"|[Mm]aison\s+[Dd]e\s+[Rr]etraite|[Rr]ésidence|[Ff]oyer)" - r"\s+(?:de\s+|d['']\s*|du\s+|des\s+)?" + r"\s+(?:de\s+|d['']\s*|du\s+|des\s+)?(?:la\s+|le\s+|l['']\s*|les\s+)?" r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)" - r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?" + r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?(?:la\s+|le\s+|l['']\s*|les\s+)?" r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)", ) RE_SERVICE = re.compile( @@ -1950,7 +1950,8 @@ def process_pdf( _global_pii: Dict[str, set] = {} for h in anon.audit: if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB", - "VLM_SERVICE", "VLM_ETAB"}: + "VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE", + "force_term", "force_regex"}: _global_pii.setdefault(h.kind, set()).add(h.original.strip()) for kind, values in _global_pii.items(): placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"]) @@ -1958,7 +1959,7 @@ def process_pdf( anon.audit.append(PiiHit(page=-1, kind=f"{kind}_GLOBAL", original=val, placeholder=placeholder)) # 4e) Appliquer les tokens globaux sur le texte pseudonymisé - _GLOBAL_SKIP_KINDS = {"EDS_DATE_GLOBAL", "DATE_NAISSANCE_GLOBAL"} + _GLOBAL_SKIP_KINDS = {"EDS_DATE_GLOBAL"} for h in anon.audit: if h.page != -1: continue diff --git a/config/dictionnaires.yml b/config/dictionnaires.yml index 34e5539..c22ce9c 100644 --- a/config/dictionnaires.yml +++ b/config/dictionnaires.yml @@ -13,13 +13,16 @@ whitelist: noms_maj_excepts: - Médecin DIM - Praticien conseil - org_gpe_keep: true + org_gpe_keep: false blacklist: force_mask_terms: - CENTRE HOSPITALIER COTE BASQUE + - CENTRE HOSPITALIER DE LA COTE BASQUE + - CHCB - 'Dates du séjour :' - CONCERTATION - force_mask_regex: [] + force_mask_regex: + - 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque' kv_labels_preserve: - FINESS - IPP