Fix 62 fuites résiduelles : DATE_NAISSANCE global, CHCB, Centre Hospitalier de la Côte Basque
- RE_HOPITAL_VILLE : ajout articles (la/le/l'/les) après prépositions
- DATE_NAISSANCE + force_term + force_regex : propagation globale cross-pages
- Config : org_gpe_keep=false, CHCB + variantes Centre Hospitalier en force_mask
- Audit 130 fichiers : 0 résidu (était 36 DATE_NAISS + 26 ETAB)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -528,9 +528,9 @@ RE_HOPITAL_VILLE = re.compile(
|
||||
r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier"
|
||||
r"|[Cc]entre\s+[Dd]e\s+[Ss]oins|[Mm]aison\s+[Dd]e\s+[Ss]anté"
|
||||
r"|[Mm]aison\s+[Dd]e\s+[Rr]etraite|[Rr]ésidence|[Ff]oyer)"
|
||||
r"\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
|
||||
r"\s+(?:de\s+|d['']\s*|du\s+|des\s+)?(?:la\s+|le\s+|l['']\s*|les\s+)?"
|
||||
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
|
||||
r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
|
||||
r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?(?:la\s+|le\s+|l['']\s*|les\s+)?"
|
||||
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
|
||||
)
|
||||
RE_SERVICE = re.compile(
|
||||
@@ -1950,7 +1950,8 @@ def process_pdf(
|
||||
_global_pii: Dict[str, set] = {}
|
||||
for h in anon.audit:
|
||||
if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
|
||||
"VLM_SERVICE", "VLM_ETAB"}:
|
||||
"VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE",
|
||||
"force_term", "force_regex"}:
|
||||
_global_pii.setdefault(h.kind, set()).add(h.original.strip())
|
||||
for kind, values in _global_pii.items():
|
||||
placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"])
|
||||
@@ -1958,7 +1959,7 @@ def process_pdf(
|
||||
anon.audit.append(PiiHit(page=-1, kind=f"{kind}_GLOBAL", original=val, placeholder=placeholder))
|
||||
|
||||
# 4e) Appliquer les tokens globaux sur le texte pseudonymisé
|
||||
_GLOBAL_SKIP_KINDS = {"EDS_DATE_GLOBAL", "DATE_NAISSANCE_GLOBAL"}
|
||||
_GLOBAL_SKIP_KINDS = {"EDS_DATE_GLOBAL"}
|
||||
for h in anon.audit:
|
||||
if h.page != -1:
|
||||
continue
|
||||
|
||||
@@ -13,13 +13,16 @@ whitelist:
|
||||
noms_maj_excepts:
|
||||
- Médecin DIM
|
||||
- Praticien conseil
|
||||
org_gpe_keep: true
|
||||
org_gpe_keep: false
|
||||
blacklist:
|
||||
force_mask_terms:
|
||||
- CENTRE HOSPITALIER COTE BASQUE
|
||||
- CENTRE HOSPITALIER DE LA COTE BASQUE
|
||||
- CHCB
|
||||
- 'Dates du séjour :'
|
||||
- CONCERTATION
|
||||
force_mask_regex: []
|
||||
force_mask_regex:
|
||||
- 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque'
|
||||
kv_labels_preserve:
|
||||
- FINESS
|
||||
- IPP
|
||||
|
||||
Reference in New Issue
Block a user