Fix 62 fuites résiduelles : DATE_NAISSANCE global, CHCB, Centre Hospitalier de la Côte Basque
- RE_HOPITAL_VILLE : ajout articles (la/le/l'/les) après prépositions
- DATE_NAISSANCE + force_term + force_regex : propagation globale cross-pages
- Config : org_gpe_keep=false, CHCB + variantes Centre Hospitalier en force_mask
- Audit 130 fichiers : 0 résidu (était 36 DATE_NAISS + 26 ETAB)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -528,9 +528,9 @@ RE_HOPITAL_VILLE = re.compile(
|
|||||||
r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier"
|
r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier"
|
||||||
r"|[Cc]entre\s+[Dd]e\s+[Ss]oins|[Mm]aison\s+[Dd]e\s+[Ss]anté"
|
r"|[Cc]entre\s+[Dd]e\s+[Ss]oins|[Mm]aison\s+[Dd]e\s+[Ss]anté"
|
||||||
r"|[Mm]aison\s+[Dd]e\s+[Rr]etraite|[Rr]ésidence|[Ff]oyer)"
|
r"|[Mm]aison\s+[Dd]e\s+[Rr]etraite|[Rr]ésidence|[Ff]oyer)"
|
||||||
r"\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
|
r"\s+(?:de\s+|d['']\s*|du\s+|des\s+)?(?:la\s+|le\s+|l['']\s*|les\s+)?"
|
||||||
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
|
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
|
||||||
r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
|
r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?(?:la\s+|le\s+|l['']\s*|les\s+)?"
|
||||||
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
|
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
|
||||||
)
|
)
|
||||||
RE_SERVICE = re.compile(
|
RE_SERVICE = re.compile(
|
||||||
@@ -1950,7 +1950,8 @@ def process_pdf(
|
|||||||
_global_pii: Dict[str, set] = {}
|
_global_pii: Dict[str, set] = {}
|
||||||
for h in anon.audit:
|
for h in anon.audit:
|
||||||
if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
|
if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
|
||||||
"VLM_SERVICE", "VLM_ETAB"}:
|
"VLM_SERVICE", "VLM_ETAB", "DATE_NAISSANCE",
|
||||||
|
"force_term", "force_regex"}:
|
||||||
_global_pii.setdefault(h.kind, set()).add(h.original.strip())
|
_global_pii.setdefault(h.kind, set()).add(h.original.strip())
|
||||||
for kind, values in _global_pii.items():
|
for kind, values in _global_pii.items():
|
||||||
placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"])
|
placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"])
|
||||||
@@ -1958,7 +1959,7 @@ def process_pdf(
|
|||||||
anon.audit.append(PiiHit(page=-1, kind=f"{kind}_GLOBAL", original=val, placeholder=placeholder))
|
anon.audit.append(PiiHit(page=-1, kind=f"{kind}_GLOBAL", original=val, placeholder=placeholder))
|
||||||
|
|
||||||
# 4e) Appliquer les tokens globaux sur le texte pseudonymisé
|
# 4e) Appliquer les tokens globaux sur le texte pseudonymisé
|
||||||
_GLOBAL_SKIP_KINDS = {"EDS_DATE_GLOBAL", "DATE_NAISSANCE_GLOBAL"}
|
_GLOBAL_SKIP_KINDS = {"EDS_DATE_GLOBAL"}
|
||||||
for h in anon.audit:
|
for h in anon.audit:
|
||||||
if h.page != -1:
|
if h.page != -1:
|
||||||
continue
|
continue
|
||||||
|
|||||||
@@ -13,13 +13,16 @@ whitelist:
|
|||||||
noms_maj_excepts:
|
noms_maj_excepts:
|
||||||
- Médecin DIM
|
- Médecin DIM
|
||||||
- Praticien conseil
|
- Praticien conseil
|
||||||
org_gpe_keep: true
|
org_gpe_keep: false
|
||||||
blacklist:
|
blacklist:
|
||||||
force_mask_terms:
|
force_mask_terms:
|
||||||
- CENTRE HOSPITALIER COTE BASQUE
|
- CENTRE HOSPITALIER COTE BASQUE
|
||||||
|
- CENTRE HOSPITALIER DE LA COTE BASQUE
|
||||||
|
- CHCB
|
||||||
- 'Dates du séjour :'
|
- 'Dates du séjour :'
|
||||||
- CONCERTATION
|
- CONCERTATION
|
||||||
force_mask_regex: []
|
force_mask_regex:
|
||||||
|
- 'Centre\s+Hospitalier\s+(?:de\s+(?:la\s+)?)?C[oôÔ]te\s+Basque'
|
||||||
kv_labels_preserve:
|
kv_labels_preserve:
|
||||||
- FINESS
|
- FINESS
|
||||||
- IPP
|
- IPP
|
||||||
|
|||||||
Reference in New Issue
Block a user