Qualité anonymisation : 10 améliorations (audit 59 OGC, +98% établissements, 0 FP médical)
- RE_ETABLISSEMENT élargi (CH/CHU/CHRU/CHS/HIA/CLCC/GHT/GCS) ; CH/CHS exigent un nom après le sigle
- RE_HOPITAL_VILLE enrichi (Centre de Soins, Maison de Santé/Retraite, Résidence, Foyer)
- Nouveau RE_SERVICE (service/unité/pôle/département + nom propre)
- org_gpe_keep=False : le NER masque désormais ORG/LOC
- +40 stop words (oncologie, confrères, préparations, spécialités médicales...)
- RE_IBAN accepte les espaces (groupes de 4, format standard)
- RE_TEL : tiret échappé + nouveau RE_TEL_COMPACT (numéros collés, ex. 0612345678)
- RE_ADRESSE : +10 types de voies (lotissement, hameau, esplanade, côte...)
- RE_AGE élargi (« patiente 72 ans », « , 88 ans », « (85A) »)
- Blacklist companion tokens (27 mots génériques/spécialités médicales)
- Propagation globale VLM_SERVICE et VLM_ETAB

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -98,7 +98,7 @@ DEFAULTS_CFG = {
|
|||||||
"whitelist": {
|
"whitelist": {
|
||||||
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
|
"sections_titres": ["DIM", "GHM", "GHS", "RUM", "COMPTE", "RENDU", "DIAGNOSTIC"],
|
||||||
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
|
"noms_maj_excepts": ["Médecin DIM", "Praticien conseil"],
|
||||||
"org_gpe_keep": True,
|
"org_gpe_keep": False,
|
||||||
},
|
},
|
||||||
"blacklist": {
|
"blacklist": {
|
||||||
"force_mask_terms": [],
|
"force_mask_terms": [],
|
||||||
@@ -147,8 +147,9 @@ CRITICAL_PII_KEYS = {"EMAIL", "TEL", "IBAN", "NIR", "IPP", "DATE_NAISSANCE"}
|
|||||||
|
|
||||||
# Baseline regex
|
# Baseline regex
|
||||||
RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
|
RE_EMAIL = re.compile(r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}")
|
||||||
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[ .-]?\d){8}(?!\d)")
|
RE_TEL = re.compile(r"(?<!\d)(?:\+33\s?|0)\d(?:[\s.\-]?\d){8}(?!\d)")
|
||||||
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}[A-Z0-9]{11,30}\b")
|
RE_TEL_COMPACT = re.compile(r"(?<!\d)0[1-9]\d{8}(?!\d)")
|
||||||
|
RE_IBAN = re.compile(r"\b[A-Z]{2}\d{2}(?:\s?[A-Z0-9]{4}){3,7}(?:\s?[A-Z0-9]{1,4})\b")
|
||||||
RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
|
RE_IPP = re.compile(r"\bIPP\s*[:\-]?\s*([A-Za-z0-9]{6,})\b", re.IGNORECASE)
|
||||||
RE_FINESS = re.compile(r"\bFINESS\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
|
RE_FINESS = re.compile(r"\bFINESS\s*[:\-]?\s*(\d{9})\b", re.IGNORECASE)
|
||||||
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
|
RE_OGC = re.compile(r"\b(?:N°\s*)?OGC\s*[:\-]?\s*([A-Za-z0-9\-]{1,})\b", re.IGNORECASE)
|
||||||
@@ -356,6 +357,26 @@ _MEDICAL_STOP_WORDS_SET = {
|
|||||||
"paracetamol", "paracétamol", "unité", "unite",
|
"paracetamol", "paracétamol", "unité", "unite",
|
||||||
# FP résiduels batch 10 OGC (termes médicaux/instructions soins)
|
# FP résiduels batch 10 OGC (termes médicaux/instructions soins)
|
||||||
"glyc", "glycosurie", "vider", "forte",
|
"glyc", "glycosurie", "vider", "forte",
|
||||||
|
# FP audit batch 59 OGC (mots courants/médicaux flagués comme NOM)
|
||||||
|
"oncologie", "confrères", "confrere", "doubles", "chers", "motif",
|
||||||
|
"responsable", "autre", "autres", "autonome", "autonomes",
|
||||||
|
"préparations", "preparations", "prévenir", "prevenir",
|
||||||
|
"acétylsalicylique", "acetylsalicylique", "angio",
|
||||||
|
"desc", "diu", "cambo", "bains", "dogue", "barreau",
|
||||||
|
"haitz", "alde",
|
||||||
|
# Spécialités/services récurrents comme FP NOM
|
||||||
|
"cancérologie", "cancerologie", "réanimation", "reanimation",
|
||||||
|
"urologie", "néphrologie", "nephrologie", "hématologie", "hematologie",
|
||||||
|
"gériatrie", "geriatrie", "pédiatrie", "pediatrie",
|
||||||
|
"ophtalmologie", "stomatologie", "allergologie",
|
||||||
|
"kinésithérapie", "kinesitherapie", "ergothérapie", "ergotherapie",
|
||||||
|
"orthopédie", "orthopedie", "traumatologie",
|
||||||
|
"palliatifs", "palliative", "palliatif",
|
||||||
|
"addictologie", "alcoologie", "tabacologie",
|
||||||
|
# Termes structurels trackare
|
||||||
|
"transmissions", "transmission", "releve", "relevé",
|
||||||
|
"objectif", "objectifs", "evaluation", "évaluation",
|
||||||
|
"planification", "planifié", "planifiee",
|
||||||
}
|
}
|
||||||
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
|
# Enrichissement automatique avec les ~4000 noms de médicaments d'edsnlp
|
||||||
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
|
_MEDICAL_STOP_WORDS_SET.update(_load_edsnlp_drug_names())
|
||||||
@@ -467,7 +488,8 @@ RE_DATE = re.compile(
|
|||||||
)
|
)
|
||||||
RE_ADRESSE = re.compile(
|
RE_ADRESSE = re.compile(
|
||||||
r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*"
|
r"\b\d{1,4}[\s,]*(?:bis|ter)?\s*,?\s*"
|
||||||
r"(?:rue|avenue|av\.?|boulevard|bd\.?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence)"
|
r"(?:rue|avenue|av\.?|boulevard|bd\.?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence"
|
||||||
|
r"|lotissement|lot\.?|cit[ée]|hameau|quartier|voie|parvis|esplanade|promenade|côte)"
|
||||||
r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}",
|
r"\s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç\s\-']{2,}",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
@@ -485,18 +507,37 @@ RE_BP = re.compile(
|
|||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
RE_AGE = re.compile(
|
RE_AGE = re.compile(
|
||||||
r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+de\s+)(\d{1,3})\s*ans\b",
|
r"(?:âg[ée]+\s+de\s+|patient(?:e)?\s+(?:de\s+)?|(?:,\s*|\(\s*)"
|
||||||
|
r")(\d{1,3})\s*(?:ans|A)\b",
|
||||||
re.IGNORECASE,
|
re.IGNORECASE,
|
||||||
)
|
)
|
||||||
# Établissements de santé : avec nom (EHPAD Bayonne) ou seuls (EHPAD, SSR, USLD)
|
# Établissements de santé : sigles longs peuvent être seuls, sigles courts (CH/CHS) nécessitent un nom
|
||||||
|
_ETAB_NAME = (r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
|
||||||
|
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)")
|
||||||
RE_ETABLISSEMENT = re.compile(
|
RE_ETABLISSEMENT = re.compile(
|
||||||
r"\b((?:EHPAD|SSR|USLD|HAD|SSR/USLD|CSAPA|CMP|CMPP|UGA)"
|
r"\b("
|
||||||
r"(?:\s+(?:de\s+|d['']\s*)?[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
|
# Sigles longs : acceptés seuls ou avec nom
|
||||||
|
r"(?:EHPAD|SSR/USLD|SSR|USLD|HAD|CSAPA|CMPP|CMP|UGA|CHRU|CHU|HIA|CLCC|GHT|GCS)"
|
||||||
|
+ _ETAB_NAME + r"*"
|
||||||
|
r"|"
|
||||||
|
# Sigles courts (CH, CHS) : obligent un nom après pour éviter les faux positifs
|
||||||
|
r"(?:CHS|CH)" + _ETAB_NAME + r"+"
|
||||||
|
r")",
|
||||||
)
|
)
|
||||||
RE_HOPITAL_VILLE = re.compile(
|
RE_HOPITAL_VILLE = re.compile(
|
||||||
r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier)"
|
r"\b((?:[Hh]ôpital|[Cc]linique|[Pp]olyclinique|[Cc]entre\s+[Hh]ospitalier"
|
||||||
r"\s+(?:de\s+|d['']\s*)?(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
|
r"|[Cc]entre\s+[Dd]e\s+[Ss]oins|[Mm]aison\s+[Dd]e\s+[Ss]anté"
|
||||||
r"(?:\s+(?:de\s+|d['']\s*)?[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
|
r"|[Mm]aison\s+[Dd]e\s+[Rr]etraite|[Rr]ésidence|[Ff]oyer)"
|
||||||
|
r"\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
|
||||||
|
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
|
||||||
|
r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
|
||||||
|
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
|
||||||
|
)
|
||||||
|
RE_SERVICE = re.compile(
|
||||||
|
r"\b((?:[Ss]ervice|[Uu]nité|[Pp]ôle|[Dd]épartement)\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
|
||||||
|
r"(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)"
|
||||||
|
r"(?:\s+(?:de\s+|d['']\s*|du\s+|des\s+)?"
|
||||||
|
r"[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-']+)*)",
|
||||||
)
|
)
|
||||||
RE_NUMERO_DOSSIER = re.compile(
|
RE_NUMERO_DOSSIER = re.compile(
|
||||||
r"(?:dossier|n°\s*dossier|NDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})"
|
r"(?:dossier|n°\s*dossier|NDA)\s*[:\-n°]+\s*([A-Za-z0-9\-/]{4,})"
|
||||||
@@ -708,6 +749,7 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
|
|||||||
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
|
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
|
||||||
return PLACEHOLDERS["TEL"]
|
return PLACEHOLDERS["TEL"]
|
||||||
line = RE_TEL.sub(_repl_tel, line)
|
line = RE_TEL.sub(_repl_tel, line)
|
||||||
|
line = RE_TEL_COMPACT.sub(_repl_tel, line)
|
||||||
|
|
||||||
# IBAN
|
# IBAN
|
||||||
def _repl_iban(m: re.Match) -> str:
|
def _repl_iban(m: re.Match) -> str:
|
||||||
@@ -779,6 +821,12 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
|
|||||||
line = RE_ETABLISSEMENT.sub(_repl_etab, line)
|
line = RE_ETABLISSEMENT.sub(_repl_etab, line)
|
||||||
line = RE_HOPITAL_VILLE.sub(_repl_etab, line)
|
line = RE_HOPITAL_VILLE.sub(_repl_etab, line)
|
||||||
|
|
||||||
|
# Services hospitaliers (service de Cardiologie, unité de soins palliatifs, etc.)
|
||||||
|
def _repl_service(m: re.Match) -> str:
|
||||||
|
audit.append(PiiHit(page_idx, "ETAB", m.group(0), PLACEHOLDERS["MASK"]))
|
||||||
|
return PLACEHOLDERS["MASK"]
|
||||||
|
line = RE_SERVICE.sub(_repl_service, line)
|
||||||
|
|
||||||
# Champs structurés : Lieu de naissance, Ville de résidence (masquage direct, sans filtre stop words)
|
# Champs structurés : Lieu de naissance, Ville de résidence (masquage direct, sans filtre stop words)
|
||||||
_re_lieu = re.compile(r"(Lieu\s+de\s+naissance\s*:\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+)")
|
_re_lieu = re.compile(r"(Lieu\s+de\s+naissance\s*:\s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç\-\' ]+)")
|
||||||
def _repl_lieu(m: re.Match) -> str:
|
def _repl_lieu(m: re.Match) -> str:
|
||||||
@@ -852,6 +900,7 @@ def _mask_critical_in_key(key: str, audit: List[PiiHit], page_idx: int) -> str:
|
|||||||
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
|
audit.append(PiiHit(page_idx, "TEL", m.group(0), PLACEHOLDERS["TEL"]))
|
||||||
return PLACEHOLDERS["TEL"]
|
return PLACEHOLDERS["TEL"]
|
||||||
key = RE_TEL.sub(_repl_tel, key)
|
key = RE_TEL.sub(_repl_tel, key)
|
||||||
|
key = RE_TEL_COMPACT.sub(_repl_tel, key)
|
||||||
def _repl_email(m: re.Match) -> str:
|
def _repl_email(m: re.Match) -> str:
|
||||||
audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
|
audit.append(PiiHit(page_idx, "EMAIL", m.group(0), PLACEHOLDERS["EMAIL"]))
|
||||||
return PLACEHOLDERS["EMAIL"]
|
return PLACEHOLDERS["EMAIL"]
|
||||||
@@ -1193,7 +1242,7 @@ def anonymise_document_regex(pages_text: List[str], tables_lines: List[List[str]
|
|||||||
|
|
||||||
def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
|
def _mask_with_hf(text: str, ents: List[Dict[str, Any]], cfg: Dict[str, Any], audit: List[PiiHit]) -> str:
|
||||||
# remplace via regex sur les 'word' détectés (approche pragmatique)
|
# remplace via regex sur les 'word' détectés (approche pragmatique)
|
||||||
keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", True))
|
keep_org_gpe = bool((cfg.get("whitelist", {}) or {}).get("org_gpe_keep", False))
|
||||||
def repl_once(s: str, old: str, new: str) -> str:
|
def repl_once(s: str, old: str, new: str) -> str:
|
||||||
return re.sub(rf"\b{re.escape(old)}\b", new, s)
|
return re.sub(rf"\b{re.escape(old)}\b", new, s)
|
||||||
out = text
|
out = text
|
||||||
@@ -1364,6 +1413,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
|||||||
# PII critiques (comme avant)
|
# PII critiques (comme avant)
|
||||||
protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
|
protected = RE_EMAIL.sub(PLACEHOLDERS["EMAIL"], protected)
|
||||||
protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
|
protected = RE_TEL.sub(PLACEHOLDERS["TEL"], protected)
|
||||||
|
protected = RE_TEL_COMPACT.sub(PLACEHOLDERS["TEL"], protected)
|
||||||
protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
|
protected = RE_IBAN.sub(PLACEHOLDERS["IBAN"], protected)
|
||||||
# NIR avec validation
|
# NIR avec validation
|
||||||
def _rescan_nir(m: re.Match) -> str:
|
def _rescan_nir(m: re.Match) -> str:
|
||||||
@@ -1382,6 +1432,8 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
|
|||||||
# Établissements
|
# Établissements
|
||||||
protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
|
protected = RE_ETABLISSEMENT.sub(PLACEHOLDERS["ETAB"], protected)
|
||||||
protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
|
protected = RE_HOPITAL_VILLE.sub(PLACEHOLDERS["ETAB"], protected)
|
||||||
|
# Services hospitaliers
|
||||||
|
protected = RE_SERVICE.sub(PLACEHOLDERS["MASK"], protected)
|
||||||
# Personnes contextuelles (avec whitelist)
|
# Personnes contextuelles (avec whitelist)
|
||||||
wl_sections = set()
|
wl_sections = set()
|
||||||
wl_phrases = set()
|
wl_phrases = set()
|
||||||
@@ -1832,18 +1884,33 @@ def process_pdf(
|
|||||||
_global_name_tokens.add(word)
|
_global_name_tokens.add(word)
|
||||||
# 4a-bis) Noms compagnons : si un token connu est suivi/précédé d'un mot majuscule inconnu
|
# 4a-bis) Noms compagnons : si un token connu est suivi/précédé d'un mot majuscule inconnu
|
||||||
# dans le texte brut, c'est aussi un nom (ex: "Diego OLIVER" → OLIVER est un nom)
|
# dans le texte brut, c'est aussi un nom (ex: "Diego OLIVER" → OLIVER est un nom)
|
||||||
|
_COMPANION_BLACKLIST = {
|
||||||
|
"ZONE", "PARTI", "PLAN", "MAIN", "FORT", "FORTE", "BILAN",
|
||||||
|
"MISE", "NOTE", "AIDE", "BASE", "FACE", "DOSE", "TIGE",
|
||||||
|
"VOIE", "ONDE", "SOIN", "DEMI", "MODE", "CURE", "PAGE",
|
||||||
|
# Spécialités/services
|
||||||
|
"CANCEROLOGIE", "ONCOLOGIE", "REANIMATION", "RADIOLOGIE",
|
||||||
|
"CARDIOLOGIE", "NEUROLOGIE", "PNEUMOLOGIE", "UROLOGIE",
|
||||||
|
"GERIATRIE", "PEDIATRIE", "NEPHROLOGIE", "HEMATOLOGIE",
|
||||||
|
"OPHTALMOLOGIE", "STOMATOLOGIE", "ALLERGOLOGIE",
|
||||||
|
"RHUMATOLOGIE", "DERMATOLOGIE", "IMMUNOLOGIE",
|
||||||
|
}
|
||||||
raw_full = "\n\n".join(pages_text)
|
raw_full = "\n\n".join(pages_text)
|
||||||
_companion_tokens: set = set()
|
_companion_tokens: set = set()
|
||||||
for token in _global_name_tokens:
|
for token in _global_name_tokens:
|
||||||
# Token connu suivi d'un mot ALL-CAPS
|
# Token connu suivi d'un mot ALL-CAPS
|
||||||
for m in re.finditer(rf"\b{re.escape(token)}\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\b", raw_full):
|
for m in re.finditer(rf"\b{re.escape(token)}\s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\b", raw_full):
|
||||||
candidate = m.group(1)
|
candidate = m.group(1)
|
||||||
if candidate.lower() not in _MEDICAL_STOP_WORDS_SET and candidate not in _global_name_tokens:
|
if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
|
||||||
|
and candidate not in _global_name_tokens
|
||||||
|
and candidate not in _COMPANION_BLACKLIST):
|
||||||
_companion_tokens.add(candidate)
|
_companion_tokens.add(candidate)
|
||||||
# Mot ALL-CAPS suivi du token connu
|
# Mot ALL-CAPS suivi du token connu
|
||||||
for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\s+{re.escape(token)}\b", raw_full):
|
for m in re.finditer(rf"\b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]{{3,}})\s+{re.escape(token)}\b", raw_full):
|
||||||
candidate = m.group(1)
|
candidate = m.group(1)
|
||||||
if candidate.lower() not in _MEDICAL_STOP_WORDS_SET and candidate not in _global_name_tokens:
|
if (candidate.lower() not in _MEDICAL_STOP_WORDS_SET
|
||||||
|
and candidate not in _global_name_tokens
|
||||||
|
and candidate not in _COMPANION_BLACKLIST):
|
||||||
_companion_tokens.add(candidate)
|
_companion_tokens.add(candidate)
|
||||||
_global_name_tokens.update(_companion_tokens)
|
_global_name_tokens.update(_companion_tokens)
|
||||||
|
|
||||||
@@ -1882,7 +1949,8 @@ def process_pdf(
|
|||||||
# 4b) TEL, EMAIL, ADRESSE, CODE_POSTAL : propager les valeurs uniques sur toutes les pages
|
# 4b) TEL, EMAIL, ADRESSE, CODE_POSTAL : propager les valeurs uniques sur toutes les pages
|
||||||
_global_pii: Dict[str, set] = {}
|
_global_pii: Dict[str, set] = {}
|
||||||
for h in anon.audit:
|
for h in anon.audit:
|
||||||
if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB"}:
|
if h.kind in {"TEL", "EMAIL", "ADRESSE", "CODE_POSTAL", "EPISODE", "RPPS", "VILLE", "ETAB",
|
||||||
|
"VLM_SERVICE", "VLM_ETAB"}:
|
||||||
_global_pii.setdefault(h.kind, set()).add(h.original.strip())
|
_global_pii.setdefault(h.kind, set()).add(h.original.strip())
|
||||||
for kind, values in _global_pii.items():
|
for kind, values in _global_pii.items():
|
||||||
placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"])
|
placeholder = PLACEHOLDERS.get(kind, PLACEHOLDERS["MASK"])
|
||||||
|
|||||||
Reference in New Issue
Block a user