@@ -376,6 +376,7 @@ PLACEHOLDERS = {
" NDA " : " [NDA] " ,
" NDA " : " [NDA] " ,
" EPISODE " : " [EPISODE] " ,
" EPISODE " : " [EPISODE] " ,
" RPPS " : " [RPPS] " ,
" RPPS " : " [RPPS] " ,
" ADHERENT " : " [ADHERENT] " ,
}
}
CRITICAL_PII_KEYS = { " EMAIL " , " TEL " , " IBAN " , " NIR " , " IPP " , " DATE_NAISSANCE " }
CRITICAL_PII_KEYS = { " EMAIL " , " TEL " , " IBAN " , " NIR " , " IPP " , " DATE_NAISSANCE " }
@@ -391,6 +392,10 @@ RE_CSULT = re.compile(r"\b(?:N°\s*Csult|N°\s*Interv)\s*[:\-]?\s*(\d{6,})\b", r
RE_FINESS = re . compile ( r " \ b(?:N° \ s*)?FINESS? \ s*[: \ -]? \ s*( \ d {9} ) \ b " , re . IGNORECASE )
RE_FINESS = re . compile ( r " \ b(?:N° \ s*)?FINESS? \ s*[: \ -]? \ s*( \ d {9} ) \ b " , re . IGNORECASE )
RE_OGC = re . compile ( r " \ b(?:N° \ s*)?OGC \ s*[: \ -]? \ s*([A-Za-z0-9 \ -] { 1,}) \ b " , re . IGNORECASE )
RE_OGC = re . compile ( r " \ b(?:N° \ s*)?OGC \ s*[: \ -]? \ s*([A-Za-z0-9 \ -] { 1,}) \ b " , re . IGNORECASE )
RE_RPPS = re . compile ( r " \ b(?:N° \ s*)?RPPS \ s*[: \ -]? \ s*( \ d { 8,11}) \ b " , re . IGNORECASE )
RE_RPPS = re . compile ( r " \ b(?:N° \ s*)?RPPS \ s*[: \ -]? \ s*( \ d { 8,11}) \ b " , re . IGNORECASE )
# Member ("adhérent") number printed on mutual / complementary health
# insurance documents.
# Accepted labels: "n adhérent", "n° adhérent", "no adhérent",
# "numéro adhérent" and "numéro d'adhérent" (straight or typographic
# apostrophe), with an optional trailing "e" and an optional ":" or "-"
# separator before the value.
# Group 1 is the identifier itself: 6 to 15 letters/digits.
# re.IGNORECASE makes [A-Z0-9] accept lowercase letters too, so the lookahead
# requires at least one digit; otherwise an ordinary word following the label
# (e.g. "numéro d'adhérent inconnu") would be captured as an identifier.
RE_NUM_ADHERENT = re.compile(
    r"\b(?:n[°o]?\s*|num[ée]ro\s+(?:d['’]\s*)?)adh[ée]rent[e]?\s*[:\-]?\s*"
    r"(?=[A-Z0-9]*[0-9])([A-Z0-9]{6,15})\b",
    re.IGNORECASE,
)
RE_NIR = re . compile (
RE_NIR = re . compile (
r " \ b([12]) \ s*( \ d {2} ) \ s*(0[1-9]|1[0-2]|2[AB]) \ s*( \ d { 2,3}) \ s*( \ d {3} ) \ s*( \ d {3} ) \ s*( \ d {2} ) \ b " ,
r " \ b([12]) \ s*( \ d {2} ) \ s*(0[1-9]|1[0-2]|2[AB]) \ s*( \ d { 2,3}) \ s*( \ d {3} ) \ s*( \ d {3} ) \ s*( \ d {2} ) \ b " ,
re . IGNORECASE ,
re . IGNORECASE ,
@@ -457,7 +462,7 @@ def _refresh_medical_stopwords_pattern() -> None:
_refresh_medical_stopwords_pattern ( )
_refresh_medical_stopwords_pattern ( )
# Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point)
# Un token de nom : commence par majuscule, lettres/tirets/apostrophes (PAS d'espace ni de point)
_PERSON_TOKEN = r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ]+ "
_PERSON_TOKEN = r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ]+ "
RE_PERSON_CONTEXT = re . compile (
RE_PERSON_CONTEXT = re . compile (
r " (?:(?: \ bDr \ .?| \ bDR \ .?| \ bDocteur| \ bPr \ .?| \ bProfesseur| \ bMme| \ bMME| \ bMadame| \ bM \ .| \ bMr \ .?| \ bMonsieur "
r " (?:(?: \ bDr \ .?| \ bDR \ .?| \ bDocteur| \ bPr \ .?| \ bProfesseur| \ bMme| \ bMME| \ bMadame| \ bM \ .| \ bMr \ .?| \ bMonsieur "
r " | \ bNom[ \ t]*:[ \ t]* "
r " | \ bNom[ \ t]*:[ \ t]* "
@@ -469,16 +474,16 @@ RE_PERSON_CONTEXT = re.compile(
# Noms en MAJUSCULES dans des listes virgulées (ex: "le Dr X, Y, LAZARO")
# Noms en MAJUSCULES dans des listes virgulées (ex: "le Dr X, Y, LAZARO")
RE_DR_COMMA_LIST = re . compile (
RE_DR_COMMA_LIST = re . compile (
r " (?:Dr \ .?|DR \ .?|Docteur) \ s+ "
r " (?:Dr \ .?|DR \ .?|Docteur) \ s+ "
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' .]+ "
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' .]+ "
r " (?: \ s*, \ s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' .]+)+ " ,
r " (?: \ s*, \ s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' .]+)+ " ,
re . IGNORECASE ,
re . IGNORECASE ,
)
)
# Token nom : mot commençant par une majuscule d'au moins 3 lettres
# Token nom : mot commençant par une majuscule d'au moins 3 lettres
_NAME_TOKEN_RE = re . compile ( r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ] { 2,} " )
_NAME_TOKEN_RE = re . compile ( r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ] { 2,} " )
SPLITTER = re . compile ( r " \ s*[:|; \ t] \ s* " )
SPLITTER = re . compile ( r " \ s*[:|; \ t] \ s* " )
# --- Extraction globale de noms depuis champs structurés ---
# --- Extraction globale de noms depuis champs structurés ---
_UC_NAME_TOKEN = r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ]+ "
_UC_NAME_TOKEN = r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ]+ "
RE_EXTRACT_PATIENT = re . compile (
RE_EXTRACT_PATIENT = re . compile (
r " Patient \ (?e? \ )? \ s*: \ s* "
r " Patient \ (?e? \ )? \ s*: \ s* "
rf " ((?: { _UC_NAME_TOKEN } )(?: \ s+(?: { _UC_NAME_TOKEN } ))*) "
rf " ((?: { _UC_NAME_TOKEN } )(?: \ s+(?: { _UC_NAME_TOKEN } ))*) "
@@ -488,53 +493,53 @@ RE_EXTRACT_PATIENT = re.compile(
# Champs d'identité structurés (documents trackare / DPI)
# Champs d'identité structurés (documents trackare / DPI)
RE_EXTRACT_NOM_NAISSANCE = re . compile (
RE_EXTRACT_NOM_NAISSANCE = re . compile (
r " Nom \ s+de \ s+naissance \ s*: \ s* "
r " Nom \ s+de \ s+naissance \ s*: \ s* "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ]+?)(?: \ s+IPP| \ s*$) " ,
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ]+?)(?: \ s+IPP| \ s*$) " ,
re . MULTILINE ,
re . MULTILINE ,
)
)
RE_EXTRACT_NOM_PRENOM = re . compile (
RE_EXTRACT_NOM_PRENOM = re . compile (
r " Nom \ s+et \ s+Pr[ée]nom \ s*: \ s* "
r " Nom \ s+et \ s+Pr[ée]nom \ s*: \ s* "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ]+?)(?: \ s+Date| \ s+Né| \ s*$) " ,
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ]+?)(?: \ s+Date| \ s+Né| \ s*$) " ,
re . MULTILINE ,
re . MULTILINE ,
)
)
RE_EXTRACT_NOM_UTILISE = re . compile (
RE_EXTRACT_NOM_UTILISE = re . compile (
r " Nom \ s+utilis[ée] \ s*: \ s* "
r " Nom \ s+utilis[ée] \ s*: \ s* "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ]+?)(?: \ s*$) " ,
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ]+?)(?: \ s*$) " ,
re . MULTILINE ,
re . MULTILINE ,
)
)
RE_EXTRACT_PRENOM = re . compile (
RE_EXTRACT_PRENOM = re . compile (
r " Pr[ée]nom \ s+(?:de \ s+naissance|utilis[ée]) \ s*: \ s* "
r " Pr[ée]nom \ s+(?:de \ s+naissance|utilis[ée]) \ s*: \ s* "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ]+?)(?: \ s*$) " ,
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ]+?)(?: \ s*$) " ,
re . MULTILINE ,
re . MULTILINE ,
)
)
RE_EXTRACT_LIEU_NAISSANCE = re . compile (
RE_EXTRACT_LIEU_NAISSANCE = re . compile (
r " Lieu \ s+de \ s+naissance \ s*: \ s* "
r " Lieu \ s+de \ s+naissance \ s*: \ s* "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ]+?)(?: \ s*$) " ,
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ]+?)(?: \ s*$) " ,
re . MULTILINE ,
re . MULTILINE ,
)
)
RE_EXTRACT_VILLE_RESIDENCE = re . compile (
RE_EXTRACT_VILLE_RESIDENCE = re . compile (
r " Ville \ s+de \ s+r[ée]sidence \ s*: \ s* "
r " Ville \ s+de \ s+r[ée]sidence \ s*: \ s* "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ]+?)(?: \ s*$) " ,
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ]+?)(?: \ s*$) " ,
re . MULTILINE ,
re . MULTILINE ,
)
)
# Contacts structurés : Conjoint/Concubin/Epoux/Epouse/Parent + NOM PRENOM
# Contacts structurés : Conjoint/Concubin/Epoux/Epouse/Parent + NOM PRENOM
RE_EXTRACT_CONTACT = re . compile (
RE_EXTRACT_CONTACT = re . compile (
r " (?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur) \ s+ "
r " (?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur) \ s+ "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ]+) "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ]+) "
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ]+))? " ,
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ]+))? " ,
)
)
RE_EXTRACT_REDIGE = re . compile (
RE_EXTRACT_REDIGE = re . compile (
r " (?:Rédigé|Validé|Signé|Saisi)[ \ t]+par[ \ t]+ "
r " (?:Rédigé|Validé|Signé|Saisi)[ \ t]+par[ \ t]+ "
rf " ((?: { _UC_NAME_TOKEN } )(?:[ \ t]+(?: { _UC_NAME_TOKEN } )) {{ 0,2 }} ) " ,
rf " ((?: { _UC_NAME_TOKEN } )(?:[ \ t]+(?: { _UC_NAME_TOKEN } )) {{ 0,2 }} ) " ,
)
)
# Token nom composé : JEAN-PIERRE, CAZELLES-BOUDIER, etc.
# Token nom composé : JEAN-PIERRE, CAZELLES-BOUDIER, etc.
_UC_COMPOUND = r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] { 2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] { 2,})* "
_UC_COMPOUND = r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] { 2,}(?:-[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] { 2,})* "
RE_EXTRACT_MME_MR = re . compile (
RE_EXTRACT_MME_MR = re . compile (
r " (?:MMES|MME|Mmes|Mme|Madame|Mesdames|Monsieur|Messieurs|Mrs|Mr \ .?) \ s+ "
r " (?:MMES|MME|Mmes|Mme|Madame|Mesdames|Monsieur|Messieurs|Mrs|Mr \ .?) \ s+ "
r " (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] \ . \ s*(?:-? \ s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] \ . \ s*)?)? "
r " (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] \ . \ s*(?:-? \ s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] \ . \ s*)?)? "
rf " ((?: { _UC_NAME_TOKEN } )(?:[ \ t]+(?: { _UC_NAME_TOKEN } )) {{ 0,4 }} ) " ,
rf " ((?: { _UC_NAME_TOKEN } )(?:[ \ t]+(?: { _UC_NAME_TOKEN } )) {{ 0,4 }} ) " ,
)
)
# Listes virgulées après civilité : "Mmes Anorga, Goyenaga, Martinez et Murcy"
# Listes virgulées après civilité : "Mmes Anorga, Goyenaga, Martinez et Murcy"
_CNAME = r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ] "
_CNAME = r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ] "
RE_CIVILITE_COMMA_LIST = re . compile (
RE_CIVILITE_COMMA_LIST = re . compile (
r " (?:Mmes|Mme|Mesdames|Mrs|Mr|Messieurs|Monsieur|Madame|Dr \ .?|Docteur) \ s+ "
r " (?:Mmes|Mme|Mesdames|Mrs|Mr|Messieurs|Monsieur|Madame|Dr \ .?|Docteur) \ s+ "
+ _CNAME + r " + "
+ _CNAME + r " + "
@@ -542,7 +547,7 @@ RE_CIVILITE_COMMA_LIST = re.compile(
+ r " (?: \ s*,? \ s* \ bet \ s+ " + _CNAME + r " +)? " ,
+ r " (?: \ s*,? \ s* \ bet \ s+ " + _CNAME + r " +)? " ,
re . IGNORECASE ,
re . IGNORECASE ,
)
)
_INITIAL_OPT = r " (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] \ . \ s*(?:-? \ s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] \ . \ s*)?)? "
_INITIAL_OPT = r " (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] \ . \ s*(?:-? \ s*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] \ . \ s*)?)? "
RE_EXTRACT_DR_DEST = re . compile (
RE_EXTRACT_DR_DEST = re . compile (
r " \ b(?:DR \ .?|Dr \ .?|Docteur)[ \ t]+ "
r " \ b(?:DR \ .?|Dr \ .?|Docteur)[ \ t]+ "
+ _INITIAL_OPT +
+ _INITIAL_OPT +
@@ -552,8 +557,8 @@ RE_EXTRACT_DR_DEST = re.compile(
RE_EXTRACT_STAFF_ROLE = re . compile (
RE_EXTRACT_STAFF_ROLE = re . compile (
r " \ b(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH|Cadre[ \ t]+Infirmier "
r " \ b(?:Aide|Infirmière?|IDE|IADE|IBODE|ASH|Cadre[ \ t]+Infirmier "
r " |Prescripteur|Prescrit[ \ t]+par|Exécut[ée][ \ t]+par|Réalisé[ \ t]+par) \ b[ \ t]*:?[ \ t]* "
r " |Prescripteur|Prescrit[ \ t]+par|Exécut[ée][ \ t]+par|Réalisé[ \ t]+par) \ b[ \ t]*:?[ \ t]* "
r " ((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+(?:[ \ t]*-[ \ t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç]+)?[ \ t]+)? "
r " ((?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][a-zéèàùâêîôûäëïöüçñ ]+(?:[ \ t]*-[ \ t]*[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][a-zéèàùâêîôûäëïöüçñ ]+)?[ \ t]+)? "
r " (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] { 2,}[ \ -]?)(?:[ \ t \ -]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] { 2,}) { 0,2}) " ,
r " (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] { 2,}[ \ -]?)(?:[ \ t \ -]+[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] { 2,}) { 0,2}) " ,
)
)
# "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL"
# "Pr DUVAL", "Pr. J.-M. DUVAL", "Professeur DUVAL"
RE_EXTRACT_PR = re . compile (
RE_EXTRACT_PR = re . compile (
@@ -589,7 +594,7 @@ CID_PATTERN = re.compile(r"\(cid:\d+\)")
# --- Mr/Mme + initiale isolée (ex: "Mme Z", "Mr R") ---
# --- Mr/Mme + initiale isolée (ex: "Mme Z", "Mr R") ---
RE_CIVILITE_INITIALE = re . compile (
RE_CIVILITE_INITIALE = re . compile (
r " \ b((?:Mme|MME|Madame|Monsieur|Mr \ .?|M \ .) \ s+)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])(?=[ \ s,. \ - \ ) \ ]:;!?]|$) "
r " \ b((?:Mme|MME|Madame|Monsieur|Mr \ .?|M \ .) \ s+)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ])(?=[ \ s,. \ - \ ) \ ]:;!?]|$) "
)
)
# --- N° examen / N° patient imagerie (radiologie) ---
# --- N° examen / N° patient imagerie (radiologie) ---
@@ -611,7 +616,7 @@ RE_NUM_ACCESSION_HEADER = re.compile(
# --- Adresses lieu-dit / maison basque / lotissement ---
# --- Adresses lieu-dit / maison basque / lotissement ---
RE_ADRESSE_LIEU_DIT = re . compile (
RE_ADRESSE_LIEU_DIT = re . compile (
r " \ b(?:MAISON|LOT|LOTISSEMENT|RESIDENCE|RÉSIDENCE|MAS|LIEU[ \ s \ -]DIT|DOMAINE|HAMEAU|QUARTIER) \ s+ "
r " \ b(?:MAISON|LOT|LOTISSEMENT|RESIDENCE|RÉSIDENCE|MAS|LIEU[ \ s \ -]DIT|DOMAINE|HAMEAU|QUARTIER) \ s+ "
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - ' ] { 2,} "
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - ' ] { 2,} "
r " (?: \ s+ \ d { 1,4})? " ,
r " (?: \ s+ \ d { 1,4})? " ,
re . IGNORECASE ,
re . IGNORECASE ,
)
)
@@ -638,7 +643,7 @@ RE_ADRESSE = re.compile(
r " \ b \ d { 1,4}[ \ s,]*(?:bis|ter)? \ s*,? \ s* "
r " \ b \ d { 1,4}[ \ s,]*(?:bis|ter)? \ s*,? \ s* "
r " (?:rue|avenue|av \ .?|boulevard|bd \ .?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence "
r " (?:rue|avenue|av \ .?|boulevard|bd \ .?|place|chemin|all[ée]e|impasse|route|cours|passage|square|r[ée]sidence "
r " |lotissement|lot \ .?|cit[ée]|hameau|quartier|voie|parvis|esplanade|promenade|côte) "
r " |lotissement|lot \ .?|cit[ée]|hameau|quartier|voie|parvis|esplanade|promenade|côte) "
r " \ s+[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç \ s \ - ' ] { 2,} " ,
r " \ s+[A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ äëïöüçñ \ s \ - ' ] { 2,} " ,
re . IGNORECASE ,
re . IGNORECASE ,
)
)
RE_CODE_POSTAL = re . compile (
RE_CODE_POSTAL = re . compile (
@@ -646,12 +651,12 @@ RE_CODE_POSTAL = re.compile(
r " | "
r " | "
# 5 chiffres + nom de ville (Title Case ou MAJUSCULES), pas précédé d'un chiffre (évite RPPS)
# 5 chiffres + nom de ville (Title Case ou MAJUSCULES), pas précédé d'un chiffre (évite RPPS)
# Exclure les unités médicales (UI, mg, ml, etc.) via negative lookahead
# Exclure les unités médicales (UI, mg, ml, etc.) via negative lookahead
r " (?:(?<! \ d)( \ d {5} )[ \ t]+(?!UI \ b|mg \ b|ml \ b|µg \ b)[A-ZÉÈÀÙ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû]+ "
r " (?:(?<! \ d)( \ d {5} )[ \ t]+(?!UI \ b|mg \ b|ml \ b|µg \ b)[A-ZÉÈÀÙ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ]+ "
r " (?:[ \ s \ -][A-ZÉÈÀÙ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû]+)* "
r " (?:[ \ s \ -][A-ZÉÈÀÙ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ]+)* "
r " (?: \ s+CEDEX)?) " ,
r " (?: \ s+CEDEX)?) " ,
)
)
RE_BP = re . compile (
RE_BP = re . compile (
r " (?:[A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû \ . \ -]+ \ s+)?BP \ s+ \ d+ " ,
r " (?:[A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ \ . \ -]+ \ s+)?BP \ s+ \ d+ " ,
re . IGNORECASE ,
re . IGNORECASE ,
)
)
RE_AGE = re . compile (
RE_AGE = re . compile (
@@ -661,7 +666,7 @@ RE_AGE = re.compile(
)
)
# Établissements de santé : sigles longs peuvent être seuls, sigles courts (CH/CHS) nécessitent un nom
# Établissements de santé : sigles longs peuvent être seuls, sigles courts (CH/CHS) nécessitent un nom
_ETAB_NAME = ( r " (?: \ s+(?:de \ s+|d[ ' ' ] \ s*|du \ s+|des \ s+)? "
_ETAB_NAME = ( r " (?: \ s+(?:de \ s+|d[ ' ' ] \ s*|du \ s+|des \ s+)? "
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - ' ]+) " )
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - ' ]+) " )
RE_ETABLISSEMENT = re . compile (
RE_ETABLISSEMENT = re . compile (
r " \ b( "
r " \ b( "
# Sigles longs : acceptés seuls ou avec nom
# Sigles longs : acceptés seuls ou avec nom
@@ -682,15 +687,15 @@ RE_HOPITAL_VILLE = re.compile(
# Déterminants : case-insensitive aussi (de, DE, De… du, DU…).
# Déterminants : case-insensitive aussi (de, DE, De… du, DU…).
r " \ s+(?i:de \ s+|d[ ' ' ] \ s*|du \ s+|des \ s+)?(?i:la \ s+|le \ s+|l[ ' ' ] \ s*|les \ s+)? "
r " \ s+(?i:de \ s+|d[ ' ' ] \ s*|du \ s+|des \ s+)?(?i:la \ s+|le \ s+|l[ ' ' ] \ s*|les \ s+)? "
# Nom propre : toujours commence par une majuscule, queue accepte mélange.
# Nom propre : toujours commence par une majuscule, queue accepte mélange.
r " (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - ' ]+) "
r " (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - ' ]+) "
r " (?: \ s+(?i:de \ s+|d[ ' ' ] \ s*|du \ s+|des \ s+)?(?i:la \ s+|le \ s+|l[ ' ' ] \ s*|les \ s+)? "
r " (?: \ s+(?i:de \ s+|d[ ' ' ] \ s*|du \ s+|des \ s+)?(?i:la \ s+|le \ s+|l[ ' ' ] \ s*|les \ s+)? "
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - ' ]+)*) " ,
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - ' ]+)*) " ,
)
)
RE_SERVICE = re . compile (
RE_SERVICE = re . compile (
r " \ b((?:[Ss]ervice|[Uu]nité|[Pp]ôle|[Dd]épartement) \ s+(?:de \ s+|d[ ' ' ] \ s*|du \ s+|des \ s+)? "
r " \ b((?:[Ss]ervice|[Uu]nité|[Pp]ôle|[Dd]épartement) \ s+(?:de \ s+|d[ ' ' ] \ s*|du \ s+|des \ s+)? "
r " (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - ' ]+) "
r " (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - ' ]+) "
r " (?: \ s+(?:de \ s+|d[ ' ' ] \ s*|du \ s+|des \ s+)? "
r " (?: \ s+(?:de \ s+|d[ ' ' ] \ s*|du \ s+|des \ s+)? "
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - ' ]+)*) " ,
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - ' ]+)*) " ,
)
)
RE_NUMERO_DOSSIER = re . compile (
RE_NUMERO_DOSSIER = re . compile (
r " (?: \ bdossier| \ bn° \ s*dossier| \ bNDA) \ s*[: \ -n°]+ \ s*([A-Za-z0-9 \ -/] { 4,}) "
r " (?: \ bdossier| \ bn° \ s*dossier| \ bNDA) \ s*[: \ -n°]+ \ s*([A-Za-z0-9 \ -/] { 4,}) "
@@ -1246,6 +1251,18 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
_re_url_www = re . compile ( r " (?<! \ S)www \ .[a-z0-9 \ -]+ \ .(?:fr|com|org|net|eu)(?:/[^ \ s]*)? " , re . IGNORECASE )
_re_url_www = re . compile ( r " (?<! \ S)www \ .[a-z0-9 \ -]+ \ .(?:fr|com|org|net|eu)(?:/[^ \ s]*)? " , re . IGNORECASE )
line = _re_url_www . sub ( _repl_url , line )
line = _re_url_www . sub ( _repl_url , line )
# NIR d'abord (avec validation clé modulo 97), pour éviter qu'un NIR
# au format espacé "2 73 04 65 100 100 88" soit consommé par RE_TEL
# qui matcherait les 10 chiffres centraux. Si la validation échoue,
# le pattern reste intact et TEL reprend la main.
def _repl_nir(m: re.Match) -> str:
    """Substitution callback for ``RE_NIR``.

    Returns the NIR placeholder and records an audit hit when
    ``validate_nir`` accepts the matched text; otherwise returns the
    matched text unchanged so that later patterns can still examine it.
    """
    candidate = m.group(0)
    if validate_nir(candidate):
        audit.append(PiiHit(page_idx, "NIR", candidate, PLACEHOLDERS["NIR"]))
        return PLACEHOLDERS["NIR"]
    # Validation failed: treat the match as a false positive, do not mask.
    return candidate
line = RE_NIR . sub ( _repl_nir , line )
# TEL
# TEL
def _repl_tel ( m : re . Match ) - > str :
def _repl_tel ( m : re . Match ) - > str :
audit . append ( PiiHit ( page_idx , " TEL " , m . group ( 0 ) , PLACEHOLDERS [ " TEL " ] ) )
audit . append ( PiiHit ( page_idx , " TEL " , m . group ( 0 ) , PLACEHOLDERS [ " TEL " ] ) )
@@ -1260,15 +1277,6 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
return PLACEHOLDERS [ " IBAN " ]
return PLACEHOLDERS [ " IBAN " ]
line = RE_IBAN . sub ( _repl_iban , line )
line = RE_IBAN . sub ( _repl_iban , line )
# NIR (avec validation clé modulo 97)
def _repl_nir ( m : re . Match ) - > str :
raw = m . group ( 0 )
if not validate_nir ( raw ) :
return raw # faux positif, on ne masque pas
audit . append ( PiiHit ( page_idx , " NIR " , raw , PLACEHOLDERS [ " NIR " ] ) )
return PLACEHOLDERS [ " NIR " ]
line = RE_NIR . sub ( _repl_nir , line )
# DATE_NAISSANCE (plus spécifique, avant DATE générique)
# DATE_NAISSANCE (plus spécifique, avant DATE générique)
def _repl_date_naissance ( m : re . Match ) - > str :
def _repl_date_naissance ( m : re . Match ) - > str :
audit . append ( PiiHit ( page_idx , " DATE_NAISSANCE " , m . group ( 0 ) , PLACEHOLDERS [ " DATE_NAISSANCE " ] ) )
audit . append ( PiiHit ( page_idx , " DATE_NAISSANCE " , m . group ( 0 ) , PLACEHOLDERS [ " DATE_NAISSANCE " ] ) )
@@ -1346,6 +1354,14 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
return full [ : full . find ( val ) ] + PLACEHOLDERS [ " NDA " ]
return full [ : full . find ( val ) ] + PLACEHOLDERS [ " NDA " ]
line = RE_VENUE_SEJOUR . sub ( _repl_venue , line )
line = RE_VENUE_SEJOUR . sub ( _repl_venue , line )
# N° adhérent (mutuelle / complémentaire santé)
def _repl_adherent(m: re.Match) -> str:
    """Substitution callback for ``RE_NUM_ADHERENT``.

    Keeps the label part of the match, replaces the captured member
    number (group 1) with the ADHERENT placeholder, and records the
    captured value in the audit list.
    """
    val = m.group(1)
    audit.append(PiiHit(page_idx, "ADHERENT", val, PLACEHOLDERS["ADHERENT"]))
    # Slice by match offsets rather than searching for the value's text:
    # a text search returns the FIRST occurrence, which may sit inside the
    # label when the captured value repeats part of it.
    label = m.group(0)[: m.start(1) - m.start(0)]
    return label + PLACEHOLDERS["ADHERENT"]
line = RE_NUM_ADHERENT . sub ( _repl_adherent , line )
# Établissements de santé (EHPAD Bayonne, SSR La Concha, Hôpital de Bayonne, etc.)
# Établissements de santé (EHPAD Bayonne, SSR La Concha, Hôpital de Bayonne, etc.)
def _repl_etab ( m : re . Match ) - > str :
def _repl_etab ( m : re . Match ) - > str :
audit . append ( PiiHit ( page_idx , " ETAB " , m . group ( 0 ) , PLACEHOLDERS [ " ETAB " ] ) )
audit . append ( PiiHit ( page_idx , " ETAB " , m . group ( 0 ) , PLACEHOLDERS [ " ETAB " ] ) )
@@ -1369,11 +1385,11 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
# Stratégie : si un segment contient un mot-clé d'établissement, masquer TOUTE la ligne
# Stratégie : si un segment contient un mot-clé d'établissement, masquer TOUTE la ligne
# espacée (tous les segments contigus) pour éviter de laisser "D E L A C ÔT E B A S Q U E"
# espacée (tous les segments contigus) pour éviter de laisser "D E L A C ÔT E B A S Q U E"
_RE_SPACED_TEXT = re . compile (
_RE_SPACED_TEXT = re . compile (
r ' (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] \ s) { 4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] '
r ' (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] \ s) { 4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] '
)
)
# Pattern plus large : toute la chaîne espacée (lettres séparées par espaces + mots courts)
# Pattern plus large : toute la chaîne espacée (lettres séparées par espaces + mots courts)
_RE_SPACED_FULL_LINE = re . compile (
_RE_SPACED_FULL_LINE = re . compile (
r ' (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ](?: \ s|ÔT|ÉE)) { 3,}[ \ sA-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]* '
r ' (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ](?: \ s|ÔT|ÉE)) { 3,}[ \ sA-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ]* '
)
)
_SPACED_ETAB_KEYWORDS = {
_SPACED_ETAB_KEYWORDS = {
" HOSPITALIER " , " HOSPITALIERE " , " HOSPITALIERES " , " HOSPITALIERS " ,
" HOSPITALIER " , " HOSPITALIERE " , " HOSPITALIERES " , " HOSPITALIERS " ,
@@ -1428,9 +1444,9 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
# Autorise les mots de liaison minuscules (de, du, la, sur, en, lès)
# Autorise les mots de liaison minuscules (de, du, la, sur, en, lès)
_re_ville_date = re . compile (
_re_ville_date = re . compile (
r " ^( \ s*) "
r " ^( \ s*) "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][a-zéèàùâêîôûäëïöüç \ -]+ "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][a-zéèàùâêîôûäëïöüçñ \ -]+ "
r " (?: \ s+(?:de|du|la|sur|en|lès|les|l[ ' ' ] \ s*)? "
r " (?: \ s+(?:de|du|la|sur|en|lès|les|l[ ' ' ] \ s*)? "
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ -]+)*) "
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ -]+)*) "
r " ( \ s*, \ s+le \ s+ \ d { 1,2}) " ,
r " ( \ s*, \ s+le \ s+ \ d { 1,2}) " ,
re . MULTILINE ,
re . MULTILINE ,
)
)
@@ -1447,7 +1463,7 @@ def _mask_line_by_regex(line: str, audit: List[PiiHit], page_idx: int, cfg: Dict
return m . group ( 1 ) + PLACEHOLDERS [ " VILLE " ]
return m . group ( 1 ) + PLACEHOLDERS [ " VILLE " ]
line = _re_lieu . sub ( _repl_lieu , line )
line = _re_lieu . sub ( _repl_lieu , line )
_re_ville_res = re . compile ( r " (Ville \ s+de \ s+r[ée]sidence \ s*: \ s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - \ ' ]+) " )
_re_ville_res = re . compile ( r " (Ville \ s+de \ s+r[ée]sidence \ s*: \ s*)([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - \ ' ]+) " )
def _repl_ville_res ( m : re . Match ) - > str :
def _repl_ville_res ( m : re . Match ) - > str :
audit . append ( PiiHit ( page_idx , " VILLE " , m . group ( 2 ) . strip ( ) , PLACEHOLDERS [ " VILLE " ] ) )
audit . append ( PiiHit ( page_idx , " VILLE " , m . group ( 2 ) . strip ( ) , PLACEHOLDERS [ " VILLE " ] ) )
return m . group ( 1 ) + PLACEHOLDERS [ " VILLE " ]
return m . group ( 1 ) + PLACEHOLDERS [ " VILLE " ]
@@ -1576,10 +1592,15 @@ def _mask_structured_line(line: str, audit: List[PiiHit], page_idx: int) -> str:
audit . append ( PiiHit ( page_idx , " NDA " , m . group ( 1 ) , PLACEHOLDERS [ " NDA " ] ) )
audit . append ( PiiHit ( page_idx , " NDA " , m . group ( 1 ) , PLACEHOLDERS [ " NDA " ] ) )
return _replace_captured_value ( m . group ( 0 ) , m . group ( 1 ) , PLACEHOLDERS [ " NDA " ] )
return _replace_captured_value ( m . group ( 0 ) , m . group ( 1 ) , PLACEHOLDERS [ " NDA " ] )
def _repl_adherent(m: re.Match) -> str:
    """Substitution callback for ``RE_NUM_ADHERENT`` on structured lines.

    Records the captured member number (group 1) in the audit list and
    delegates the rewrite of the matched text to
    ``_replace_captured_value``.
    """
    member_id = m.group(1)
    audit.append(PiiHit(page_idx, "ADHERENT", member_id, PLACEHOLDERS["ADHERENT"]))
    matched_text = m.group(0)
    return _replace_captured_value(matched_text, member_id, PLACEHOLDERS["ADHERENT"])
masked = RE_CODE_POSTAL . sub ( _repl_code_postal , line )
masked = RE_CODE_POSTAL . sub ( _repl_code_postal , line )
masked = RE_NUM_EXAMEN_PATIENT . sub ( _repl_num_examen , masked )
masked = RE_NUM_EXAMEN_PATIENT . sub ( _repl_num_examen , masked )
masked = RE_NUMERO_DOSSIER . sub ( _repl_dossier , masked )
masked = RE_NUMERO_DOSSIER . sub ( _repl_dossier , masked )
masked = RE_VENUE_SEJOUR . sub ( _repl_venue , masked )
masked = RE_VENUE_SEJOUR . sub ( _repl_venue , masked )
masked = RE_NUM_ADHERENT . sub ( _repl_adherent , masked )
return masked
return masked
@@ -1681,7 +1702,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
_add_name ( m . group ( 1 ) . strip ( ) , " trackare_nom_prenom " , " high " )
_add_name ( m . group ( 1 ) . strip ( ) , " trackare_nom_prenom " , " high " )
# Prénom de naissance / Prénom utilisé : REGINA
# Prénom de naissance / Prénom utilisé : REGINA
for m in re . finditer ( r " Pr[ée]nom \ s+(?:de \ s+naissance|utilis[ée]) \ s*: \ s*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ s \ - ' ]+?)(?: \ s*$) " , full_text , re . MULTILINE ) :
for m in re . finditer ( r " Pr[ée]nom \ s+(?:de \ s+naissance|utilis[ée]) \ s*: \ s*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ s \ - ' ]+?)(?: \ s*$) " , full_text , re . MULTILINE ) :
_add_name ( m . group ( 1 ) . strip ( ) , " trackare_prenom " , " high " )
_add_name ( m . group ( 1 ) . strip ( ) , " trackare_prenom " , " high " )
# Lieu de naissance: BAYONNE, biarritz, 64102, 99999 → masquer comme VILLE
# Lieu de naissance: BAYONNE, biarritz, 64102, 99999 → masquer comme VILLE
@@ -1690,11 +1711,11 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
if val :
if val :
hits . append ( PiiHit ( - 1 , " VILLE " , val , PLACEHOLDERS [ " VILLE " ] ) )
hits . append ( PiiHit ( - 1 , " VILLE " , val , PLACEHOLDERS [ " VILLE " ] ) )
# Ajouter au set names seulement si alphabétique (pas les codes INSEE numériques)
# Ajouter au set names seulement si alphabétique (pas les codes INSEE numériques)
if re . match ( r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç] " , val ) :
if re . match ( r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ ] " , val ) :
names . add ( val )
names . add ( val )
# Ville de résidence: TARNOS → masquer comme VILLE
# Ville de résidence: TARNOS → masquer comme VILLE
for m in re . finditer ( r " Ville \ s+de \ s+r[ée]sidence \ s*: \ s*([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû \ s \ - ' ]+?)(?: \ s*$) " , full_text , re . MULTILINE ) :
for m in re . finditer ( r " Ville \ s+de \ s+r[ée]sidence \ s*: \ s*([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ \ s \ - ' ]+?)(?: \ s*$) " , full_text , re . MULTILINE ) :
val = m . group ( 1 ) . strip ( )
val = m . group ( 1 ) . strip ( )
hits . append ( PiiHit ( - 1 , " VILLE " , val , PLACEHOLDERS [ " VILLE " ] ) )
hits . append ( PiiHit ( - 1 , " VILLE " , val , PLACEHOLDERS [ " VILLE " ] ) )
names . add ( val )
names . add ( val )
@@ -1736,9 +1757,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
# Inclut "Personne à prévenir" + relations + Ami/Voisin/Autre
# Inclut "Personne à prévenir" + relations + Ami/Voisin/Autre
for m in re . finditer (
for m in re . finditer (
r " (?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur|Ami|Amie|Voisin|Voisine|Autre) \ s+ "
r " (?:Conjoint|Concubin|Epoux|Epouse|Parent|Père|Mère|Fils|Fille|Frère|Soeur|Tuteur|Ami|Amie|Voisin|Voisine|Autre) \ s+ "
r " ([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä \ - ' ]+) "
r " ([A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ä \ - ' ]+) "
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä \ - ' ]+))? "
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ä \ - ' ]+))? "
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä \ - ' ]+))? " ,
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ä \ - ' ]+))? " ,
full_text ,
full_text ,
) :
) :
contact_parts = [ g . strip ( " .- ' (), " ) for g in ( m . group ( 1 ) , m . group ( 2 ) , m . group ( 3 ) ) if g ]
contact_parts = [ g . strip ( " .- ' (), " ) for g in ( m . group ( 1 ) , m . group ( 2 ) , m . group ( 3 ) ) if g ]
@@ -1763,9 +1784,9 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
for m in re . finditer (
for m in re . finditer (
r " [Pp]ersonne \ s+[àa] \ s+pr[ée]venir \ s*[: \ -]? \ s* \ n "
r " [Pp]ersonne \ s+[àa] \ s+pr[ée]venir \ s*[: \ -]? \ s* \ n "
r " (?:[^ \ n] { 0,30} \ n) { 0,2} " # 0-2 lignes intermédiaires (relation, etc.)
r " (?:[^ \ n] { 0,30} \ n) { 0,2} " # 0-2 lignes intermédiaires (relation, etc.)
r " \ s*([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä \ - ' ]+) "
r " \ s*([A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ä \ - ' ]+) "
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä \ - ' ]+))? "
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ä \ - ' ]+))? "
r " (?: \ s* \ n \ s*([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûä \ - ' ]+))? " ,
r " (?: \ s* \ n \ s*([A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ä \ - ' ]+))? " ,
full_text ,
full_text ,
) :
) :
for g in ( m . group ( 1 ) , m . group ( 2 ) , m . group ( 3 ) ) :
for g in ( m . group ( 1 ) , m . group ( 2 ) , m . group ( 3 ) ) :
@@ -1780,8 +1801,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
for m in re . finditer (
for m in re . finditer (
r " (?:Prescripteur|Prescrit \ s+par|Exécut[ée] \ s+par|Réalisé \ s+par) \ s*:? \ s* "
r " (?:Prescripteur|Prescrit \ s+par|Exécut[ée] \ s+par|Réalisé \ s+par) \ s*:? \ s* "
r " (?:(?:Dr|Pr) \ .? \ s+)? "
r " (?:(?:Dr|Pr) \ .? \ s+)? "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - ' ]+) "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - ' ]+) "
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ - ' ]+))? " ,
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ - ' ]+))? " ,
full_text ,
full_text ,
) :
) :
_add_name ( m . group ( 1 ) , " trackare_prescripteur " , " medium " )
_add_name ( m . group ( 1 ) , " trackare_prescripteur " , " medium " )
@@ -1789,12 +1810,12 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
_add_name ( m . group ( 2 ) , " trackare_prescripteur " , " medium " )
_add_name ( m . group ( 2 ) , " trackare_prescripteur " , " medium " )
# --- Médecins urgences (IAO, prise en charge, décision) (medium context) ---
# --- Médecins urgences (IAO, prise en charge, décision) (medium context) ---
for m in re . finditer ( r " IAO \ s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ -]+) " , full_text ) :
for m in re . finditer ( r " IAO \ s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ -]+) " , full_text ) :
_add_name ( m . group ( 1 ) , " trackare_iao " , " medium " )
_add_name ( m . group ( 1 ) , " trackare_iao " , " medium " )
for m in re . finditer (
for m in re . finditer (
r " Médecin \ s+de \ s+la \ s+(?:prise \ s+en \ s+charge|décision) \ s+médicale \ s+ "
r " Médecin \ s+de \ s+la \ s+(?:prise \ s+en \ s+charge|décision) \ s+médicale \ s+ "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ -]+) "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ -]+) "
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ -]+))? " ,
r " (?: \ s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ -]+))? " ,
full_text ,
full_text ,
) :
) :
_add_name ( m . group ( 1 ) , " trackare_medecin_urgences " , " medium " )
_add_name ( m . group ( 1 ) , " trackare_medecin_urgences " , " medium " )
@@ -1805,8 +1826,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
# Pattern: "Note IDE\nPrenom NOM" ou "Note d'évolution\nPrenom NOM"
# Pattern: "Note IDE\nPrenom NOM" ou "Note d'évolution\nPrenom NOM"
for m in re . finditer (
for m in re . finditer (
r " Note \ s+(?:IDE|AS|d ' [ée]volution|m[ée]dicale|kin[ée]) \ s* \ n \ s* "
r " Note \ s+(?:IDE|AS|d ' [ée]volution|m[ée]dicale|kin[ée]) \ s* \ n \ s* "
r " ([A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôû][a-zéèàùâêîôûäëïöüç]+) \ s+ "
r " ([A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ ][a-zéèàùâêîôûäëïöüçñ ]+) \ s+ "
r " ([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç \ -]+) " ,
r " ([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ äëïöüçñ \ -]+) " ,
full_text
full_text
) :
) :
prenom , nom = m . group ( 1 ) , m . group ( 2 )
prenom , nom = m . group ( 1 ) , m . group ( 2 )
@@ -1819,7 +1840,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
# --- Noms soignants multi-lignes : "Prénom\nNOM" dans les tableaux de prescriptions/soins (low context) ---
# --- Noms soignants multi-lignes : "Prénom\nNOM" dans les tableaux de prescriptions/soins (low context) ---
for m in re . finditer (
for m in re . finditer (
r ' \ b([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç] { 3,}) \ s* \ n \ s*([A-ZÉÈÀÙÂÊÎÔÛ] { 4,}) \ b ' ,
r ' \ b([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüçñ ] { 3,}) \ s* \ n \ s*([A-ZÉÈÀÙÂÊÎÔÛ] { 4,}) \ b ' ,
full_text
full_text
) :
) :
prenom , nom = m . group ( 1 ) , m . group ( 2 )
prenom , nom = m . group ( 1 ) , m . group ( 2 )
@@ -1833,8 +1854,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
for m in re . finditer (
for m in re . finditer (
r " Note[ \ t]+(?:IDE|AS|d ' [ée]volution|m[ée]dicale|kin[ée])[ \ t]+ "
r " Note[ \ t]+(?:IDE|AS|d ' [ée]volution|m[ée]dicale|kin[ée])[ \ t]+ "
r " (?:DR \ .?[ \ t]+)? "
r " (?:DR \ .?[ \ t]+)? "
r " ([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç \ -]+) "
r " ([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ äëïöüçñ \ -]+) "
r " (?:[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç \ -]+))? " ,
r " (?:[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ äëïöüçñ \ -]+))? " ,
full_text
full_text
) :
) :
for g in ( m . group ( 1 ) , m . group ( 2 ) ) :
for g in ( m . group ( 1 ) , m . group ( 2 ) ) :
@@ -1846,8 +1867,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
# --- "Signé" suivi directement d'un nom de soignant (ex: "Signé LARRIEU-") (low context) ---
# --- "Signé" suivi directement d'un nom de soignant (ex: "Signé LARRIEU-") (low context) ---
# IMPORTANT: [ \t]+ (pas \s+) pour éviter de capturer les médicaments sur la ligne suivante
# IMPORTANT: [ \t]+ (pas \s+) pour éviter de capturer les médicaments sur la ligne suivante
for m in re . finditer (
for m in re . finditer (
r " Signé[ \ t]+(?!—|par \ b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç \ -]+) "
r " Signé[ \ t]+(?!—|par \ b)([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ äëïöüçñ \ -]+) "
r " (?:[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç \ -]+))? " ,
r " (?:[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ äëïöüçñ \ -]+))? " ,
full_text
full_text
) :
) :
for g in ( m . group ( 1 ) , m . group ( 2 ) ) :
for g in ( m . group ( 1 ) , m . group ( 2 ) ) :
@@ -1859,8 +1880,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
# --- "Signé —" + médicament + nom soignant (ex: "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") (low context) ---
# --- "Signé —" + médicament + nom soignant (ex: "Signé — PARACETAMOL BBM 1000 MG INJ NARZABAL") (low context) ---
for m in re . finditer (
for m in re . finditer (
r " Signé[ \ t]+—[ \ t]+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)[ \ t]+[-]?[ \ t]* "
r " Signé[ \ t]+—[ \ t]+.*(?:INJ|COMP|GEL|PDR|SOL|PERF|SUSP|CAPS|CREM|SACHET|SIROP)[ \ t]+[-]?[ \ t]* "
r " ([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç \ -] { 3,}) "
r " ([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ äëïöüçñ \ -] { 3,}) "
r " (?:[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç] { 3,}))? " ,
r " (?:[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüçñ ] { 3,}))? " ,
full_text
full_text
) :
) :
for g in ( m . group ( 1 ) , m . group ( 2 ) ) :
for g in ( m . group ( 1 ) , m . group ( 2 ) ) :
@@ -1873,7 +1894,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
# --- Noms soignants après conditionnement médicament (ex: "Flacon(s) LACROUTS") (low context) ---
# --- Noms soignants après conditionnement médicament (ex: "Flacon(s) LACROUTS") (low context) ---
for m in re . finditer (
for m in re . finditer (
r " (?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?: \ (s \ ))?[ \ t]+ "
r " (?:Flacon|Ampoule|Seringue|Poche|Comprim[ée]|Gélule|Sachet)(?: \ (s \ ))?[ \ t]+ "
r " ([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç \ -] { 3,}) " ,
r " ([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ äëïöüçñ \ -] { 3,}) " ,
full_text
full_text
) :
) :
tok = m . group ( 1 ) . rstrip ( ' - ' )
tok = m . group ( 1 ) . rstrip ( ' - ' )
@@ -1883,8 +1904,8 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
# --- "DR." / "DR" suivi d'un prénom seul (ex: "DR. Ute", "DR. Tam") dans les prescriptions (medium context) ---
# --- "DR." / "DR" suivi d'un prénom seul (ex: "DR. Ute", "DR. Tam") dans les prescriptions (medium context) ---
for m in re . finditer (
for m in re . finditer (
r " DR \ .?[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç] { 3,}) "
r " DR \ .?[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüçñ ] { 3,}) "
r " (?:[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛa-zéèàùâêîôûäëïöüç \ -]+))? " ,
r " (?:[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛÑ a-zéèàùâêîôûñ äëïöüçñ \ -]+))? " ,
full_text
full_text
) :
) :
for g in ( m . group ( 1 ) , m . group ( 2 ) ) :
for g in ( m . group ( 1 ) , m . group ( 2 ) ) :
@@ -1899,7 +1920,7 @@ def _extract_trackare_identity(full_text: str) -> Tuple[set, List[PiiHit], set,
for m in re . finditer (
for m in re . finditer (
r " \ d { 1,2}[ \ t]*:[ \ t]* \ d {2} [ \ t]+ "
r " \ d { 1,2}[ \ t]*:[ \ t]* \ d {2} [ \ t]+ "
r " ([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ \ -] { 3,}) "
r " ([A-ZÉÈÀÙÂÊÎÔÛ][A-ZÉÈÀÙÂÊÎÔÛ \ -] { 3,}) "
r " (?:[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüç] { 3,}))? " ,
r " (?:[ \ t]+([A-ZÉÈÀÙÂÊÎÔÛ][a-zéèàùâêîôûäëïöüçñ ] { 3,}))? " ,
full_text
full_text
) :
) :
for g in ( m . group ( 1 ) , m . group ( 2 ) ) :
for g in ( m . group ( 1 ) , m . group ( 2 ) ) :
@@ -2107,7 +2128,7 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, s
# Dr X.NOM / Pr X.NOM : initiale collée au nom (ex: "Dr E.ELLIE", "Pr J.DUPONT")
# Dr X.NOM / Pr X.NOM : initiale collée au nom (ex: "Dr E.ELLIE", "Pr J.DUPONT")
_RE_DR_INITIAL_DOT_NAME = re . compile (
_RE_DR_INITIAL_DOT_NAME = re . compile (
r " \ b(?:Dr \ .?|Docteur|Pr \ .?|Professeur)[ \ t]+ "
r " \ b(?:Dr \ .?|Docteur|Pr \ .?|Professeur)[ \ t]+ "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]) \ .([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ -] { 2,}) "
r " ([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ]) \ .([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ -] { 2,}) "
)
)
for m in _RE_DR_INITIAL_DOT_NAME . finditer ( full_text ) :
for m in _RE_DR_INITIAL_DOT_NAME . finditer ( full_text ) :
names . add ( m . group ( 2 ) ) # Le nom (ELLIE)
names . add ( m . group ( 2 ) ) # Le nom (ELLIE)
@@ -2135,8 +2156,8 @@ def _extract_document_names(full_text: str, cfg: Dict[str, Any]) -> Tuple[set, s
# INSEE et un nom de famille INSEE. Les tokens proposés viennent
# INSEE et un nom de famille INSEE. Les tokens proposés viennent
# exclusivement des dictionnaires INSEE, sans blacklist codée en dur ici.
# exclusivement des dictionnaires INSEE, sans blacklist codée en dur ici.
_UPPER_NAME_LINE_RE = re . compile (
_UPPER_NAME_LINE_RE = re . compile (
r " ^[ \ t]*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ \ - ' ]+ "
r " ^[ \ t]*([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ \ - ' ]+ "
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ])[ \ t]*$ " ,
r " [A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ])[ \ t]*$ " ,
re . MULTILINE ,
re . MULTILINE ,
)
)
for m in _UPPER_NAME_LINE_RE . finditer ( full_text ) :
for m in _UPPER_NAME_LINE_RE . finditer ( full_text ) :
@@ -3198,7 +3219,7 @@ def _mask_finess_addresses(text: str, return_matched_names: bool = False):
suffix = text [ end : min ( len ( text ) , end + 60 ) ]
suffix = text [ end : min ( len ( text ) , end + 60 ) ]
# BP/CS + numéro + éventuel code postal + ville
# BP/CS + numéro + éventuel code postal + ville
bp_match = re . match (
bp_match = re . match (
r ' ( \ s*(?:BP|CS) \ s* \ d+ \ s*[,.]? \ s*(?: \ d {5} \ s*)?(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇa-zéèàùâêîôûäëïöüç \ s \ -]+(?:CEDEX)?)?) ' ,
r ' ( \ s*(?:BP|CS) \ s* \ d+ \ s*[,.]? \ s*(?: \ d {5} \ s*)?(?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ][A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ a-zéèàùâêîôûäëïöüçñ \ s \ -]+(?:CEDEX)?)?) ' ,
suffix , re . IGNORECASE )
suffix , re . IGNORECASE )
if bp_match :
if bp_match :
ext_end = end + len ( bp_match . group ( 1 ) . rstrip ( ) )
ext_end = end + len ( bp_match . group ( 1 ) . rstrip ( ) )
@@ -3533,14 +3554,15 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
)
)
protected = RE_EMAIL . sub ( PLACEHOLDERS [ " EMAIL " ] , protected )
protected = RE_EMAIL . sub ( PLACEHOLDERS [ " EMAIL " ] , protected )
protected = RE_URL . sub ( PLACEHOLDERS [ " MASK " ] , protected )
protected = RE_URL . sub ( PLACEHOLDERS [ " MASK " ] , protected )
# NIR d'abord (validation modulo 97), pour éviter qu'un NIR au format
# espacé soit consommé par RE_TEL.
def _rescan_nir ( m : re . Match ) - > str :
return PLACEHOLDERS [ " NIR " ] if validate_nir ( m . group ( 0 ) ) else m . group ( 0 )
protected = RE_NIR . sub ( _rescan_nir , protected )
protected = RE_TEL_SLASH . sub ( PLACEHOLDERS [ " TEL " ] , protected )
protected = RE_TEL_SLASH . sub ( PLACEHOLDERS [ " TEL " ] , protected )
protected = RE_TEL . sub ( PLACEHOLDERS [ " TEL " ] , protected )
protected = RE_TEL . sub ( PLACEHOLDERS [ " TEL " ] , protected )
protected = RE_TEL_COMPACT . sub ( PLACEHOLDERS [ " TEL " ] , protected )
protected = RE_TEL_COMPACT . sub ( PLACEHOLDERS [ " TEL " ] , protected )
protected = RE_IBAN . sub ( PLACEHOLDERS [ " IBAN " ] , protected )
protected = RE_IBAN . sub ( PLACEHOLDERS [ " IBAN " ] , protected )
# NIR avec validation
def _rescan_nir ( m : re . Match ) - > str :
return PLACEHOLDERS [ " NIR " ] if validate_nir ( m . group ( 0 ) ) else m . group ( 0 )
protected = RE_NIR . sub ( _rescan_nir , protected )
# Nouvelles regex : dates de naissance, dates, adresses, codes postaux
# Nouvelles regex : dates de naissance, dates, adresses, codes postaux
protected = RE_DATE_NAISSANCE . sub ( PLACEHOLDERS [ " DATE_NAISSANCE " ] , protected )
protected = RE_DATE_NAISSANCE . sub ( PLACEHOLDERS [ " DATE_NAISSANCE " ] , protected )
# protected = RE_DATE.sub(PLACEHOLDERS["DATE"], protected) # désactivé
# protected = RE_DATE.sub(PLACEHOLDERS["DATE"], protected) # désactivé
@@ -3575,7 +3597,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
# Adresses (gazetteer Aho-Corasick FINESS — 28K noms de voie)
# Adresses (gazetteer Aho-Corasick FINESS — 28K noms de voie)
protected = _mask_finess_addresses ( protected )
protected = _mask_finess_addresses ( protected )
# Texte espacé d'en-tête : "C E N T R E H O S P I T A L I E R" → [ETABLISSEMENT]
# Texte espacé d'en-tête : "C E N T R E H O S P I T A L I E R" → [ETABLISSEMENT]
_re_spaced = re . compile ( r ' (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] \ s) { 4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] ' )
_re_spaced = re . compile ( r ' (?:[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] \ s) { 4,}[A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] ' )
_spaced_kw = { " HOSPITALIER " , " HOSPITALIERE " , " HOSPITALIERES " , " HOSPITALIERS " ,
_spaced_kw = { " HOSPITALIER " , " HOSPITALIERE " , " HOSPITALIERES " , " HOSPITALIERS " ,
" CLINIQUE " , " HOPITAL " , " HÔPITAL " , " POLYCLINIQUE " ,
" CLINIQUE " , " HOPITAL " , " HÔPITAL " , " POLYCLINIQUE " ,
" CENTRE " , " ETABLISSEMENT " , " MAISON " , " RESIDENCE " ,
" CENTRE " , " ETABLISSEMENT " , " MAISON " , " RESIDENCE " ,
@@ -3620,7 +3642,7 @@ def selective_rescan(text: str, cfg: Dict[str, Any] | None = None) -> str:
lambda m : m . group ( 1 ) + PLACEHOLDERS [ " NOM " ] , protected
lambda m : m . group ( 1 ) + PLACEHOLDERS [ " NOM " ] , protected
)
)
# Initiales identifiantes devant [NOM] : "Dr T. [NOM]" → "Dr [NOM] [NOM]"
# Initiales identifiantes devant [NOM] : "Dr T. [NOM]" → "Dr [NOM] [NOM]"
_re_init_nom = re . compile ( r ' \ b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]) \ .[ \ s \ -]*( \ [NOM \ ]) ' )
_re_init_nom = re . compile ( r ' \ b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ]) \ .[ \ s \ -]*( \ [NOM \ ]) ' )
protected = _re_init_nom . sub ( r ' [NOM] \ 2 ' , protected )
protected = _re_init_nom . sub ( r ' [NOM] \ 2 ' , protected )
# Références initiales : "Ref : JF/VA" → "Ref : [NOM]/[NOM]"
# Références initiales : "Ref : JF/VA" → "Ref : [NOM]/[NOM]"
_re_ref_init = re . compile ( r ' (?:Ref \ s*: \ s*|Réf \ s*: \ s*)([A-Z] { 1,3}) \ s*/ \ s*([A-Z] { 1,3}) \ b ' )
_re_ref_init = re . compile ( r ' (?:Ref \ s*: \ s*|Réf \ s*: \ s*)([A-Z] { 1,3}) \ s*/ \ s*([A-Z] { 1,3}) \ b ' )
@@ -4214,7 +4236,7 @@ def process_pdf(
# 3c) Initiales identifiantes devant [NOM] : "Dr T. [NOM]" → "Dr [NOM] [NOM]"
# 3c) Initiales identifiantes devant [NOM] : "Dr T. [NOM]" → "Dr [NOM] [NOM]"
_RE_INITIAL_BEFORE_NOM = re . compile (
_RE_INITIAL_BEFORE_NOM = re . compile (
r ' \ b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ]) \ .[ \ s \ -]*( \ [NOM \ ]) '
r ' \ b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ]) \ .[ \ s \ -]*( \ [NOM \ ]) '
)
)
def _clean_initial_before_nom ( m ) :
def _clean_initial_before_nom ( m ) :
anon . audit . append ( PiiHit ( - 1 , " NOM_INITIAL " , m . group ( 1 ) + " . " , PLACEHOLDERS [ " NOM " ] ) )
anon . audit . append ( PiiHit ( - 1 , " NOM_INITIAL " , m . group ( 1 ) + " . " , PLACEHOLDERS [ " NOM " ] ) )
@@ -4269,14 +4291,14 @@ def process_pdf(
_companion_tokens : set = set ( )
_companion_tokens : set = set ( )
for token in _global_name_tokens :
for token in _global_name_tokens :
# Token connu suivi d'un mot ALL-CAPS
# Token connu suivi d'un mot ALL-CAPS
for m in re . finditer ( rf " \ b { re . escape ( token ) } \ s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] {{ 4, }} ) \ b " , raw_full ) :
for m in re . finditer ( rf " \ b { re . escape ( token ) } \ s+([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] {{ 4, }} ) \ b " , raw_full ) :
candidate = m . group ( 1 )
candidate = m . group ( 1 )
if ( candidate . lower ( ) not in _MEDICAL_STOP_WORDS_SET
if ( candidate . lower ( ) not in _MEDICAL_STOP_WORDS_SET
and candidate not in _global_name_tokens
and candidate not in _global_name_tokens
and candidate not in _COMPANION_BLACKLIST_SET ) :
and candidate not in _COMPANION_BLACKLIST_SET ) :
_companion_tokens . add ( candidate )
_companion_tokens . add ( candidate )
# Mot ALL-CAPS suivi du token connu
# Mot ALL-CAPS suivi du token connu
for m in re . finditer ( rf " \ b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇ] {{ 4, }} ) \ s+ { re . escape ( token ) } \ b " , raw_full ) :
for m in re . finditer ( rf " \ b([A-ZÉÈÀÙÂÊÎÔÛÄËÏÖÜÇÑ ] {{ 4, }} ) \ s+ { re . escape ( token ) } \ b " , raw_full ) :
candidate = m . group ( 1 )
candidate = m . group ( 1 )
if ( candidate . lower ( ) not in _MEDICAL_STOP_WORDS_SET
if ( candidate . lower ( ) not in _MEDICAL_STOP_WORDS_SET
and candidate not in _global_name_tokens
and candidate not in _global_name_tokens